From b12363ef20b953a7904c31b346da892b38b77bc5 Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Mon, 27 Apr 2026 16:33:58 +0000 Subject: [PATCH] [ROCm] Add HIP support for AMD Instinct GPUs (MI300X) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port all CUDA-specific code to work on both CUDA and ROCm via HIP: - Headers: #ifdef __HIP_PLATFORM_AMD__ guards for cuda.h → hip/hip_runtime.h, cub/cub.cuh → hipcub/hipcub.hpp - API: cudaMalloc/Free/Memcpy → hipMalloc/Free/Memcpy (guarded) - CUB → hipcub namespace mapping - ::cuda::std::tuple → ::rocprim::tuple (for DeviceRadixSort decomposer) - ::cuda::std::plus → hipcub::Sum() - Vec3f: add default __host__ __device__ constructor for hipcub compatibility - ATen/cuda → ATen/hip (in cubvh submodule) - setup.py: default arch gfx942, gate CUDA-only flags behind IS_HIP check Tested on AMD MI300X (gfx942) with ROCm 7.0.2 + PyTorch 2.9.1. All code remains cross-compilable for CUDA. Signed-off-by: Andy Luo --- setup.py | 6 +- src/atlas.cu | 234 +++--- src/atlas.hip | 1223 +++++++++++++++++++++++++++ src/clean_up.cu | 356 ++++---- src/clean_up.hip | 1237 ++++++++++++++++++++++++++++ src/connectivity.cu | 214 ++--- src/connectivity.hip | 1095 ++++++++++++++++++++++++ src/cumesh.h | 4 + src/cumesh.hip | 143 ++++ src/cumesh_hip.h | 509 ++++++++++++ src/dtypes.cuh | 16 +- src/dtypes_hip.cuh | 329 ++++++++ src/ext_hip.cpp | 69 ++ src/geometry.cu | 10 +- src/geometry.hip | 136 +++ src/hash/hash.cu | 4 + src/hash/hash.hip | 451 ++++++++++ src/io.cu | 10 +- src/io.hip | 251 ++++++ src/remesh/simple_dual_contour.cu | 6 +- src/remesh/simple_dual_contour.hip | 228 +++++ src/remesh/svox2vert.cu | 26 +- src/remesh/svox2vert.hip | 236 ++++++ src/shared.h | 64 +- src/shared.hip | 69 ++ src/shared_hip.h | 294 +++++++ src/simplify.cu | 40 +- src/simplify.hip | 588 +++++++++++++ src/utils.h | 26 +- src/utils_hip.h | 110 +++ 30 files changed, 7508 insertions(+), 476 deletions(-) create mode 100644 src/atlas.hip create mode 100644 src/clean_up.hip create mode 100644 src/connectivity.hip create mode 100644 src/cumesh.hip create mode 100644 src/cumesh_hip.h create mode 100644 src/dtypes_hip.cuh create mode 100644 src/ext_hip.cpp create mode 100644 src/geometry.hip create mode 100644 src/hash/hash.hip create mode 100644 src/io.hip create mode 100644 src/remesh/simple_dual_contour.hip create mode 100644 src/remesh/svox2vert.hip create mode 100644 src/shared.hip create mode 100644 src/shared_hip.h create mode 100644 src/simplify.hip create mode 100644 src/utils_hip.h diff --git a/setup.py b/setup.py index 8849b10..d9a2d60 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ # CUDA / ROCm specific # ------------------------------------------------- if IS_HIP: - archs = os.getenv("GPU_ARCHS", "native").split(";") + archs = os.getenv("GPU_ARCHS", "gfx942").split(";") nvcc_flags += [f"--offload-arch={arch}" for arch in archs] else: # CUDA only @@ -115,14 +115,14 @@ ], extra_compile_args={ "cxx": cxx_flags, - "nvcc": nvcc_flags + [ + "nvcc": nvcc_flags + ([] if IS_HIP else [ # The following definitions must be undefined # since we need half-precision operation. 
"--extended-lambda", "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - ], + ]), }, ), diff --git a/src/atlas.cu b/src/atlas.cu index 2d78d8c..87dbf0d 100644 --- a/src/atlas.cu +++ b/src/atlas.cu @@ -1,7 +1,11 @@ #include "cumesh.h" #include "dtypes.cuh" #include "shared.h" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace cumesh { @@ -286,8 +290,8 @@ static void get_chart_connectivity( mesh.atlas_chart_adj.resize(M); mesh.atlas_chart_adj_length.resize(M); float *cu_raw_lengths, *cu_sorted_lengths; - CUDA_CHECK(cudaMalloc(&cu_raw_lengths, M * sizeof(float))); - CUDA_CHECK(cudaMalloc(&cu_sorted_lengths, M * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_raw_lengths, M * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_sorted_lengths, M * sizeof(float))); init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( mesh.vertices.ptr, @@ -298,12 +302,12 @@ static void get_chart_connectivity( mesh.atlas_chart_adj.ptr, cu_raw_lengths ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 1.2 Sort size_t temp_storage_bytes = 0; mesh.temp_storage.resize(M * sizeof(uint64_t)); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, mesh.atlas_chart_adj.ptr, reinterpret_cast(mesh.temp_storage.ptr), @@ -312,7 +316,7 @@ static void get_chart_connectivity( M )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_adj.ptr, reinterpret_cast(mesh.temp_storage.ptr), @@ -320,20 +324,16 @@ static void get_chart_connectivity( cu_sorted_lengths, M )); - CUDA_CHECK(cudaFree(cu_raw_lengths)); + CUDA_CHECK(hipFree(cu_raw_lengths)); - #if CUDART_VERSION >= 12090 - auto reduce_op = ::cuda::std::plus(); - #else - auto reduce_op = cub::Sum(); - #endif + auto reduce_op = hipcub::Sum(); // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths) int* cu_num_chart_adjs; - CUDA_CHECK(cudaMalloc(&cu_num_chart_adjs, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_chart_adjs, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceReduce::ReduceByKey( + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( nullptr, temp_storage_bytes, reinterpret_cast(mesh.temp_storage.ptr), mesh.atlas_chart_adj.ptr, @@ -344,7 +344,7 @@ static void get_chart_connectivity( M )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceReduce::ReduceByKey( + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( mesh.cub_temp_storage.ptr, temp_storage_bytes, reinterpret_cast(mesh.temp_storage.ptr), mesh.atlas_chart_adj.ptr, @@ -354,15 +354,15 @@ static void get_chart_connectivity( reduce_op, M )); - CUDA_CHECK(cudaMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), hipMemcpyDeviceToHost)); mesh.atlas_chart_adj_length.size = mesh.atlas_chart_adj.size; - CUDA_CHECK(cudaFree(cu_sorted_lengths)); - CUDA_CHECK(cudaFree(cu_num_chart_adjs)); + CUDA_CHECK(hipFree(cu_sorted_lengths)); + CUDA_CHECK(hipFree(cu_num_chart_adjs)); // Remove invalid edge (UINT64_MAX) if present // Since we sorted, invalid edges are at the end. 
uint64_t last_key; if (mesh.atlas_chart_adj.size > 0) { - CUDA_CHECK(cudaMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), hipMemcpyDeviceToHost)); if (last_key == UINT64_MAX) { mesh.atlas_chart_adj.size -= 1; mesh.atlas_chart_adj_length.size -= 1; @@ -388,18 +388,18 @@ static void get_chart_connectivity( mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart_perims.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 2.2 Prepare CSR format for chart-edge connectivity mesh.atlas_chart2edge_offset.resize(C + 1); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, C + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, @@ -415,7 +415,7 @@ static void get_chart_connectivity( mesh.atlas_chart2edge_offset.ptr, mesh.atlas_chart2edge_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -489,106 +489,106 @@ void compute_chart_normal_cones( int* sorted_chart_ids; int* faces_ids; int* argsorted_faces_ids; - CUDA_CHECK(cudaMalloc(&sorted_chart_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&faces_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&argsorted_faces_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&faces_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&argsorted_faces_ids, F * sizeof(int))); arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( faces_ids, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, F )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, F )); - CUDA_CHECK(cudaFree(faces_ids)); + CUDA_CHECK(hipFree(faces_ids)); // 2. 
Get CSR format for chart-face assignment int* cu_chart_size; int* cu_num_charts; int* cu_unique_chart_ids; - CUDA_CHECK(cudaMalloc(&cu_chart_size, (C + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_charts, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_unique_chart_ids, (C + 1) * sizeof(int))); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipMalloc(&cu_chart_size, (C + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_charts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, (C + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, F )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, F )); - CUDA_CHECK(cudaFree(cu_num_charts)); - CUDA_CHECK(cudaFree(cu_unique_chart_ids)); + CUDA_CHECK(hipFree(cu_num_charts)); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); int* cu_chart_offsets; - CUDA_CHECK(cudaMalloc(&cu_chart_offsets, (C + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_chart_offsets, (C + 1) * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, C + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, C + 1 )); - CUDA_CHECK(cudaFree(cu_chart_size)); + CUDA_CHECK(hipFree(cu_chart_size)); // 3. 
Compute chart normals and areas float* cu_sorted_face_areas; - CUDA_CHECK(cudaMalloc(&cu_sorted_face_areas, F * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( mesh.face_areas.ptr, argsorted_faces_ids, F, cu_sorted_face_areas ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); mesh.atlas_chart_areas.resize(C); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, cu_chart_offsets, cu_chart_offsets + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, cu_chart_offsets, cu_chart_offsets + 1 )); - CUDA_CHECK(cudaFree(cu_sorted_face_areas)); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); float3* cu_sorted_face_normals; - CUDA_CHECK(cudaMalloc(&cu_sorted_face_normals, F * sizeof(float3))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_normals, F * sizeof(float3))); index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( mesh.face_normals.ptr, argsorted_faces_ids, F, cu_sorted_face_normals ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(argsorted_faces_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(argsorted_faces_ids)); float3* cu_chart_normals; - CUDA_CHECK(cudaMalloc(&cu_chart_normals, C * sizeof(float3))); - CUDA_CHECK(cub::DeviceSegmentedReduce::Reduce( + CUDA_CHECK(hipMalloc(&cu_chart_normals, C * sizeof(float3))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( nullptr, temp_storage_bytes, cu_sorted_face_normals, cu_chart_normals, C, @@ -597,7 +597,7 @@ void compute_chart_normal_cones( make_float3(0.0f, 0.0f, 0.0f) )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Reduce( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_normals, cu_chart_normals, C, @@ -609,11 +609,11 @@ void compute_chart_normal_cones( cu_chart_normals, C ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 4. Compute normal difference float* cu_normal_diff; - CUDA_CHECK(cudaMalloc(&cu_normal_diff, F * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_normal_diff, F * sizeof(float))); normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_chart_normals, cu_sorted_face_normals, @@ -621,29 +621,29 @@ void compute_chart_normal_cones( F, cu_normal_diff ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_face_normals)); - CUDA_CHECK(cudaFree(sorted_chart_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_face_normals)); + CUDA_CHECK(hipFree(sorted_chart_ids)); // 5. 
Compute new cone half angles float* cu_new_cone_half_angles; - CUDA_CHECK(cudaMalloc(&cu_new_cone_half_angles, C * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_new_cone_half_angles, C * sizeof(float))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSegmentedReduce::Max( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( nullptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, cu_chart_offsets, cu_chart_offsets + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Max( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, cu_chart_offsets, cu_chart_offsets + 1 )); - CUDA_CHECK(cudaFree(cu_chart_offsets)); - CUDA_CHECK(cudaFree(cu_normal_diff)); + CUDA_CHECK(hipFree(cu_chart_offsets)); + CUDA_CHECK(hipFree(cu_normal_diff)); // 6. Update chart normal cones mesh.atlas_chart_normal_cones.resize(C); @@ -653,9 +653,9 @@ void compute_chart_normal_cones( cu_new_cone_half_angles, C ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_chart_normals)); - CUDA_CHECK(cudaFree(cu_new_cone_half_angles)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_chart_normals)); + CUDA_CHECK(hipFree(cu_new_cone_half_angles)); } @@ -832,13 +832,13 @@ static void reassign_chart_ids( reinterpret_cast<int*>(mesh.temp_storage.ptr), F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -848,17 +848,17 @@ static void reassign_chart_ids( reinterpret_cast<int*>(mesh.temp_storage.ptr), cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( reinterpret_cast<int*>(mesh.temp_storage.ptr), F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); swap_buffers(mesh.atlas_chart_ids, mesh.temp_storage); mesh.atlas_num_charts = compress_ids(mesh.atlas_chart_ids.ptr, F, mesh.cub_temp_storage); @@ -940,67 +940,67 @@ void construct_chart_mesh( int* cu_sorted_chart_ids; int* cu_face_idx; int* cu_sorted_face_idx; - CUDA_CHECK(cudaMalloc(&cu_sorted_chart_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_face_idx, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_sorted_face_idx, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_face_idx, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_idx, F * sizeof(int))); arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_face_idx, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, F ));
mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, F )); - CUDA_CHECK(cudaFree(cu_face_idx)); + CUDA_CHECK(hipFree(cu_face_idx)); // 2. RLE for chart size int* cu_chart_size; int* cu_num_chart; int* cu_unique_chart_ids; - CUDA_CHECK(cudaMalloc(&cu_chart_size, (mesh.atlas_num_charts + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_chart, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_unique_chart_ids, mesh.atlas_num_charts * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_chart_size, (mesh.atlas_num_charts + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_chart, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, mesh.atlas_num_charts * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, F )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, F )); - CUDA_CHECK(cudaFree(cu_unique_chart_ids)); - CUDA_CHECK(cudaFree(cu_num_chart)); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); + CUDA_CHECK(hipFree(cu_num_chart)); // 3. Exclusive scan for chart face offset temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, mesh.atlas_num_charts + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, mesh.atlas_num_charts + 1 )); - CUDA_CHECK(cudaFree(cu_chart_size)); + CUDA_CHECK(hipFree(cu_chart_size)); // 4. Expand chart ids and vertex ids uint64_t* cu_pack; - CUDA_CHECK(cudaMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); + CUDA_CHECK(hipMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_chart_ids, cu_sorted_face_idx, @@ -1008,12 +1008,12 @@ void construct_chart_mesh( F, cu_pack ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_chart_ids)); - CUDA_CHECK(cudaFree(cu_sorted_face_idx)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_chart_ids)); + CUDA_CHECK(hipFree(cu_sorted_face_idx)); // 5. 
Compress pair to construct all maps uint64_t* cu_inverse_pack; - CUDA_CHECK(cudaMalloc(&cu_inverse_pack, 3 * F * sizeof(uint64_t))); + CUDA_CHECK(hipMalloc(&cu_inverse_pack, 3 * F * sizeof(uint64_t))); int new_num_vertices = compress_ids( cu_pack, 3 * F, @@ -1028,15 +1028,15 @@ void construct_chart_mesh( mesh.atlas_chart_vertex_map.ptr, mesh.atlas_chart_vertex_offset.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_inverse_pack)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_inverse_pack)); unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_pack, F, mesh.atlas_chart_faces.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_pack)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_pack)); } @@ -1066,15 +1066,15 @@ void CuMesh::compute_charts( this->atlas_chart_ids.ptr, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Main Iteration: Collapse and Refine int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); for (int i = 0; i < global_iterations; i++) { while (true) { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // 1. Compute chart connectivity get_chart_connectivity(*this); @@ -1097,7 +1097,7 @@ void CuMesh::compute_charts( E, this->edge_collapse_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); // 4. Propagate costs size_t C = this->atlas_num_charts; @@ -1109,7 +1109,7 @@ void CuMesh::compute_charts( C, this->propagated_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); // 5. Collapse edges this->vertices_map.resize(C); // store collapse map @@ -1127,10 +1127,10 @@ void CuMesh::compute_charts( this->atlas_chart_normal_cones.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); // End of iteration - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); if (h_end_flag == 1) break; // 6. 
Compress chart ids @@ -1142,7 +1142,7 @@ void CuMesh::compute_charts( F, reinterpret_cast<int*>(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); swap_buffers(this->atlas_chart_ids, this->temp_storage); } @@ -1163,7 +1163,7 @@ void CuMesh::compute_charts( this->atlas_chart_ids.ptr, reinterpret_cast<int*>(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(this->atlas_chart_ids, this->temp_storage); this->atlas_num_charts = compress_ids(this->atlas_chart_ids.ptr, F, this->cub_temp_storage); } @@ -1171,7 +1171,7 @@ void CuMesh::compute_charts( // After refinement, the chart may become disconnected, so we need to re-assign chart ids reassign_chart_ids(*this); } - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // Finalizing: calculate vmap, chart face and chart face offset construct_chart_mesh(*this); @@ -1180,39 +1180,39 @@ std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> CuMesh::read_atlas_charts() { auto chart_ids = torch::empty({ static_cast<int64_t>(this->faces.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_ids.data_ptr<int>(), this->atlas_chart_ids.ptr, this->faces.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto vertex_map = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_map.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( vertex_map.data_ptr<int>(), this->atlas_chart_vertex_map.ptr, this->atlas_chart_vertex_map.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto chart_faces = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces.size), 3 }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_faces.data_ptr<int>(), this->atlas_chart_faces.ptr, this->atlas_chart_faces.size * 3 * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto chart_vertex_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_vertex_offset.data_ptr<int>(), this->atlas_chart_vertex_offset.ptr, this->atlas_chart_vertex_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto chart_face_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_face_offset.data_ptr<int>(), this->atlas_chart_faces_offset.ptr, this->atlas_chart_faces_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); return std::make_tuple(this->atlas_num_charts, chart_ids, vertex_map, chart_faces, chart_vertex_offset, chart_face_offset); } diff --git a/src/atlas.hip b/src/atlas.hip new file mode 100644 index 0000000..5aa28d5 --- /dev/null +++ b/src/atlas.hip @@ -0,0 +1,1223 @@ +// !!! This is a file automatically generated by hipify!!! +#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#include "shared_hip.h" +#ifdef __HIP_PLATFORM_AMD__ +#include <hipcub/hipcub.hpp> +#else +#include <cub/cub.cuh> +#endif + + +namespace cumesh { + + +/* +Fast mesh parameterization / UV unwrapping using GPU + +Three main steps: +1.
Split the mesh into charts + - Treat each chart as a node in a graph + - Use a parallel edge collapse algorithm to merge charts based on normal cone deviation +2. Parameterize each chart using Least Squares Conformal Maps (LSCM) +3. Pack the charts into a texture atlas +*/ + + +__device__ inline uint64_t pack_key_value_positive(int key, float value) { + unsigned int v = __float_as_uint(value); + return (static_cast<uint64_t>(v) << 32) | + static_cast<uint64_t>(key); +} + + +__device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, float& value) { + key = static_cast<int>(key_value & 0xffffffffu); + value = __uint_as_float(static_cast<unsigned int>(key_value >> 32)); +} + + +// static __global__ void init_normal_cones_kernel( +// const float3* face_normals, +// const int F, +// float4* chart_normal_cones +// ) { +// const int tid = blockIdx.x * blockDim.x + threadIdx.x; +// if (tid >= F) return; +// +// float3 n = face_normals[tid]; +// chart_normal_cones[tid] = make_float4(n.x, n.y, n.z, 0.0f); // half angle = 0 +// } + + +static __global__ void init_chart_adj_kernel( + const float3* vertices, + const int3* faces, + const int2* face_adj, + const int* chart_ids, + const size_t M, + uint64_t* chart_adj, + float* length +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + int f0 = face_adj[tid].x; + int f1 = face_adj[tid].y; + + int c0 = chart_ids[f0]; + int c1 = chart_ids[f1]; + + if (c0 == c1) { + chart_adj[tid] = UINT64_MAX; + length[tid] = 0.0f; + return; + } + + int min_c = min(c0, c1); + int max_c = max(c0, c1); + chart_adj[tid] = (static_cast<uint64_t>(min_c) << 32) | static_cast<uint64_t>(max_c); + + int3 tri0 = faces[f0]; + int3 tri1 = faces[f1]; + + int t0_indices[3] = {tri0.x, tri0.y, tri0.z}; + int common_v_indices[2]; + int found_count = 0; + + #pragma unroll + for (int i = 0; i < 3; ++i) { + int v = t0_indices[i]; + if (v == tri1.x || v == tri1.y || v == tri1.z) { + if (found_count < 2) { + common_v_indices[found_count] = v; + } + found_count++; + } + } + + if (found_count >= 2) { + float3 p0 = vertices[common_v_indices[0]]; + float3 p1 = vertices[common_v_indices[1]]; + + float dx = p0.x - p1.x; + float dy = p0.y - p1.y; + float dz = p0.z - p1.z; + + length[tid] = sqrtf(dx * dx + dy * dy + dz * dz); + } else { + length[tid] = 0.0f; + } +} + + +static __global__ void get_chart_edge_cnt_kernel( + const uint64_t* chart_adj, + const float* chart_adj_length, + const int E, + int* chart2edge_cnt, + float* chart_perim +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t c = chart_adj[tid]; + float l = chart_adj_length[tid]; + int c0 = int(c >> 32); + int c1 = int(c & 0xFFFFFFFF); + + // count vertex adjacent edge number + atomicAdd(&chart2edge_cnt[c0], 1); + atomicAdd(&chart2edge_cnt[c1], 1); + atomicAdd(&chart_perim[c0], l); + atomicAdd(&chart_perim[c1], l); +} + + +static __global__ void get_chart_edge_adjacency_kernel( + const uint64_t* chart_adj, + const int E, + int* chart2edge, + int* chart2edge_offset, + int* chart2edge_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t c = chart_adj[tid]; + int c0 = int(c >> 32); + int c1 = int(c & 0xFFFFFFFF); + + // assign connectivity + chart2edge[chart2edge_offset[c0] + atomicAdd(&chart2edge_cnt[c0], 1)] = tid; + chart2edge[chart2edge_offset[c1] + atomicAdd(&chart2edge_cnt[c1], 1)] = tid; +} + + +static __global__ void compute_chart_adjacency_cost_kernel( + const uint64_t* chart_adj, + const float4*
chart_normal_cones, + const float* chart_adj_length, + const float* chart_perims, + const float* chart_areas, + float area_penalty_weight, + float perimeter_area_ratio_weight, + const int E, + float* chart_adj_costs +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + uint64_t adj = chart_adj[tid]; + int c0 = static_cast<int>(adj >> 32); + int c1 = static_cast<int>(adj & 0xFFFFFFFF); + + float4 cone0 = chart_normal_cones[c0]; + float4 cone1 = chart_normal_cones[c1]; + Vec3f axis0(cone0.x, cone0.y, cone0.z); + Vec3f axis1(cone1.x, cone1.y, cone1.z); + float half_angle0 = cone0.w; + float half_angle1 = cone1.w; + float cos_angle = axis0.dot(axis1); + float axis_angle = acosf(fmaxf(fminf(cos_angle, 1.0f), -1.0f)); + float new_cone_low = fminf(-half_angle0, axis_angle - half_angle1); + float new_cone_high = fmaxf(half_angle0, axis_angle + half_angle1); + float new_half_angle = (new_cone_high - new_cone_low) * 0.5f; + float cost = new_half_angle; + + // Chart area penalty + float new_area = (chart_areas[c0] + chart_areas[c1]); + cost += area_penalty_weight * new_area; + + // Perimeter-area ratio penalty + float new_perim = chart_perims[c0] + chart_perims[c1] - 2 * chart_adj_length[tid]; + cost += perimeter_area_ratio_weight * (new_perim * new_perim / new_area); + + chart_adj_costs[tid] = cost; +} + + +static __global__ void propagate_cost_kernel( + const int* chart2edge, + const int* chart2edge_offset, + const float* edge_collapse_costs, + const int num_charts, + uint64_t* propagated_costs +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_charts) return; + + // get edge with minimum cost + int min_eid = -1; + float min_cost = FLT_MAX; + for (int e = chart2edge_offset[tid]; e < chart2edge_offset[tid+1]; e++) { + int eid = chart2edge[e]; + float cost = edge_collapse_costs[eid]; + if (cost < min_cost || (cost == min_cost && eid < min_eid)) { + min_eid = eid; + min_cost = cost; + } + } + + uint64_t cost = pack_key_value_positive(min_eid, min_cost); + propagated_costs[tid] = cost; +} + + +static __global__ void collapse_edges_kernel( + uint64_t* chart_adj, + const float* edge_collapse_costs, + const uint64_t* propagated_costs, + const float collapse_thresh, + const int E, + int* chart_map, + float4* chart_normal_cones, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + float cost = edge_collapse_costs[tid]; + if (cost > collapse_thresh) return; + + // get edge + uint64_t c = chart_adj[tid]; + int c0 = int(c >> 32); + int c1 = int(c & 0xFFFFFFFF); + + // check if this edge has the minimum cost among neighboring charts + uint64_t pack = pack_key_value_positive(tid, cost); + if (propagated_costs[c0] < pack || propagated_costs[c1] < pack) return; + + // collapse edge + chart_map[c1] = c0; + + // update cone + float4 cone0 = chart_normal_cones[c0]; + float4 cone1 = chart_normal_cones[c1]; + Vec3f axis0(cone0.x, cone0.y, cone0.z); + Vec3f axis1(cone1.x, cone1.y, cone1.z); + float half_angle0 = cone0.w; + float half_angle1 = cone1.w; + float cos_angle = axis0.dot(axis1); + float axis_angle = acosf(fmaxf(fminf(cos_angle, 1.0f), -1.0f)); + float new_cone_low = fminf(-half_angle0, axis_angle - half_angle1); + float new_cone_high = fmaxf(half_angle0, axis_angle + half_angle1); + float new_half_angle = (new_cone_high - new_cone_low) * 0.5f; + Vec3f new_axis; + if (axis_angle < 1e-3f) { + new_axis = axis0; + } else { + float new_axis_angle = (new_cone_high + new_cone_low) * 0.5f; + new_axis = axis0 *
cosf(new_axis_angle) + (axis1 - axis0 * cos_angle).normalized() * sinf(new_axis_angle); + new_axis.normalize(); + } + chart_normal_cones[c0] = make_float4(new_axis.x, new_axis.y, new_axis.z, new_half_angle); + + // not end of iteration + *end_flag = 0; +} + + +static void get_chart_connectivity( + CuMesh& mesh +) { + size_t M = mesh.manifold_face_adj.size; + + // 1. Get chart adjacency + // 1.1 Initialize chart adjacency and edge lengths + mesh.atlas_chart_adj.resize(M); + mesh.atlas_chart_adj_length.resize(M); + float *cu_raw_lengths, *cu_sorted_lengths; + CUDA_CHECK(hipMalloc(&cu_raw_lengths, M * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_sorted_lengths, M * sizeof(float))); + + hipLaunchKernelGGL(( init_chart_adj_kernel), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.vertices.ptr, + mesh.faces.ptr, + mesh.manifold_face_adj.ptr, + mesh.atlas_chart_ids.ptr, + M, + mesh.atlas_chart_adj.ptr, + cu_raw_lengths + ); + CUDA_CHECK(hipGetLastError()); + + // 1.2 Sort + size_t temp_storage_bytes = 0; + mesh.temp_storage.resize(M * sizeof(uint64_t)); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + mesh.atlas_chart_adj.ptr, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + cu_raw_lengths, + cu_sorted_lengths, + M + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart_adj.ptr, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + cu_raw_lengths, + cu_sorted_lengths, + M + )); + CUDA_CHECK(hipFree(cu_raw_lengths)); + + auto reduce_op = hipcub::Sum(); + + + // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths) + int* cu_num_chart_adjs; + CUDA_CHECK(hipMalloc(&cu_num_chart_adjs, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( + nullptr, temp_storage_bytes, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + mesh.atlas_chart_adj.ptr, + cu_sorted_lengths, + mesh.atlas_chart_adj_length.ptr, + cu_num_chart_adjs, + reduce_op, + M + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + mesh.atlas_chart_adj.ptr, + cu_sorted_lengths, + mesh.atlas_chart_adj_length.ptr, + cu_num_chart_adjs, + reduce_op, + M + )); + CUDA_CHECK(hipMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), hipMemcpyDeviceToHost)); + mesh.atlas_chart_adj_length.size = mesh.atlas_chart_adj.size; + CUDA_CHECK(hipFree(cu_sorted_lengths)); + CUDA_CHECK(hipFree(cu_num_chart_adjs)); + // Remove invalid edge (UINT64_MAX) if present + // Since we sorted, invalid edges are at the end. + uint64_t last_key; + if (mesh.atlas_chart_adj.size > 0) { + CUDA_CHECK(hipMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), hipMemcpyDeviceToHost)); + if (last_key == UINT64_MAX) { + mesh.atlas_chart_adj.size -= 1; + mesh.atlas_chart_adj_length.size -= 1; + } + } + // Early stop if no valid edges + if (mesh.atlas_chart_adj.size == 0) { + return; + } + + // 2.
Get chart-edge connectivity + size_t E = mesh.atlas_chart_adj.size; + size_t C = mesh.atlas_num_charts; + // 2.1 Count edge number for each chart, along with perim + mesh.atlas_chart2edge_cnt.resize(C); + mesh.atlas_chart2edge_cnt.zero(); + mesh.atlas_chart_perims.resize(C); + mesh.atlas_chart_perims.zero(); + hipLaunchKernelGGL(( get_chart_edge_cnt_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.atlas_chart_adj.ptr, + mesh.atlas_chart_adj_length.ptr, + E, + mesh.atlas_chart2edge_cnt.ptr, + mesh.atlas_chart_perims.ptr + ); + CUDA_CHECK(hipGetLastError()); + // 2.2 Prepare CSR format for chart-edge connectivity + mesh.atlas_chart2edge_offset.resize(C + 1); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + mesh.atlas_chart2edge_cnt.ptr, + mesh.atlas_chart2edge_offset.ptr, + C + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart2edge_cnt.ptr, + mesh.atlas_chart2edge_offset.ptr, + C + 1 + )); + // 2.3 Fill CSR format for chart-edge connectivity + mesh.atlas_chart2edge.resize(2 * E); // each edge connects two charts + mesh.atlas_chart2edge_cnt.zero(); + hipLaunchKernelGGL(( get_chart_edge_adjacency_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.atlas_chart_adj.ptr, + E, + mesh.atlas_chart2edge.ptr, + mesh.atlas_chart2edge_offset.ptr, + mesh.atlas_chart2edge_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +struct Float3Add +{ + __host__ __device__ + float3 operator()(const float3 &a, const float3 &b) const + { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } +}; + + +static __global__ void normalize_kernel( + float3* chart_normals, + const int num_charts +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_charts) return; + + float3 n = chart_normals[tid]; + float norm = sqrtf(n.x * n.x + n.y * n.y + n.z * n.z); + if (norm > 0.0f) { + n.x /= norm; + n.y /= norm; + n.z /= norm; + } + chart_normals[tid] = n; +} + + +static __global__ void normal_diff_kernel( + const float3* chart_normals, + const float3* sorted_face_normals, + const int* sorted_chart_ids, + const size_t F, + float* normal_diff +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int c = sorted_chart_ids[tid]; + Vec3f n(chart_normals[c]); + Vec3f fn(sorted_face_normals[tid]); + normal_diff[tid] = acosf(fmaxf(fminf(n.dot(fn), 1.0f), -1.0f)); +} + + +static __global__ void update_normal_cones_kernel( + float4* chart_normal_cones, + const float3* chart_normals, + const float* new_cone_half_angles, + const int num_charts +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_charts) return; + + float3 n = chart_normals[tid]; + float half_angle = new_cone_half_angles[tid]; + chart_normal_cones[tid] = make_float4(n.x, n.y, n.z, half_angle); +} + + +void compute_chart_normal_cones( + CuMesh& mesh +) { + size_t C = mesh.atlas_num_charts; + size_t F = mesh.faces.size; + + // 1. 
Sort faces by chart id + int* sorted_chart_ids; + int* faces_ids; + int* argsorted_faces_ids; + CUDA_CHECK(hipMalloc(&sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&faces_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&argsorted_faces_ids, F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + faces_ids, + F + ); + CUDA_CHECK(hipGetLastError()); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, sorted_chart_ids, + faces_ids, argsorted_faces_ids, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, sorted_chart_ids, + faces_ids, argsorted_faces_ids, + F + )); + CUDA_CHECK(hipFree(faces_ids)); + + // 2. Get CSR format for chart-face assignment + int* cu_chart_size; + int* cu_num_charts; + int* cu_unique_chart_ids; + CUDA_CHECK(hipMalloc(&cu_chart_size, (C + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_charts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, (C + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, + F + )); + CUDA_CHECK(hipFree(cu_num_charts)); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); + + int* cu_chart_offsets; + CUDA_CHECK(hipMalloc(&cu_chart_offsets, (C + 1) * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_chart_size, cu_chart_offsets, + C + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_chart_size, cu_chart_offsets, + C + 1 + )); + CUDA_CHECK(hipFree(cu_chart_size)); + + // 3. 
Compute chart normals and areas + float* cu_sorted_face_areas; + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.face_areas.ptr, + argsorted_faces_ids, + F, + cu_sorted_face_areas + ); + CUDA_CHECK(hipGetLastError()); + mesh.atlas_chart_areas.resize(C); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); + + float3* cu_sorted_face_normals; + CUDA_CHECK(hipMalloc(&cu_sorted_face_normals, F * sizeof(float3))); + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.face_normals.ptr, + argsorted_faces_ids, + F, + cu_sorted_face_normals + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(argsorted_faces_ids)); + float3* cu_chart_normals; + CUDA_CHECK(hipMalloc(&cu_chart_normals, C * sizeof(float3))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( + nullptr, temp_storage_bytes, + cu_sorted_face_normals, cu_chart_normals, + C, + cu_chart_offsets, cu_chart_offsets + 1, + Float3Add(), + make_float3(0.0f, 0.0f, 0.0f) + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_face_normals, cu_chart_normals, + C, + cu_chart_offsets, cu_chart_offsets + 1, + Float3Add(), + make_float3(0.0f, 0.0f, 0.0f) + )); + hipLaunchKernelGGL(( normalize_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_chart_normals, + C + ); + CUDA_CHECK(hipGetLastError()); + + // 4. Compute normal difference + float* cu_normal_diff; + CUDA_CHECK(hipMalloc(&cu_normal_diff, F * sizeof(float))); + hipLaunchKernelGGL(( normal_diff_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_chart_normals, + cu_sorted_face_normals, + sorted_chart_ids, + F, + cu_normal_diff + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_face_normals)); + CUDA_CHECK(hipFree(sorted_chart_ids)); + + // 5. Compute new cone half angles + float* cu_new_cone_half_angles; + CUDA_CHECK(hipMalloc(&cu_new_cone_half_angles, C * sizeof(float))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( + nullptr, temp_storage_bytes, + cu_normal_diff, cu_new_cone_half_angles, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_normal_diff, cu_new_cone_half_angles, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + CUDA_CHECK(hipFree(cu_chart_offsets)); + CUDA_CHECK(hipFree(cu_normal_diff)); + + // 6. 
Update chart normal cones + mesh.atlas_chart_normal_cones.resize(C); + hipLaunchKernelGGL(( update_normal_cones_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.atlas_chart_normal_cones.ptr, + cu_chart_normals, + cu_new_cone_half_angles, + C + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_chart_normals)); + CUDA_CHECK(hipFree(cu_new_cone_half_angles)); +} + + +static __global__ void refine_charts_kernel( + const float4* chart_normal_cones, + const float3* face_normals, + const float3* vertices, + const uint64_t* edges, + const int3* face2edge, + const int* edge2face, + const int* edge2face_offset, + const size_t F, + const float lambda_smooth, + const int* chart_ids, // Read-only (Input) + int* pong_chart_ids // Write-only (Output) +) { + const int fid = blockIdx.x * blockDim.x + threadIdx.x; + if (fid >= F) return; + + // 1. Load current face data + int current_c = chart_ids[fid]; + Vec3f n(face_normals[fid]); + + // local register cache for candidate list (triangle has at most 3 neighbors, plus self, max 4 candidates) + int candidates[4]; + float smooth_scores[4]; + int num_candidates = 0; + + // init: add self to candidate list + candidates[0] = current_c; + smooth_scores[0] = 0.0f; + num_candidates = 1; + + // 2. Iterate over 3 edges to aggregate smooth scores + int eids[3] = { face2edge[fid].x, face2edge[fid].y, face2edge[fid].z }; + + #pragma unroll + for (int i = 0; i < 3; i++) { + int eid = eids[i]; + + // calculate edge length (as smooth weight) + // logic: if I add the neighbor's Chart, I can eliminate this edge as a boundary cost + int v0_idx = int(edges[eid] >> 32); + int v1_idx = int(edges[eid] & 0xFFFFFFFF); + Vec3f v0 = Vec3f(vertices[v0_idx]); + Vec3f v1 = Vec3f(vertices[v1_idx]); + float edge_len = (v1 - v0).norm(); + + int start = edge2face_offset[eid]; + int end = edge2face_offset[eid + 1]; + + // Process edge neighbors + for (int j = start; j < end; j++) { + int neighbor_fid = edge2face[j]; + if (neighbor_fid == fid) continue; + + int neighbor_c = chart_ids[neighbor_fid]; // Read from Input buffer + + int idx = -1; + for (int k = 0; k < num_candidates; ++k) { + if (candidates[k] == neighbor_c) { + idx = k; + break; + } + } + + if (idx == -1 && num_candidates < 4) { + idx = num_candidates++; + candidates[idx] = neighbor_c; + smooth_scores[idx] = 0.0f; + } + + if (idx != -1) { + smooth_scores[idx] += edge_len; + } + } + } + + // 3. Evaluate candidates and pick best + int best_c = current_c; + float best_total_score = -1e9f; + + for (int i = 0; i < num_candidates; ++i) { + int c = candidates[i]; + + // A. Geom score + float4 cone = chart_normal_cones[c]; + Vec3f axis(cone.x, cone.y, cone.z); + float geo_sim = axis.dot(n); // [-1, 1] + + // if invalid cone, skip + if (geo_sim <= 0.0f) continue; + + // B. Smooth score + float smooth_sim = smooth_scores[i] * lambda_smooth; + + float total_score = geo_sim + smooth_sim; + + if (c == current_c) { + if (best_total_score == -1e9f) { + best_total_score = total_score; + best_c = c; + } + } + + // C. 
Compare with best + float diff = total_score - best_total_score; + const float epsilon = 1e-5f; // dampening factor + + if (diff > epsilon) { + // new best is significantly better than current best + best_total_score = total_score; + best_c = c; + } + else if (abs(diff) <= epsilon) { + // scores are very close, break tie by choosing smaller ID + if (c < best_c) { + best_total_score = total_score; + best_c = c; + } + } + } + + // Write back to Output buffer + pong_chart_ids[fid] = best_c; +} + + +__global__ void hook_edges_if_same_chart_kernel( + const int2* adj, + const int* chart_ids, + const int M, + int* conn_comp_ids, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get adjacent faces + int f0 = adj[tid].x; + int f1 = adj[tid].y; + int c0 = chart_ids[f0]; + int c1 = chart_ids[f1]; + if (c0 != c1) return; + + // union + // find roots + int root0 = conn_comp_ids[f0]; + while (root0 != conn_comp_ids[root0]) { + root0 = conn_comp_ids[root0]; + } + int root1 = conn_comp_ids[f1]; + while (root1 != conn_comp_ids[root1]) { + root1 = conn_comp_ids[root1]; + } + + if (root0 == root1) return; + + int high = max(root0, root1); + int low = min(root0, root1); + atomicMin(&conn_comp_ids[high], low); + *end_flag = 0; +} + + +static void reassign_chart_ids( + CuMesh& mesh +) { + size_t F = mesh.faces.size; + size_t M = mesh.manifold_face_adj.size; + + mesh.temp_storage.resize(F * sizeof(int)); // Use as parent for DSU + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + reinterpret_cast<int*>(mesh.temp_storage.ptr), + F + ); + CUDA_CHECK(hipGetLastError()); + + int* cu_end_flag; int h_end_flag; + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); + do { + h_end_flag = 1; + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); + + // Hook + hipLaunchKernelGGL(( hook_edges_if_same_chart_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.manifold_face_adj.ptr, + mesh.atlas_chart_ids.ptr, + M, + reinterpret_cast<int*>(mesh.temp_storage.ptr), + cu_end_flag + ); + CUDA_CHECK(hipGetLastError()); + + // Compress + hipLaunchKernelGGL(( compress_components_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + reinterpret_cast<int*>(mesh.temp_storage.ptr), + F + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); + } while (h_end_flag == 0); + CUDA_CHECK(hipFree(cu_end_flag)); + + swap_buffers(mesh.atlas_chart_ids, mesh.temp_storage); + mesh.atlas_num_charts = compress_ids(mesh.atlas_chart_ids.ptr, F, mesh.cub_temp_storage); +} + + +static __global__ void expand_chart_ids_and_vertex_ids_kernel( + const int* sorted_chart_ids, + const int* sorted_face_idx, + const int3* faces, + const size_t F, + uint64_t* pack +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int c = sorted_chart_ids[tid]; + int f = sorted_face_idx[tid]; + int3 face = faces[f]; + int v0 = face.x; + int v1 = face.y; + int v2 = face.z; + + pack[3 * tid + 0] = (uint64_t(c) << 32) | v0; + pack[3 * tid + 1] = (uint64_t(c) << 32) | v1; + pack[3 * tid + 2] = (uint64_t(c) << 32) | v2; +} + + +static __global__ void unpack_faces_kernel( + const uint64_t* pack, + const size_t F, + int3* faces +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + int3 face; + face.x = int(pack[3 * tid + 0]); + face.y = int(pack[3 * tid + 1]); + face.z = int(pack[3 * tid
+ 2]); + faces[tid] = face; +} + + +static __global__ void unpack_vertex_ids_kernel( + const uint64_t* pack, + const size_t N, + int* vertex_ids, + int* vertex_offsets +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + vertex_ids[tid] = int(pack[tid] & 0xFFFFFFFF); + + int cur_c = int(pack[tid] >> 32); + if (tid == 0) { + vertex_offsets[0] = 0; + } + else { + int prev_c = int(pack[tid - 1] >> 32); + if (cur_c != prev_c) { + vertex_offsets[cur_c] = tid; + } + } + if (tid == N - 1) { + vertex_offsets[cur_c + 1] = N; + } +} + + +void construct_chart_mesh( + CuMesh& mesh +) { + size_t F = mesh.faces.size; + + // 1. Sort faces by chart id + mesh.atlas_chart_faces.resize(F); + mesh.atlas_chart_faces_offset.resize(mesh.atlas_num_charts + 1); + int* cu_sorted_chart_ids; + int* cu_face_idx; + int* cu_sorted_face_idx; + CUDA_CHECK(hipMalloc(&cu_sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_face_idx, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_idx, F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_face_idx, + F + ); + CUDA_CHECK(hipGetLastError()); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, + cu_face_idx, cu_sorted_face_idx, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, + cu_face_idx, cu_sorted_face_idx, + F + )); + CUDA_CHECK(hipFree(cu_face_idx)); + // 2. RLE for chart size + int* cu_chart_size; + int* cu_num_chart; + int* cu_unique_chart_ids; + CUDA_CHECK(hipMalloc(&cu_chart_size, (mesh.atlas_num_charts + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_chart, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, mesh.atlas_num_charts * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, + F + )); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); + CUDA_CHECK(hipFree(cu_num_chart)); + // 3. Exclusive scan for chart face offset + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_chart_size, mesh.atlas_chart_faces_offset.ptr, + mesh.atlas_num_charts + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_chart_size, mesh.atlas_chart_faces_offset.ptr, + mesh.atlas_num_charts + 1 + )); + CUDA_CHECK(hipFree(cu_chart_size)); + // 4. Expand chart ids and vertex ids + uint64_t* cu_pack; + CUDA_CHECK(hipMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); + hipLaunchKernelGGL(( expand_chart_ids_and_vertex_ids_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_chart_ids, + cu_sorted_face_idx, + mesh.faces.ptr, + F, + cu_pack + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_chart_ids)); + CUDA_CHECK(hipFree(cu_sorted_face_idx)); + // 5. 
Compress pair to construct all maps + uint64_t* cu_inverse_pack; + CUDA_CHECK(hipMalloc(&cu_inverse_pack, 3 * F * sizeof(uint64_t))); + int new_num_vertices = compress_ids( + cu_pack, + 3 * F, + mesh.cub_temp_storage, + cu_inverse_pack + ); + mesh.atlas_chart_vertex_map.resize(new_num_vertices); + mesh.atlas_chart_vertex_offset.resize(mesh.atlas_num_charts + 1); + hipLaunchKernelGGL(( unpack_vertex_ids_kernel), dim3((new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_inverse_pack, + new_num_vertices, + mesh.atlas_chart_vertex_map.ptr, + mesh.atlas_chart_vertex_offset.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_inverse_pack)); + hipLaunchKernelGGL(( unpack_faces_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_pack, + F, + mesh.atlas_chart_faces.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_pack)); +} + + +void CuMesh::compute_charts( + float threshold_cone_half_angle_rad, + int refine_iterations, + int global_iterations, + float smooth_strength, + float area_penalty_weight, + float perimeter_area_ratio_weight +) { + if (this->manifold_face_adj.is_empty()) { + this->get_manifold_face_adjacency(); + } + if (this->face_normals.is_empty()) { + this->compute_face_normals(); + } + if (this->face_areas.is_empty()) { + this->compute_face_areas(); + } + + // Initialize chart id + size_t F = this->faces.size; + this->atlas_chart_ids.resize(F); + this->atlas_num_charts = F; + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_ids.ptr, + F + ); + CUDA_CHECK(hipGetLastError()); + + // Main Iteration: Collapse and Refine + int* cu_end_flag; int h_end_flag; + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); + for (int i = 0; i < global_iterations; i++) { + while (true) { + h_end_flag = 1; + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); + + // 1. Compute chart connectivity + get_chart_connectivity(*this); + if (this->atlas_chart_adj.size == 0) break; + + // 2. Compute normal cones + compute_chart_normal_cones(*this); + + // 3. Compute chart adjacency cost + size_t E = this->atlas_chart_adj.size; + this->edge_collapse_costs.resize(E); + hipLaunchKernelGGL(( compute_chart_adjacency_cost_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_adj.ptr, + this->atlas_chart_normal_cones.ptr, + this->atlas_chart_adj_length.ptr, + this->atlas_chart_perims.ptr, + this->atlas_chart_areas.ptr, + area_penalty_weight, + perimeter_area_ratio_weight, + E, + this->edge_collapse_costs.ptr + ); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); + + // 4. Propagate costs + size_t C = this->atlas_num_charts; + this->propagated_costs.resize(C); + hipLaunchKernelGGL(( propagate_cost_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart2edge.ptr, + this->atlas_chart2edge_offset.ptr, + this->edge_collapse_costs.ptr, + C, + this->propagated_costs.ptr + ); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); + + // 5. 
// 5. Collapse edges + this->vertices_map.resize(C); // store collapse map + hipLaunchKernelGGL(( arange_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices_map.ptr, + C + ); + hipLaunchKernelGGL(( collapse_edges_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_adj.ptr, + this->edge_collapse_costs.ptr, + this->propagated_costs.ptr, + threshold_cone_half_angle_rad, + E, + this->vertices_map.ptr, + this->atlas_chart_normal_cones.ptr, + cu_end_flag + ); + CUDA_CHECK(hipGetLastError()); CUDA_CHECK(hipDeviceSynchronize()); + + // End of iteration + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); + if (h_end_flag == 1) break; + + // 6. Compress chart ids + this->atlas_num_charts = compress_ids(this->vertices_map.ptr, C, this->cub_temp_storage); + this->temp_storage.resize(F * sizeof(int)); + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices_map.ptr, + this->atlas_chart_ids.ptr, + F, + reinterpret_cast<int*>(this->temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); CUDA_CHECK(hipDeviceSynchronize()); + swap_buffers(this->atlas_chart_ids, this->temp_storage); + } + + // Refine charts + for (int j = 0; j < refine_iterations; j++) { + compute_chart_normal_cones(*this); + this->temp_storage.resize(F * sizeof(int)); + hipLaunchKernelGGL(( refine_charts_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_normal_cones.ptr, + this->face_normals.ptr, + this->vertices.ptr, + this->edges.ptr, + this->face2edge.ptr, + this->edge2face.ptr, + this->edge2face_offset.ptr, + F, + smooth_strength, + this->atlas_chart_ids.ptr, + reinterpret_cast<int*>(this->temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); + swap_buffers(this->atlas_chart_ids, this->temp_storage); + this->atlas_num_charts = compress_ids(this->atlas_chart_ids.ptr, F, this->cub_temp_storage); + } + + // After refinement, charts may become disconnected, so we need to re-assign chart ids + reassign_chart_ids(*this); + } + CUDA_CHECK(hipFree(cu_end_flag)); + + // Finalizing: calculate vmap, chart face and chart face offset + construct_chart_mesh(*this); +} + + +std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> CuMesh::read_atlas_charts() { + auto chart_ids = torch::empty({ static_cast<int64_t>(this->faces.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_ids.data_ptr<int>(), + this->atlas_chart_ids.ptr, + this->faces.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto vertex_map = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_map.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + vertex_map.data_ptr<int>(), + this->atlas_chart_vertex_map.ptr, + this->atlas_chart_vertex_map.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto chart_faces = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces.size), 3 }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_faces.data_ptr<int>(), + this->atlas_chart_faces.ptr, + this->atlas_chart_faces.size * 3 * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto chart_vertex_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_vertex_offset.data_ptr<int>(), + this->atlas_chart_vertex_offset.ptr, + this->atlas_chart_vertex_offset.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto chart_face_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_face_offset.data_ptr<int>(), + this->atlas_chart_faces_offset.ptr, + this->atlas_chart_faces_offset.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + return std::make_tuple(this->atlas_num_charts, chart_ids, vertex_map, chart_faces, chart_vertex_offset, chart_face_offset); +} + + +} // namespace cumesh \ No newline at end of file
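(Editorial note) Every cub/hipcub device-wide call in this patch uses the two-phase temp-storage idiom: the first call passes a null temp pointer and only reports the required scratch size; the caller then allocates and repeats the same call to do the work. A minimal self-contained sketch of the pattern, assuming hipcub and raw device pointers (this helper is hypothetical, not part of the patch):

    #include <hipcub/hipcub.hpp>

    // Hypothetical helper, for illustration only.
    void sort_ints_two_phase(int* d_keys_in, int* d_keys_out, int num_items) {
        void* d_temp_storage = nullptr;
        size_t temp_storage_bytes = 0;
        // Phase 1: with a null temp pointer nothing executes; the call only
        // writes the scratch size it needs into temp_storage_bytes.
        hipcub::DeviceRadixSort::SortKeys(nullptr, temp_storage_bytes,
                                          d_keys_in, d_keys_out, num_items);
        (void)hipMalloc(&d_temp_storage, temp_storage_bytes);
        // Phase 2: the identical call with real scratch storage does the sort.
        hipcub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                          d_keys_in, d_keys_out, num_items);
        (void)hipFree(d_temp_storage);
    }

CuMesh amortizes the allocation by keeping cub_temp_storage alive across calls and only growing it, which is why every call site in the patch appears twice.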
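(Editorial note) Both remove_faces variants are stream compaction via DeviceSelect::Flagged, which copies item i iff flags[i] is non-zero and writes the surviving count to device memory, e.g. (assuming hipcub):

    // keeps faces whose flag != 0; *d_num_kept receives the new face count
    hipcub::DeviceSelect::Flagged(d_temp, temp_bytes, d_faces, d_flags, d_out, d_num_kept, F);

The count then has to be copied back to the host (the hipMemcpy below) before faces.resize() can run, which makes each removal a synchronization point.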
@@ -66,25 +75,25 @@ void CuMesh::_remove_faces(uint8_t* face_mask) { size_t F = this->faces.size; size_t temp_storage_bytes = 0; int *cu_new_num_faces; int3 *cu_new_faces; - CUDA_CHECK(cudaMalloc(&cu_new_num_faces, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_new_faces, F * sizeof(int3))); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipMalloc(&cu_new_num_faces, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_faces, F * sizeof(int3))); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, F )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, F )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), hipMemcpyDeviceToHost)); this->faces.resize(new_num_faces); - CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaFree(cu_new_num_faces)); - CUDA_CHECK(cudaFree(cu_new_faces)); + CUDA_CHECK(hipMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_num_faces)); + CUDA_CHECK(hipFree(cu_new_faces)); this->remove_unreferenced_vertices(); } @@ -139,28 +148,28 @@ void CuMesh::remove_unreferenced_vertices() { // Mark referenced vertices int* cu_vertex_is_referenced; - CUDA_CHECK(cudaMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); + CUDA_CHECK(hipMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->faces.ptr, F, cu_vertex_is_referenced ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Get vertices map size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_vertex_is_referenced, V+1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_vertex_is_referenced, V+1 )); int new_num_vertices; - CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), hipMemcpyDeviceToHost)); // Compress vertices this->temp_storage.resize(new_num_vertices * sizeof(float3)); @@ -170,7 +179,7 @@ void CuMesh::remove_unreferenced_vertices() { V, reinterpret_cast<float3*>(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(this->temp_storage, this->vertices); // Update faces @@ -179,8 +188,8 @@ void CuMesh::remove_unreferenced_vertices() { F, this->faces.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_vertex_is_referenced)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_is_referenced)); // Delete all cached info since mesh has changed this->clear_cache(); @@ -229,10 +238,17 @@ static __global__ void select_first_in_each_group_kernel( struct int3_decomposer { +#ifdef __HIP_PLATFORM_AMD__ + __host__ __device__ ::rocprim::tuple<int&, int&, int&> operator()(int3& key) const + { + return ::rocprim::tie(key.x, key.y, key.z); + } +#else __host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const { return {key.x, key.y, key.z}; } +#endif }; @@ -242,29 +258,29 @@ void CuMesh::remove_duplicate_faces() { // Create a temporary sorted copy of faces for duplicate detection // Do NOT modify the original faces to preserve vertex order and normals int3 *cu_sorted_faces; - CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3))); - CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(hipMalloc(&cu_sorted_faces, F * sizeof(int3))); + CUDA_CHECK(hipMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), hipMemcpyDeviceToDevice)); // Sort vertices within each face (in the temporary copy) sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_faces, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Sort all faces globally by their sorted vertex indices size_t temp_storage_bytes = 0; int *cu_sorted_face_indices; - CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); +
CUDA_CHECK(hipMalloc(&cu_sorted_face_indices, F * sizeof(int))); arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int *cu_sorted_indices_output; int3 *cu_sorted_faces_output; - CUDA_CHECK(cudaMalloc(&cu_sorted_indices_output, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_sorted_faces_output, F * sizeof(int3))); + CUDA_CHECK(hipMalloc(&cu_sorted_indices_output, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_faces_output, F * sizeof(int3))); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, @@ -272,45 +288,45 @@ void CuMesh::remove_duplicate_faces() { int3_decomposer{} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, F, int3_decomposer{} )); - CUDA_CHECK(cudaFree(cu_sorted_faces)); - CUDA_CHECK(cudaFree(cu_sorted_face_indices)); + CUDA_CHECK(hipFree(cu_sorted_faces)); + CUDA_CHECK(hipFree(cu_sorted_face_indices)); // Select first in each group of duplicate faces (based on sorted faces) uint8_t* cu_face_mask_sorted; - CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_faces_output, F, cu_face_mask_sorted ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_faces_output)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_faces_output)); // Map the mask back to original face order using scatter // scatter: output[indices[i]] = values[i] // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] uint8_t* cu_face_mask_original; - CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_indices_output, // indices: sorted_position -> original_idx cu_face_mask_sorted, // values: mask at sorted_position F, cu_face_mask_original // output: mask at original position ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_face_mask_sorted)); - CUDA_CHECK(cudaFree(cu_sorted_indices_output)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_face_mask_sorted)); + CUDA_CHECK(hipFree(cu_sorted_indices_output)); // Select faces to keep (preserving original vertex order) this->_remove_faces(cu_face_mask_original); - CUDA_CHECK(cudaFree(cu_face_mask_original)); + CUDA_CHECK(hipFree(cu_face_mask_original)); } @@ -355,7 +371,7 @@ void CuMesh::remove_degenerate_faces(float abs_thresh, float rel_thresh) { size_t F = this->faces.size; uint8_t* cu_face_mask; - CUDA_CHECK(cudaMalloc(&cu_face_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_mask, F * sizeof(uint8_t))); mark_degenerate_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->vertices.ptr, this->faces.ptr, @@ -363,10 +379,10 @@ void CuMesh::remove_degenerate_faces(float abs_thresh, float rel_thresh) { F, cu_face_mask ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); this->_remove_faces(cu_face_mask); - 
CUDA_CHECK(cudaFree(cu_face_mask)); + CUDA_CHECK(hipFree(cu_face_mask)); } @@ -450,7 +466,7 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Compute loop boundary lengths float* cu_loop_boundary_lengths; - CUDA_CHECK(cudaMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->vertices.ptr, this->edges.ptr, @@ -458,13 +474,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) { E, cu_loop_boundary_lengths ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Segment sum size_t temp_storage_bytes = 0; float *cu_bound_loop_perimeters; - CUDA_CHECK(cudaMalloc(&cu_bound_loop_perimeters, L * sizeof(float))); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipMalloc(&cu_bound_loop_perimeters, L * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, @@ -472,18 +488,18 @@ void CuMesh::fill_holes(float max_hole_perimeter) { this->loop_boundaries_offset.ptr + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, this->loop_boundaries_offset.ptr, this->loop_boundaries_offset.ptr + 1 )); - CUDA_CHECK(cudaFree(cu_loop_boundary_lengths)); + CUDA_CHECK(hipFree(cu_loop_boundary_lengths)); // Mask small loops uint8_t* cu_bound_loop_mask; - CUDA_CHECK(cudaMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_bound_loop_perimeters, max_hole_perimeter, @@ -491,62 +507,62 @@ void CuMesh::fill_holes(float max_hole_perimeter) { LessThanOp(), cu_bound_loop_mask ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_perimeters)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_perimeters)); // Compress bound loops size int* cu_bound_loops_cnt; - CUDA_CHECK(cudaMalloc(&cu_bound_loops_cnt, L * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_loops_cnt, L * sizeof(int))); diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->loop_boundaries_offset.ptr, L, cu_bound_loops_cnt ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int *cu_new_loop_boundaries_cnt, *cu_new_num_bound_loops; - CUDA_CHECK(cudaMalloc(&cu_new_loop_boundaries_cnt, (L+1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_new_num_bound_loops, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_cnt, (L+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_bound_loops, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, L )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, L )); int new_num_bound_loops; - CUDA_CHECK(cudaMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_bound_loops_cnt)); - 
CUDA_CHECK(cudaFree(cu_new_num_bound_loops)); + CUDA_CHECK(hipMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_loops_cnt)); + CUDA_CHECK(hipFree(cu_new_num_bound_loops)); if (new_num_bound_loops == 0) { - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); - CUDA_CHECK(cudaFree(cu_bound_loop_mask)); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); return; } // Get loop ids of loop boundaries int* cu_loop_bound_loop_ids; - CUDA_CHECK(cudaMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); + CUDA_CHECK(hipMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); if (L > 1) { set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->loop_boundaries_offset.ptr + 1, L - 1, cu_loop_bound_loop_ids ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, cu_loop_bound_loop_ids, E )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_loop_ids, E @@ -554,71 +570,71 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Mask loop boundaries uint8_t* cu_loop_boundary_mask; - CUDA_CHECK(cudaMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_bound_loop_mask, cu_loop_bound_loop_ids, E, cu_loop_boundary_mask ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_mask)); - CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); + CUDA_CHECK(hipFree(cu_loop_bound_loop_ids)); // Compress loop boundaries int *cu_new_loop_boundaries, *cu_new_num_loop_boundaries; - CUDA_CHECK(cudaMalloc(&cu_new_loop_boundaries, E * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_new_num_loop_boundaries, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_loop_boundaries, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, E )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, E )); int new_num_loop_boundaries; - CUDA_CHECK(cudaMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_new_num_loop_boundaries)); - CUDA_CHECK(cudaFree(cu_loop_boundary_mask)); + CUDA_CHECK(hipMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_new_num_loop_boundaries)); + CUDA_CHECK(hipFree(cu_loop_boundary_mask)); // Reconstruct new bound loops int* cu_new_loop_boundaries_offset; - CUDA_CHECK(cudaMalloc(&cu_new_loop_boundaries_offset, 
(new_num_loop_boundaries+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_offset, (new_num_loop_boundaries+1) * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, new_num_bound_loops + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, new_num_bound_loops + 1 )); int* cu_new_loop_bound_loop_ids; - CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(hipMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); if (new_num_bound_loops > 1) { set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_new_loop_boundaries_offset+1, new_num_bound_loops-1, cu_new_loop_bound_loop_ids ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, cu_new_loop_bound_loop_ids, new_num_loop_boundaries )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_bound_loop_ids, new_num_loop_boundaries @@ -626,7 +642,7 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Calculate new vertex positions as average of loop vertices Vec3f* cu_new_loop_bound_centers; - CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->vertices.ptr, this->edges.ptr, @@ -634,11 +650,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) { new_num_loop_boundaries, cu_new_loop_bound_centers ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); Vec3f* cu_new_vertices; - CUDA_CHECK(cudaMalloc(&cu_new_vertices, new_num_bound_loops * sizeof(Vec3f))); + CUDA_CHECK(hipMalloc(&cu_new_vertices, new_num_bound_loops * sizeof(Vec3f))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, @@ -646,22 +662,22 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_new_loop_boundaries_offset + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, cu_new_loop_boundaries_offset, cu_new_loop_boundaries_offset + 1 )); - CUDA_CHECK(cudaFree(cu_new_loop_bound_centers)); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset)); + CUDA_CHECK(hipFree(cu_new_loop_bound_centers)); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_offset)); 
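// (Editorial note, not part of the patch) The new hole-filling vertex is the
// mean of the hole's edge midpoints: the segmented reduce above computes
//   center[k] = sum over edges e in loop k of midpoint(e)
// and inplace_div_kernel below divides by cu_new_loop_boundaries_cnt[k], the
// number of boundary edges in kept loop k; e.g. counts [4, 3] turn the summed
// centers into true per-loop averages center[k] / counts[k].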
inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_new_vertices, cu_new_loop_boundaries_cnt, new_num_bound_loops ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); // Update mesh this->vertices.extend(new_num_bound_loops); @@ -671,8 +687,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) { new_num_bound_loops, this->vertices.ptr + V ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices)); connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->edges.ptr, cu_new_loop_boundaries, @@ -681,9 +697,9 @@ void CuMesh::fill_holes(float max_hole_perimeter) { V, this->faces.ptr + F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries)); - CUDA_CHECK(cudaFree(cu_new_loop_bound_loop_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries)); + CUDA_CHECK(hipFree(cu_new_loop_bound_loop_ids)); // Delete all cached info since mesh has changed this->clear_cache(); @@ -772,25 +788,25 @@ void CuMesh::repair_non_manifold_edges(){ // Construct vertex adjacency pairs with manifold edges int2* cu_vertex_adj_pairs; - CUDA_CHECK(cudaMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); + CUDA_CHECK(hipMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->manifold_face_adj.ptr, this->faces.ptr, cu_vertex_adj_pairs, M ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Iterative Hook and Compress int* cu_vertex_ids; - CUDA_CHECK(cudaMalloc(&cu_vertex_ids, 3 * F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vertex_ids, 3 * F * sizeof(int))); arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, 3 * F); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -799,25 +815,25 @@ void CuMesh::repair_non_manifold_edges(){ cu_vertex_ids, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_vertex_ids, 3 * F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); - CUDA_CHECK(cudaFree(cu_vertex_adj_pairs)); + CUDA_CHECK(hipFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_vertex_adj_pairs)); // Construct new faces int* cu_new_vertices_ids; - CUDA_CHECK(cudaMalloc(&cu_new_vertices_ids, 3 * F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_vertices_ids, 3 * F * sizeof(int))); int new_V = compress_ids(cu_vertex_ids, 3 * F, this->cub_temp_storage, cu_new_vertices_ids); float3* cu_new_vertices; - CUDA_CHECK(cudaMalloc(&cu_new_vertices, new_V * sizeof(float3))); + 
CUDA_CHECK(hipMalloc(&cu_new_vertices, new_V * sizeof(float3))); index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_new_vertices_ids, this->faces.ptr, @@ -825,15 +841,15 @@ void CuMesh::repair_non_manifold_edges(){ new_V, cu_new_vertices ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices_ids)); this->vertices.resize(new_V); - CUDA_CHECK(cudaMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaFree(cu_new_vertices)); + CUDA_CHECK(hipMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_vertices)); this->faces.resize(F); copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, F, this->faces.ptr); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_vertex_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_ids)); // Delete all cached info since mesh has changed this->clear_cache(); @@ -886,8 +902,8 @@ void CuMesh::remove_non_manifold_faces() { // Initialize face mask (1 = keep all faces initially) uint8_t* cu_face_keep_mask; - CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); - CUDA_CHECK(cudaMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); // Mark faces on non-manifold edges for removal mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -897,11 +913,11 @@ void CuMesh::remove_non_manifold_faces() { E, cu_face_keep_mask ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Remove marked faces this->_remove_faces(cu_face_keep_mask); - CUDA_CHECK(cudaFree(cu_face_keep_mask)); + CUDA_CHECK(hipFree(cu_face_keep_mask)); // Clear cache since mesh has changed this->clear_cache(); @@ -930,16 +946,16 @@ void CuMesh::remove_small_connected_components(float min_area) { size_t temp_storage_bytes = 0; int *cu_sorted_conn_comp_ids; float *cu_sorted_face_areas; - CUDA_CHECK(cudaMalloc(&cu_sorted_conn_comp_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_sorted_face_areas, F * sizeof(float))); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipMalloc(&cu_sorted_conn_comp_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, F )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, @@ -950,48 +966,48 @@ void CuMesh::remove_small_connected_components(float min_area) { int* cu_conn_comp_num_faces; int* cu_num_conn_comps; int* cu_unique_conn_comp_ids; // Not needed, but we need to pass a valid pointer. 
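// (Editorial note, not part of the patch) Encode + ExclusiveSum below is the
// usual recipe for turning sorted ids into CSR-style segment offsets, e.g.
//   sorted ids   : [0, 0, 0, 2, 2, 5]
//   Encode       -> unique = [0, 2, 5], counts = [3, 2, 1], num_runs = 3
//   ExclusiveSum -> offsets = [0, 3, 5, 6]
// so the faces of component k occupy [offsets[k], offsets[k+1]) in the sorted
// arrays, which is exactly the layout DeviceSegmentedReduce::Sum consumes in
// step 3.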
- CUDA_CHECK(cudaMalloc(&cu_conn_comp_num_faces, (this->num_conn_comps + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_conn_comps, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_unique_conn_comp_ids, (this->num_conn_comps + 1) * sizeof(int))); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipMalloc(&cu_conn_comp_num_faces, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_conn_comps, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_conn_comp_ids, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, F )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, F )); int num_conn_comps; - CUDA_CHECK(cudaMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_conn_comps)); - CUDA_CHECK(cudaFree(cu_sorted_conn_comp_ids)); - CUDA_CHECK(cudaFree(cu_unique_conn_comp_ids)); + CUDA_CHECK(hipMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_conn_comps)); + CUDA_CHECK(hipFree(cu_sorted_conn_comp_ids)); + CUDA_CHECK(hipFree(cu_unique_conn_comp_ids)); // 3. Compute the total area for each connected component via segmented reduction. int* cu_conn_comp_offsets; - CUDA_CHECK(cudaMalloc(&cu_conn_comp_offsets, (num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_conn_comp_offsets, (num_conn_comps + 1) * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_conn_comp_num_faces, cu_conn_comp_offsets, num_conn_comps + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_conn_comp_num_faces, cu_conn_comp_offsets, num_conn_comps + 1 )); - CUDA_CHECK(cudaFree(cu_conn_comp_num_faces)); + CUDA_CHECK(hipFree(cu_conn_comp_num_faces)); float *cu_conn_comp_areas; - CUDA_CHECK(cudaMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, @@ -999,19 +1015,19 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_conn_comp_offsets + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, cu_conn_comp_offsets, cu_conn_comp_offsets + 1 )); - CUDA_CHECK(cudaFree(cu_sorted_face_areas)); - CUDA_CHECK(cudaFree(cu_conn_comp_offsets)); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); + CUDA_CHECK(hipFree(cu_conn_comp_offsets)); // 4. Create a "keep" mask for components with area >= min_area. 
uint8_t* cu_comp_keep_mask; - CUDA_CHECK(cudaMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_conn_comp_areas, min_area, @@ -1019,12 +1035,12 @@ void CuMesh::remove_small_connected_components(float min_area) { GreaterThanOrEqualToOp(), cu_comp_keep_mask ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_conn_comp_areas)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_conn_comp_areas)); // 5. Propagate the component "keep" mask to every face. uint8_t* cu_face_keep_mask; - CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); // Use an index_kernel (gather operation) index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_comp_keep_mask, // Source array @@ -1032,12 +1048,12 @@ void CuMesh::remove_small_connected_components(float min_area) { F, cu_face_keep_mask // Destination array ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_comp_keep_mask)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_comp_keep_mask)); // 6. Select the faces to keep and update the mesh. this->_remove_faces(cu_face_keep_mask); - CUDA_CHECK(cudaFree(cu_face_keep_mask)); + CUDA_CHECK(hipFree(cu_face_keep_mask)); } @@ -1164,25 +1180,25 @@ void CuMesh::unify_face_orientations() { // 1. Compute the flipped flag for each edge. uint8_t* cu_flipped; - CUDA_CHECK(cudaMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t))); get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->manifold_face_adj.ptr, this->faces.ptr, this->manifold_face_adj.size, cu_flipped ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 2. Hook edges with flipped flag. int* conn_comp_with_flip; - CUDA_CHECK(cudaMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int))); + CUDA_CHECK(hipMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int))); arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(conn_comp_with_flip, this->faces.size, 2); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -1192,17 +1208,17 @@ void CuMesh::unify_face_orientations() { conn_comp_with_flip, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( conn_comp_with_flip, this->faces.size ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // 3. Flip the orientation of the faces. 
inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -1210,9 +1226,9 @@ void CuMesh::unify_face_orientations() { conn_comp_with_flip, this->faces.size ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_flipped)); - CUDA_CHECK(cudaFree(conn_comp_with_flip)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_flipped)); + CUDA_CHECK(hipFree(conn_comp_with_flip)); } diff --git a/src/clean_up.hip b/src/clean_up.hip new file mode 100644 index 0000000..8bd58ef --- /dev/null +++ b/src/clean_up.hip @@ -0,0 +1,1237 @@ +// !!! This is a file automatically generated by hipify!!! +#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#include "shared_hip.h" +#ifdef __HIP_PLATFORM_AMD__ +#include <hipcub/hipcub.hpp> +#else +#include <cub/cub.cuh> +#endif +#ifdef __HIP_PLATFORM_AMD__ +#include <rocprim/rocprim.hpp> +#else +#include <cuda/std/tuple> +#endif + + +namespace cumesh { + + +static __global__ void copy_vec3f_to_float3_kernel( + const Vec3f* vec3f, + const size_t N, + float3* output +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + output[tid] = make_float3(vec3f[tid].x, vec3f[tid].y, vec3f[tid].z); +} + + +template <typename T, typename U> +static __global__ void copy_T_to_T3_kernel( + const T* input, + const size_t N, + U* output +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + output[tid] = { input[3 * tid], input[3 * tid + 1], input[3 * tid + 2] }; +} + + +void CuMesh::remove_faces(torch::Tensor& face_mask) { + size_t F = this->faces.size; + + size_t temp_storage_bytes = 0; + int *cu_new_num_faces; + int3 *cu_new_faces; + CUDA_CHECK(hipMalloc(&cu_new_num_faces, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_faces, F * sizeof(int3))); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + this->faces.ptr, face_mask.data_ptr<bool>(), cu_new_faces, cu_new_num_faces, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->faces.ptr, face_mask.data_ptr<bool>(), cu_new_faces, cu_new_num_faces, + F + )); + int new_num_faces; + CUDA_CHECK(hipMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), hipMemcpyDeviceToHost)); + this->faces.resize(new_num_faces); + CUDA_CHECK(hipMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_num_faces)); + CUDA_CHECK(hipFree(cu_new_faces)); + + this->remove_unreferenced_vertices(); +} + + +void CuMesh::_remove_faces(uint8_t* face_mask) { + size_t F = this->faces.size; + + size_t temp_storage_bytes = 0; + int *cu_new_num_faces; + int3 *cu_new_faces; + CUDA_CHECK(hipMalloc(&cu_new_num_faces, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_faces, F * sizeof(int3))); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, + F + )); + int new_num_faces; + CUDA_CHECK(hipMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), hipMemcpyDeviceToHost)); + this->faces.resize(new_num_faces); + CUDA_CHECK(hipMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_num_faces)); + CUDA_CHECK(hipFree(cu_new_faces)); + + this->remove_unreferenced_vertices(); +} + +
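// (Editorial note, not part of the patch) hipify rewrites every triple-chevron
// launch in the .cu sources into the portable hipLaunchKernelGGL macro used
// throughout this file; the two spellings below launch identically (grid and
// block are dim3, followed by dynamic shared memory bytes and a stream):
//
//   my_kernel<<<grid, block, 0, 0>>>(arg0, arg1);
//   hipLaunchKernelGGL(my_kernel, grid, block, 0, 0, arg0, arg1);
//
// The doubled parenthesis in hipLaunchKernelGGL(( name), ...) seen throughout
// is how the hipify tool wraps the kernel name; it is syntactically harmless.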
+static __global__ void set_vertex_is_referenced( + const int3* faces, + const size_t F, + int* vertex_is_referenced +) { + const int fid = blockIdx.x * blockDim.x + threadIdx.x; + if (fid >= F) return; + int3 face = faces[fid]; + vertex_is_referenced[face.x] = 1; + vertex_is_referenced[face.y] = 1; + vertex_is_referenced[face.z] = 1; +} + + +static __global__ void compress_vertices_kernel( + const int* vertices_map, + const float3* old_vertices, + const int V, + float3* new_vertices +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + int new_id = vertices_map[tid]; + int is_kept = vertices_map[tid + 1] == new_id + 1; + if (is_kept) { + new_vertices[new_id] = old_vertices[tid]; + } +} + + +static __global__ void remap_faces_kernel( + const int* vertices_map, + const int F, + int3* faces +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + faces[tid].x = vertices_map[faces[tid].x]; + faces[tid].y = vertices_map[faces[tid].y]; + faces[tid].z = vertices_map[faces[tid].z]; +} + + +void CuMesh::remove_unreferenced_vertices() { + size_t V = this->vertices.size; + size_t F = this->faces.size; + + // Mark referenced vertices + int* cu_vertex_is_referenced; + CUDA_CHECK(hipMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); + CUDA_CHECK(hipMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); + hipLaunchKernelGGL(( set_vertex_is_referenced), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->faces.ptr, + F, + cu_vertex_is_referenced + ); + CUDA_CHECK(hipGetLastError()); + + // Get vertices map + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_vertex_is_referenced, V+1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_vertex_is_referenced, V+1 + )); + int new_num_vertices; + CUDA_CHECK(hipMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), hipMemcpyDeviceToHost)); + + // Compress vertices + this->temp_storage.resize(new_num_vertices * sizeof(float3)); + hipLaunchKernelGGL(( compress_vertices_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_is_referenced, + this->vertices.ptr, + V, + reinterpret_cast<float3*>(this->temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); + swap_buffers(this->temp_storage, this->vertices); + + // Update faces + hipLaunchKernelGGL(( remap_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_is_referenced, + F, + this->faces.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_is_referenced)); + + // Delete all cached info since mesh has changed + this->clear_cache(); +} + + +static __global__ void sort_faces_kernel( + int3* faces, + const size_t F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int3 face = faces[tid]; + int tmp; + + // bubble sort 3 elements (x, y, z) + if (face.x > face.y) { tmp = face.x; face.x = face.y; face.y = tmp; } + if (face.y > face.z) { tmp = face.y; face.y = face.z; face.z = tmp; } + if (face.x > face.y) { tmp = face.x; face.x = face.y; face.y = tmp; } + + faces[tid] = face; +} + +
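// (Editorial note, not part of the patch) The three compare-swaps in
// sort_faces_kernel form the minimal sorting network for 3 elements, putting
// each face's vertex ids into canonical ascending order, e.g.
//   (7, 2, 5) -> (2, 5, 7)   and   (5, 7, 2) -> (2, 5, 7)
// so duplicate faces become bytewise identical regardless of winding. The
// global radix sort in remove_duplicate_faces then groups duplicates
// adjacently (using the int3_decomposer below to present the struct as one
// wide key), and the next kernel keeps only the first face of each run.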
+static __global__ void select_first_in_each_group_kernel( + const int3* faces, + const size_t F, + uint8_t* face_mask +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + if (tid == 0) { + face_mask[tid] = 1; + } else { + int3 face = faces[tid]; + int3 prev_face = faces[tid-1]; + if (face.x == prev_face.x && face.y == prev_face.y && face.z == prev_face.z) { + face_mask[tid] = 0; + } else { + face_mask[tid] = 1; + } + } +} + + +struct int3_decomposer +{ +#ifdef __HIP_PLATFORM_AMD__ + __host__ __device__ ::rocprim::tuple<int&, int&, int&> operator()(int3& key) const + { + return ::rocprim::tie(key.x, key.y, key.z); + } +#else + __host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const + { + return {key.x, key.y, key.z}; + } +#endif +}; + + +void CuMesh::remove_duplicate_faces() { + size_t F = this->faces.size; + + // Create a temporary sorted copy of faces for duplicate detection + // Do NOT modify the original faces to preserve vertex order and normals + int3 *cu_sorted_faces; + CUDA_CHECK(hipMalloc(&cu_sorted_faces, F * sizeof(int3))); + CUDA_CHECK(hipMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), hipMemcpyDeviceToDevice)); + + // Sort vertices within each face (in the temporary copy) + hipLaunchKernelGGL(( sort_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_faces, + F + ); + CUDA_CHECK(hipGetLastError()); + + // Sort all faces globally by their sorted vertex indices + size_t temp_storage_bytes = 0; + int *cu_sorted_face_indices; + CUDA_CHECK(hipMalloc(&cu_sorted_face_indices, F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_sorted_face_indices, F); + CUDA_CHECK(hipGetLastError()); + + int *cu_sorted_indices_output; + int3 *cu_sorted_faces_output; + CUDA_CHECK(hipMalloc(&cu_sorted_indices_output, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_faces_output, F * sizeof(int3))); + + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + cu_sorted_faces, cu_sorted_faces_output, + cu_sorted_face_indices, cu_sorted_indices_output, + F, + int3_decomposer{} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_faces, cu_sorted_faces_output, + cu_sorted_face_indices, cu_sorted_indices_output, + F, + int3_decomposer{} + )); + CUDA_CHECK(hipFree(cu_sorted_faces)); + CUDA_CHECK(hipFree(cu_sorted_face_indices)); + + // Select first in each group of duplicate faces (based on sorted faces) + uint8_t* cu_face_mask_sorted; + CUDA_CHECK(hipMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); + hipLaunchKernelGGL(( select_first_in_each_group_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_faces_output, + F, + cu_face_mask_sorted + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_faces_output)); + + // Map the mask back to original face order using scatter + // scatter: output[indices[i]] = values[i] + // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] + uint8_t* cu_face_mask_original; + CUDA_CHECK(hipMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); + hipLaunchKernelGGL(( scatter_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_indices_output, // indices: sorted_position -> original_idx + cu_face_mask_sorted, // values: mask at sorted_position + F, + cu_face_mask_original // output: mask at original position + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_face_mask_sorted)); + CUDA_CHECK(hipFree(cu_sorted_indices_output)); + + // Select faces to keep (preserving original vertex order) + this->_remove_faces(cu_face_mask_original); +
CUDA_CHECK(hipFree(cu_face_mask_original)); +} + + +static __global__ void mark_degenerate_faces_kernel( + const float3* vertices, + const int3* faces, + const float abs_thresh, + const float rel_thresh, + const size_t F, + uint8_t* face_mask +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + int3 face = faces[tid]; + + // 1. Check if any vertex is duplicated + if (face.x == face.y || face.y == face.z || face.z == face.x) { + face_mask[tid] = 0; + return; + } + + // 2. Check if slim or zero area + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + Vec3f e0 = v1 - v0; + Vec3f e1 = v2 - v1; + Vec3f e2 = v0 - v2; + float max_edge_len = fmaxf(fmaxf(e0.norm(), e1.norm()), e2.norm()); + float area = e0.cross(e1).norm() / 2.0f; + float thresh = fminf(rel_thresh * max_edge_len * max_edge_len, abs_thresh); + if (area < thresh) { + face_mask[tid] = 0; + return; + } + + face_mask[tid] = 1; +} + + +void CuMesh::remove_degenerate_faces(float abs_thresh, float rel_thresh) { + size_t F = this->faces.size; + + uint8_t* cu_face_mask; + CUDA_CHECK(hipMalloc(&cu_face_mask, F * sizeof(uint8_t))); + hipLaunchKernelGGL(( mark_degenerate_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + abs_thresh, rel_thresh, + F, + cu_face_mask + ); + CUDA_CHECK(hipGetLastError()); + + this->_remove_faces(cu_face_mask); + CUDA_CHECK(hipFree(cu_face_mask)); +} + + +static __global__ void compute_loop_boundary_lengths( + const float3* vertices, + const uint64_t* edges, + const int* loop_boundaries, + const size_t E, + float* loop_boundary_lengths +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + uint64_t edge = edges[loop_boundaries[tid]]; + int e0 = int(edge & 0xFFFFFFFF); + int e1 = int(edge >> 32); + Vec3f v0 = Vec3f(vertices[e0]); + Vec3f v1 = Vec3f(vertices[e1]); + loop_boundary_lengths[tid] = (v1 - v0).norm(); +} + + +static __global__ void compute_loop_boundary_midpoints( + const float3* vertices, + const uint64_t* edges, + const int* loop_boundaries, + const size_t E, + Vec3f* loop_boundary_midpoints +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + uint64_t edge = edges[loop_boundaries[tid]]; + int e0 = int(edge & 0xFFFFFFFF); + int e1 = int(edge >> 32); + Vec3f v0 = Vec3f(vertices[e0]); + Vec3f v1 = Vec3f(vertices[e1]); + loop_boundary_midpoints[tid] = (v0 + v1) * 0.5f; +} + + +static __global__ void connect_new_vertices_kernel( + const uint64_t* edges, + const int* loop_boundaries, + const int* loop_bound_loop_ids, + const size_t L, + const size_t V, + int3* faces +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= L) return; + int loop_id = loop_bound_loop_ids[tid]; + int loop_boundary = loop_boundaries[tid]; + uint64_t e = edges[loop_boundary]; + int e0 = int(e & 0xFFFFFFFF); + int e1 = int(e >> 32); + int new_v_id = loop_id + V; + faces[tid] = {e0, e1, new_v_id}; +} + + +struct LessThanOp { + __device__ bool operator()(float a, float b) const { + return a < b; + } +}; + + +void CuMesh::fill_holes(float max_hole_perimeter) { + if (this->loop_boundaries.is_empty() || this->loop_boundaries_offset.is_empty()) { + this->get_boundary_loops(); + } + + size_t V = this->vertices.size; + size_t F = this->faces.size; + size_t L = this->num_bound_loops; + size_t E = this->loop_boundaries.size; + + // Early return if no boundary loops + if (L == 0 || E == 0) { + 
return; + } + + // Compute loop boundary lengths + float* cu_loop_boundary_lengths; + CUDA_CHECK(hipMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); + hipLaunchKernelGGL(( compute_loop_boundary_lengths), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->edges.ptr, + this->loop_boundaries.ptr, + E, + cu_loop_boundary_lengths + ); + CUDA_CHECK(hipGetLastError()); + + // Segment sum + size_t temp_storage_bytes = 0; + float *cu_bound_loop_perimeters; + CUDA_CHECK(hipMalloc(&cu_bound_loop_perimeters, L * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_loop_boundary_lengths, cu_bound_loop_perimeters, + L, + this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_loop_boundary_lengths, cu_bound_loop_perimeters, + L, + this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr + 1 + )); + CUDA_CHECK(hipFree(cu_loop_boundary_lengths)); + + // Mask small loops + uint8_t* cu_bound_loop_mask; + CUDA_CHECK(hipMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); + hipLaunchKernelGGL(( compare_kernel), dim3((L+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_bound_loop_perimeters, + max_hole_perimeter, + L, + LessThanOp(), + cu_bound_loop_mask + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_perimeters)); + + // Compress bound loops size + int* cu_bound_loops_cnt; + CUDA_CHECK(hipMalloc(&cu_bound_loops_cnt, L * sizeof(int))); + hipLaunchKernelGGL(( diff_kernel), dim3((L+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->loop_boundaries_offset.ptr, + L, + cu_bound_loops_cnt + ); + CUDA_CHECK(hipGetLastError()); + int *cu_new_loop_boundaries_cnt, *cu_new_num_bound_loops; + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_cnt, (L+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_bound_loops, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, + L + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, + L + )); + int new_num_bound_loops; + CUDA_CHECK(hipMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_loops_cnt)); + CUDA_CHECK(hipFree(cu_new_num_bound_loops)); + if (new_num_bound_loops == 0) { + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); + return; + } + + // Get loop ids of loop boundaries + int* cu_loop_bound_loop_ids; + CUDA_CHECK(hipMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); + CUDA_CHECK(hipMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); + if (L > 1) { + hipLaunchKernelGGL(( set_flag_kernel), dim3((L-1+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->loop_boundaries_offset.ptr + 1, L - 1, + cu_loop_bound_loop_ids + ); + CUDA_CHECK(hipGetLastError()); + } + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + nullptr, temp_storage_bytes, + cu_loop_bound_loop_ids, + E + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + 
this->cub_temp_storage.ptr, temp_storage_bytes, + cu_loop_bound_loop_ids, + E + )); + + // Mask loop boundaries + uint8_t* cu_loop_boundary_mask; + CUDA_CHECK(hipMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); + hipLaunchKernelGGL(( index_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_bound_loop_mask, + cu_loop_bound_loop_ids, + E, + cu_loop_boundary_mask + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); + CUDA_CHECK(hipFree(cu_loop_bound_loop_ids)); + + // Compress loop boundaries + int *cu_new_loop_boundaries, *cu_new_num_loop_boundaries; + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_loop_boundaries, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, + E + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, + E + )); + int new_num_loop_boundaries; + CUDA_CHECK(hipMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_new_num_loop_boundaries)); + CUDA_CHECK(hipFree(cu_loop_boundary_mask)); + + // Reconstruct new bound loops + int* cu_new_loop_boundaries_offset; + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_offset, (new_num_loop_boundaries+1) * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, + new_num_bound_loops + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, + new_num_bound_loops + 1 + )); + int* cu_new_loop_bound_loop_ids; + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(hipMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); + if (new_num_bound_loops > 1) { + hipLaunchKernelGGL(( set_flag_kernel), dim3((new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_loop_boundaries_offset+1, new_num_bound_loops-1, + cu_new_loop_bound_loop_ids + ); + CUDA_CHECK(hipGetLastError()); + } + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + nullptr, temp_storage_bytes, + cu_new_loop_bound_loop_ids, + new_num_loop_boundaries + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_new_loop_bound_loop_ids, + new_num_loop_boundaries + )); + + // Calculate new vertex positions as average of loop vertices + Vec3f* cu_new_loop_bound_centers; + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); + hipLaunchKernelGGL(( compute_loop_boundary_midpoints), dim3((new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->edges.ptr, + cu_new_loop_boundaries, + new_num_loop_boundaries, + cu_new_loop_bound_centers + ); + CUDA_CHECK(hipGetLastError()); + Vec3f* cu_new_vertices; + CUDA_CHECK(hipMalloc(&cu_new_vertices, new_num_bound_loops * 
sizeof(Vec3f))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_new_loop_bound_centers, cu_new_vertices, + new_num_bound_loops, + cu_new_loop_boundaries_offset, + cu_new_loop_boundaries_offset + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_new_loop_bound_centers, cu_new_vertices, + new_num_bound_loops, + cu_new_loop_boundaries_offset, + cu_new_loop_boundaries_offset + 1 + )); + CUDA_CHECK(hipFree(cu_new_loop_bound_centers)); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_offset)); + hipLaunchKernelGGL(( inplace_div_kernel), dim3((new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_vertices, + cu_new_loop_boundaries_cnt, + new_num_bound_loops + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); + + // Update mesh + this->vertices.extend(new_num_bound_loops); + this->faces.extend(new_num_loop_boundaries); + hipLaunchKernelGGL(( copy_vec3f_to_float3_kernel), dim3((new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_vertices, + new_num_bound_loops, + this->vertices.ptr + V + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices)); + hipLaunchKernelGGL(( connect_new_vertices_kernel), dim3((new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, + cu_new_loop_boundaries, + cu_new_loop_bound_loop_ids, + new_num_loop_boundaries, + V, + this->faces.ptr + F + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries)); + CUDA_CHECK(hipFree(cu_new_loop_bound_loop_ids)); + + // Delete all cached info since mesh has changed + this->clear_cache(); +} + + +static __global__ void construct_vertex_adj_pairs_kernel( + const int2* manifold_face_adj, + const int3* faces, + int2* vertex_adj_pairs, + const size_t M +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + const int2 adj_faces = manifold_face_adj[tid]; + const int3 face1 = faces[adj_faces.x]; + const int3 face2 = faces[adj_faces.y]; + + const int v1[3] = {face1.x, face1.y, face1.z}; + + int shared_local_indices1[2] = {0, 0}; + int shared_local_indices2[2] = {0, 0}; + int found_count = 0; + + for (int i = 0; i < 3; ++i) { + if (v1[i] == face2.x) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 0; + found_count++; + } else if (v1[i] == face2.y) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 1; + found_count++; + } else if (v1[i] == face2.z) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 2; + found_count++; + } + if (found_count == 2) { + break; + } + } + + // Only process if we found exactly 2 shared vertices (valid manifold edge) + if (found_count == 2) { + vertex_adj_pairs[2 * tid + 0] = make_int2( + 3 * adj_faces.x + shared_local_indices1[0], + 3 * adj_faces.y + shared_local_indices2[0] + ); + vertex_adj_pairs[2 * tid + 1] = make_int2( + 3 * adj_faces.x + shared_local_indices1[1], + 3 * adj_faces.y + shared_local_indices2[1] + ); + } else { + // Invalid edge, set to identity mapping + vertex_adj_pairs[2 * tid + 0] = make_int2(3 * adj_faces.x, 3 * adj_faces.x); + vertex_adj_pairs[2 * tid + 1] = make_int2(3 * adj_faces.y, 3 * adj_faces.y); + } +} + + +static __global__ void index_vertice_kernel( + const int* vertex_ids, + const int3* faces, + const float3* vertices, 
+ const size_t V, + float3* new_vertices +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + const int vid = vertex_ids[tid]; + const int3 face = faces[vid / 3]; + const int f[3] = {face.x, face.y, face.z}; + new_vertices[tid] = vertices[f[vid % 3]]; +} + + +void CuMesh::repair_non_manifold_edges(){ + // Always recompute manifold_face_adj to ensure it's up to date + // especially after operations like simplify() that modify the mesh + this->get_manifold_face_adjacency(); + + size_t F = this->faces.size; + size_t M = this->manifold_face_adj.size; + + // Construct vertex adjacency pairs with manifold edges + int2* cu_vertex_adj_pairs; + CUDA_CHECK(hipMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); + hipLaunchKernelGGL(( construct_vertex_adj_pairs_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->manifold_face_adj.ptr, + this->faces.ptr, + cu_vertex_adj_pairs, + M + ); + CUDA_CHECK(hipGetLastError()); + + // Iterative Hook and Compress + int* cu_vertex_ids; + CUDA_CHECK(hipMalloc(&cu_vertex_ids, 3 * F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((3*F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_vertex_ids, 3 * F); + CUDA_CHECK(hipGetLastError()); + int* cu_end_flag; int h_end_flag; + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); + do { + h_end_flag = 1; + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); + + // Hook + hipLaunchKernelGGL(( hook_edges_kernel), dim3((2*M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_adj_pairs, + 2 * M, + cu_vertex_ids, + cu_end_flag + ); + CUDA_CHECK(hipGetLastError()); + + // Compress + hipLaunchKernelGGL(( compress_components_kernel), dim3((3*F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_ids, + 3 * F + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); + } while (h_end_flag == 0); + CUDA_CHECK(hipFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_vertex_adj_pairs)); + + // Construct new faces + int* cu_new_vertices_ids; + CUDA_CHECK(hipMalloc(&cu_new_vertices_ids, 3 * F * sizeof(int))); + int new_V = compress_ids(cu_vertex_ids, 3 * F, this->cub_temp_storage, cu_new_vertices_ids); + float3* cu_new_vertices; + CUDA_CHECK(hipMalloc(&cu_new_vertices, new_V * sizeof(float3))); + hipLaunchKernelGGL(( index_vertice_kernel), dim3((new_V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_vertices_ids, + this->faces.ptr, + this->vertices.ptr, + new_V, + cu_new_vertices + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices_ids)); + this->vertices.resize(new_V); + CUDA_CHECK(hipMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_vertices)); + this->faces.resize(F); + hipLaunchKernelGGL(( copy_T_to_T3_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_vertex_ids, F, this->faces.ptr); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_ids)); + + // Delete all cached info since mesh has changed + this->clear_cache(); +} + + +/** + * Mark faces to remove for non-manifold edges + * For each non-manifold edge (shared by >2 faces), only keep the first 2 faces + * + * @param edge2face: edge to face adjacency + * @param edge2face_offset: edge to face adjacency offset + * @param edge2face_cnt: number of faces per edge + * @param E: number of edges + * @param face_keep_mask: output mask (1 = keep, 0 = remove) + */ +static 
__global__ void mark_non_manifold_faces_kernel( + const int* edge2face, + const int* edge2face_offset, + const int* edge2face_cnt, + const size_t E, + uint8_t* face_keep_mask +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // Only process non-manifold edges (cnt > 2) + int cnt = edge2face_cnt[tid]; + if (cnt <= 2) return; + + // Mark faces beyond the first 2 for removal + int start = edge2face_offset[tid]; + for (int i = 2; i < cnt; i++) { + int face_idx = edge2face[start + i]; + face_keep_mask[face_idx] = 0; + } +} + + +void CuMesh::remove_non_manifold_faces() { + // Get edge-face adjacency information + if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { + this->get_edge_face_adjacency(); + } + + size_t F = this->faces.size; + size_t E = this->edges.size; + + if (F == 0 || E == 0) return; + + // Initialize face mask (1 = keep all faces initially) + uint8_t* cu_face_keep_mask; + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); + + // Mark faces on non-manifold edges for removal + hipLaunchKernelGGL(( mark_non_manifold_faces_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edge2face.ptr, + this->edge2face_offset.ptr, + this->edge2face_cnt.ptr, + E, + cu_face_keep_mask + ); + CUDA_CHECK(hipGetLastError()); + + // Remove marked faces + this->_remove_faces(cu_face_keep_mask); + CUDA_CHECK(hipFree(cu_face_keep_mask)); + + // Clear cache since mesh has changed + this->clear_cache(); +} + + +struct GreaterThanOrEqualToOp { + __device__ __forceinline__ bool operator()(const float& a, const float& b) const { + return a >= b; + } +}; + + +void CuMesh::remove_small_connected_components(float min_area) { + if (this->conn_comp_ids.is_empty()) { + this->get_connected_components(); + } + if (this->face_areas.is_empty()) { + this->compute_face_areas(); + } + size_t F = this->faces.size; + if (F == 0) return; + + // 1. Sort face areas based on their connected component ID. + // This groups all faces of the same component together. + size_t temp_storage_bytes = 0; + int *cu_sorted_conn_comp_ids; + float *cu_sorted_face_areas; + CUDA_CHECK(hipMalloc(&cu_sorted_conn_comp_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, + this->face_areas.ptr, cu_sorted_face_areas, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, + this->face_areas.ptr, cu_sorted_face_areas, + F + )); + + // 2. Find unique components and get the number of faces in each. + int* cu_conn_comp_num_faces; + int* cu_num_conn_comps; + int* cu_unique_conn_comp_ids; // Not needed, but we need to pass a valid pointer. 
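+    // The Encode calls below follow the two-phase temp-storage idiom used for
+    // every hipcub/cub primitive in this codebase: a first call with a null
+    // temp-storage pointer only reports the required scratch size, and a second,
+    // otherwise identical call does the real work. Minimal sketch of the pattern
+    // (placeholder names, not symbols from this file):
+    //     size_t bytes = 0;
+    //     hipcub::DeviceScan::ExclusiveSum(nullptr, bytes, d_in, d_out, n);      // size query only
+    //     scratch.resize(bytes);
+    //     hipcub::DeviceScan::ExclusiveSum(scratch.ptr, bytes, d_in, d_out, n);  // actual scan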
+ CUDA_CHECK(hipMalloc(&cu_conn_comp_num_faces, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_conn_comps, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_conn_comp_ids, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, + cu_conn_comp_num_faces, cu_num_conn_comps, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, + cu_conn_comp_num_faces, cu_num_conn_comps, + F + )); + int num_conn_comps; + CUDA_CHECK(hipMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_conn_comps)); + CUDA_CHECK(hipFree(cu_sorted_conn_comp_ids)); + CUDA_CHECK(hipFree(cu_unique_conn_comp_ids)); + + // 3. Compute the total area for each connected component via segmented reduction. + int* cu_conn_comp_offsets; + CUDA_CHECK(hipMalloc(&cu_conn_comp_offsets, (num_conn_comps + 1) * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_conn_comp_num_faces, cu_conn_comp_offsets, + num_conn_comps + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_conn_comp_num_faces, cu_conn_comp_offsets, + num_conn_comps + 1 + )); + CUDA_CHECK(hipFree(cu_conn_comp_num_faces)); + + float *cu_conn_comp_areas; + CUDA_CHECK(hipMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_sorted_face_areas, cu_conn_comp_areas, + num_conn_comps, + cu_conn_comp_offsets, + cu_conn_comp_offsets + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_face_areas, cu_conn_comp_areas, + num_conn_comps, + cu_conn_comp_offsets, + cu_conn_comp_offsets + 1 + )); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); + CUDA_CHECK(hipFree(cu_conn_comp_offsets)); + + // 4. Create a "keep" mask for components with area >= min_area. + uint8_t* cu_comp_keep_mask; + CUDA_CHECK(hipMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); + hipLaunchKernelGGL(( compare_kernel), dim3((num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_conn_comp_areas, + min_area, + num_conn_comps, + GreaterThanOrEqualToOp(), + cu_comp_keep_mask + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_conn_comp_areas)); + + // 5. Propagate the component "keep" mask to every face. + uint8_t* cu_face_keep_mask; + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + // Use an index_kernel (gather operation) + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_comp_keep_mask, // Source array + this->conn_comp_ids.ptr, // Indices to gather from + F, + cu_face_keep_mask // Destination array + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_comp_keep_mask)); + + // 6. Select the faces to keep and update the mesh. 
+ this->_remove_faces(cu_face_keep_mask); + CUDA_CHECK(hipFree(cu_face_keep_mask)); +} + + +static __global__ void hook_edges_with_orientation_kernel( + const int2* adj, + const uint8_t* flipped, + const int M, + int* conn_comp_ids, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get adjacent faces + int f0 = adj[tid].x; + int f1 = adj[tid].y; + uint8_t is_flipped = flipped[tid]; + + // union + // find roots + int root0 = conn_comp_ids[f0] >> 1; + int flip0 = conn_comp_ids[f0] & 1; + while (root0 != (conn_comp_ids[root0] >> 1)) { + flip0 ^= conn_comp_ids[root0] & 1; + root0 = conn_comp_ids[root0] >> 1; + } + int root1 = conn_comp_ids[f1] >> 1; + int flip1 = conn_comp_ids[f1] & 1; + while (root1 != (conn_comp_ids[root1] >> 1)) { + flip1 ^= conn_comp_ids[root1] & 1; + root1 = conn_comp_ids[root1] >> 1; + } + + if (root0 == root1) return; + + int high = max(root0, root1); + int low = min(root0, root1); + atomicMin(&conn_comp_ids[high], (low << 1) | (is_flipped ^ flip0 ^ flip1)); + *end_flag = 0; +} + + +static __global__ void compress_components_with_orientation_kernel( + int* conn_comp_ids, + const int F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int p = conn_comp_ids[tid] >> 1; + int f = conn_comp_ids[tid] & 1; + while (p != (conn_comp_ids[p] >> 1)) { + f ^= conn_comp_ids[p] & 1; + p = conn_comp_ids[p] >> 1; + } + conn_comp_ids[tid] = (p << 1) | f; +} + + +static __global__ void get_flip_flags_kernel( + const int2* manifold_face_adj, + const int3* faces, + const int M, + uint8_t* flipped +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + const int2 adj_faces = manifold_face_adj[tid]; + const int3 face1 = faces[adj_faces.x]; + const int3 face2 = faces[adj_faces.y]; + + const int v1[3] = {face1.x, face1.y, face1.z}; + + int shared_local_indices1[2]; + int shared_local_indices2[2]; + int found_count = 0; + + for (int i = 0; i < 3; ++i) { + if (v1[i] == face2.x) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 0; + found_count++; + } else if (v1[i] == face2.y) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 1; + found_count++; + } else if (v1[i] == face2.z) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 2; + found_count++; + } + if (found_count == 2) { + break; + } + } + + int direction1 = (shared_local_indices1[1] - shared_local_indices1[0] + 3) % 3; + int direction2 = (shared_local_indices2[1] - shared_local_indices2[0] + 3) % 3; + flipped[tid] = (direction1 == direction2) ? 1 : 0; +} + + +static __global__ void inplace_flip_faces_with_flags_kernel( + int3* faces, + const int* conn_comp_with_flip, + const int F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int is_flipped = conn_comp_with_flip[tid] & 1; + if (is_flipped) { + int3 face = faces[tid]; + faces[tid] = make_int3(face.x, face.z, face.y); + } +} + + +void CuMesh::unify_face_orientations() { + if (this->manifold_face_adj.is_empty()) { + this->get_manifold_face_adjacency(); + } + + // 1. Compute the flipped flag for each edge. 
+    uint8_t* cu_flipped;
+    CUDA_CHECK(hipMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t)));
+    hipLaunchKernelGGL(( get_flip_flags_kernel), dim3((this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+        this->manifold_face_adj.ptr,
+        this->faces.ptr,
+        this->manifold_face_adj.size,
+        cu_flipped
+    );
+    CUDA_CHECK(hipGetLastError());
+
+    // 2. Hook edges with flipped flag.
+    int* conn_comp_with_flip;
+    CUDA_CHECK(hipMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int)));
+    // Seed ids with 2*i: face i starts as its own root (upper bits) with flip bit 0 (LSB).
+    hipLaunchKernelGGL(( arange_kernel), dim3((this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, conn_comp_with_flip, this->faces.size, 2);
+    CUDA_CHECK(hipGetLastError());
+    int* cu_end_flag; int h_end_flag;
+    CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int)));
+    do {
+        h_end_flag = 1;
+        CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice));
+
+        // Hook
+        hipLaunchKernelGGL(( hook_edges_with_orientation_kernel), dim3((this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+            this->manifold_face_adj.ptr,
+            cu_flipped,
+            this->manifold_face_adj.size,
+            conn_comp_with_flip,
+            cu_end_flag
+        );
+        CUDA_CHECK(hipGetLastError());
+
+        // Compress
+        hipLaunchKernelGGL(( compress_components_with_orientation_kernel), dim3((this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+            conn_comp_with_flip,
+            this->faces.size
+        );
+        CUDA_CHECK(hipGetLastError());
+        CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost));
+    } while (h_end_flag == 0);
+    CUDA_CHECK(hipFree(cu_end_flag));
+
+    // 3. Flip the orientation of the faces.
+    hipLaunchKernelGGL(( inplace_flip_faces_with_flags_kernel), dim3((this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+        this->faces.ptr,
+        conn_comp_with_flip,
+        this->faces.size
+    );
+    CUDA_CHECK(hipGetLastError());
+    CUDA_CHECK(hipFree(cu_flipped));
+    CUDA_CHECK(hipFree(conn_comp_with_flip));
+}
+
+
+} // namespace cumesh
\ No newline at end of file
diff --git a/src/connectivity.cu b/src/connectivity.cu
index 6e2f5fe..f634882 100644
--- a/src/connectivity.cu
+++ b/src/connectivity.cu
@@ -1,7 +1,11 @@
 #include "cumesh.h"
 #include "shared.h"
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 
 namespace cumesh {
 
@@ -64,18 +68,18 @@ void CuMesh::get_vertex_face_adjacency() {
     this->vert2face_cnt.resize(V + 1);
     this->vert2face_cnt.zero();
     get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->vert2face_cnt.ptr);
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 
     // allocate memory for neighboring face ids
     this->vert2face_offset.resize(V + 1);
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
         V + 1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
         V + 1
@@ -89,7 +93,7 @@ void CuMesh::get_vertex_face_adjacency() {
         this->vert2face_offset.ptr,
         this->vert2face_cnt.ptr
     );
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 }
 
 
@@ -122,19 +126,19 @@ void CuMesh::get_edges() {
     size_t F = this->faces.size;
     this->edges.resize(F * 3);
     expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->edges.ptr);
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 
     // sort edges
     this->temp_storage.resize(F * 3 * sizeof(uint64_t));
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
         nullptr, temp_storage_bytes,
         this->edges.ptr,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
         F * 3
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->edges.ptr,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
@@ -143,22 +147,22 @@ void CuMesh::get_edges() {
 
     // unique edges
     int* num_edges;
-    CUDA_CHECK(cudaMalloc(&num_edges, sizeof(int)));
+    CUDA_CHECK(hipMalloc(&num_edges, sizeof(int)));
     this->edge2face_cnt.resize(F * 3);
-    CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
         nullptr, temp_storage_bytes,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
         F * 3
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
         F * 3
     ));
-    CUDA_CHECK(cudaMemcpy(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(hipMemcpy(&this->edges.size, num_edges, sizeof(int), hipMemcpyDeviceToHost));
     this->edge2face_cnt.size = this->edges.size;
-    CUDA_CHECK(cudaFree(num_edges));
+    CUDA_CHECK(hipFree(num_edges));
 }
 
 
@@ -229,13 +233,13 @@ void CuMesh::get_edge_face_adjacency() {
     // allocate memory for edge2face_offset
     this->edge2face_offset.resize(E + 1);
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->edge2face_cnt.ptr, this->edge2face_offset.ptr,
         E + 1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->edge2face_cnt.ptr, this->edge2face_offset.ptr,
         E + 1
@@ -243,7 +247,7 @@ void CuMesh::get_edge_face_adjacency() {
 
     // allocate memory for edge2face
     int total_edge_face_cnt;
-    CUDA_CHECK(cudaMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(hipMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), hipMemcpyDeviceToHost));
     this->edge2face.resize(total_edge_face_cnt);
 
     // allocate memory for face2edge
@@ -261,7 +265,7 @@ void CuMesh::get_edge_face_adjacency() {
         this->edge2face.ptr,
         this->face2edge.ptr
     );
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 }
 
 
@@ -334,18 +338,18 @@ void CuMesh::get_vertex_edge_adjacency() {
     get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
         this->edges.ptr, E, this->vert2edge_cnt.ptr
     );
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 
     // allocate memory for vert2edge_offset
     this->vert2edge_offset.resize(V + 1);
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr,
         V + 1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+
CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, V + 1 @@ -360,7 +364,7 @@ void CuMesh::get_vertex_edge_adjacency() { this->vert2edge_offset.ptr, this->vert2edge_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -416,26 +420,26 @@ void CuMesh::get_boundary_info() { // Select boundary edges size_t temp_storage_bytes = 0; int *cu_num_boundary, *cu_edge_idx; - CUDA_CHECK(cudaMalloc(&cu_num_boundary, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_boundary, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); this->boundaries.resize(E); arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, is_boundary_edge{this->edge2face_cnt.ptr} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, is_boundary_edge{this->edge2face_cnt.ptr} )); - CUDA_CHECK(cudaMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_boundary)); - CUDA_CHECK(cudaFree(cu_edge_idx)); + CUDA_CHECK(hipMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_boundary)); + CUDA_CHECK(hipFree(cu_edge_idx)); // Set vertex boundary indicator this->vert_is_boundary.resize(this->vertices.size); @@ -445,7 +449,7 @@ void CuMesh::get_boundary_info() { this->edges.ptr, this->boundaries.ptr, this->edge2face_cnt.ptr, this->boundaries.size, this->vert_is_boundary.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } } @@ -531,18 +535,18 @@ void CuMesh::get_vertex_boundary_adjacency() { get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->edges.ptr, this->boundaries.ptr, B, this->vert2bound_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // allocate memory for vert2bound_offset this->vert2bound_offset.resize(V + 1); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, V + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, V + 1 @@ -557,7 +561,7 @@ void CuMesh::get_vertex_boundary_adjacency() { this->vert2bound_offset.ptr, this->vert2bound_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -613,7 +617,7 @@ void CuMesh::get_vertex_is_manifold() { V, this->vert_is_manifold.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -668,28 +672,28 @@ void CuMesh::get_manifold_face_adjacency() { // Select manifold edges size_t temp_storage_bytes = 0; int *cu_num_manifold_edges, *cu_edge_idx, *cu_manifold_edge_idx; - CUDA_CHECK(cudaMalloc(&cu_num_manifold_edges, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_manifold_edge_idx, E * sizeof(int))); + 
CUDA_CHECK(hipMalloc(&cu_num_manifold_edges, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_edge_idx, E * sizeof(int))); arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, E, is_manifold_edge{this->edge2face_cnt.ptr} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, E, is_manifold_edge{this->edge2face_cnt.ptr} )); int manifold_edge_count; - CUDA_CHECK(cudaMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_manifold_edges)); - CUDA_CHECK(cudaFree(cu_edge_idx)); + CUDA_CHECK(hipMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_edges)); + CUDA_CHECK(hipFree(cu_edge_idx)); // set manifold_face_adj this->manifold_face_adj.resize(manifold_edge_count); @@ -700,8 +704,8 @@ void CuMesh::get_manifold_face_adjacency() { manifold_edge_count, this->manifold_face_adj.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_manifold_edge_idx)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_manifold_edge_idx)); } @@ -748,32 +752,32 @@ void CuMesh::get_manifold_boundary_adjacency() { // Select manifold boundary vertices size_t temp_storage_bytes = 0; int *cu_num_manifold_boundary_verts, *cu_vert_idx, *cu_manifold_vert_idx; - CUDA_CHECK(cudaMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_vert_idx, V * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_manifold_vert_idx, V * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vert_idx, V * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_vert_idx, V * sizeof(int))); arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vert_idx, V); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} )); int manifold_boundary_vert_count; - CUDA_CHECK(cudaMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_manifold_boundary_verts)); - CUDA_CHECK(cudaFree(cu_vert_idx)); + CUDA_CHECK(hipMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_boundary_verts)); + CUDA_CHECK(hipFree(cu_vert_idx)); // Early return if no manifold boundary vertices if (manifold_boundary_vert_count == 0) { - CUDA_CHECK(cudaFree(cu_manifold_vert_idx)); + 
CUDA_CHECK(hipFree(cu_manifold_vert_idx)); return; } @@ -786,7 +790,7 @@ void CuMesh::get_manifold_boundary_adjacency() { manifold_boundary_vert_count, this->manifold_bound_adj.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -801,12 +805,12 @@ void CuMesh::get_connected_components() { // Iterative Hook and Compress this->conn_comp_ids.resize(F); arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->conn_comp_ids.ptr, F); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -815,17 +819,17 @@ void CuMesh::get_connected_components() { this->conn_comp_ids.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->conn_comp_ids.ptr, F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // Compresses boundary components this->num_conn_comps = compress_ids(this->conn_comp_ids.ptr, F, this->cub_temp_storage); @@ -848,12 +852,12 @@ void CuMesh::get_boundary_connected_components() { // Iterative Hook and Compress this->bound_conn_comp_ids.resize(B); arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->bound_conn_comp_ids.ptr, B); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -862,17 +866,17 @@ void CuMesh::get_boundary_connected_components() { this->bound_conn_comp_ids.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->bound_conn_comp_ids.ptr, B ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // Compresses boundary components this->num_bound_conn_comps = compress_ids(this->bound_conn_comp_ids.ptr, B, this->cub_temp_storage); @@ -940,13 +944,13 @@ void CuMesh::get_boundary_loops() { // Check if boundary components are loops int* cu_is_bound_conn_comp_loop; - CUDA_CHECK(cudaMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); 
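+    // Optimistically flag every boundary component as a loop (1); the kernel
+    // below is expected to clear the flag for components that are not simple loops.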
fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_is_bound_conn_comp_loop, this->num_bound_conn_comps, 1 ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->edges.ptr, this->boundaries.ptr, @@ -956,43 +960,43 @@ void CuMesh::get_boundary_loops() { B, cu_is_bound_conn_comp_loop ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_num_bound_loops; - CUDA_CHECK(cudaMalloc(&cu_num_bound_loops, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_bound_loops, sizeof(int))); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceReduce::Sum( + CUDA_CHECK(hipcub::DeviceReduce::Sum( nullptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, this->num_bound_conn_comps )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceReduce::Sum( + CUDA_CHECK(hipcub::DeviceReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, this->num_bound_conn_comps )); - CUDA_CHECK(cudaMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_bound_loops)); + CUDA_CHECK(hipMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_bound_loops)); if (this->num_bound_loops == 0) { - CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); return; } // Sort boundaries by connected component ids int *cu_bound_sorted, *cu_bound_conn_comp_ids_sorted; - CUDA_CHECK(cudaMalloc(&cu_bound_sorted, B * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_bound_conn_comp_ids_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_conn_comp_ids_sorted, B * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, B )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, @@ -1001,84 +1005,84 @@ void CuMesh::get_boundary_loops() { // Select loops int* cu_bound_is_on_loop; - CUDA_CHECK(cudaMalloc(&cu_bound_is_on_loop, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_is_on_loop, B * sizeof(int))); index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_is_bound_conn_comp_loop, cu_bound_conn_comp_ids_sorted, B, cu_bound_is_on_loop ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); this->loop_boundaries.resize(B); int *cu_loop_bound_conn_comp_ids_sorted, *cu_num_bound_on_loop; - CUDA_CHECK(cudaMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_bound_on_loop, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_bound_on_loop, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_sorted, 
cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, B )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, B )); int num_bound_on_loop; - CUDA_CHECK(cudaMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_bound_sorted)); + CUDA_CHECK(hipMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_sorted)); this->loop_boundaries.resize(num_bound_on_loop); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, B )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, B )); - CUDA_CHECK(cudaFree(cu_bound_conn_comp_ids_sorted)); - CUDA_CHECK(cudaFree(cu_bound_is_on_loop)); - CUDA_CHECK(cudaFree(cu_num_bound_on_loop)); + CUDA_CHECK(hipFree(cu_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_bound_is_on_loop)); + CUDA_CHECK(hipFree(cu_num_bound_on_loop)); // RLE this->loop_boundaries_offset.resize(this->num_bound_loops + 1); this->loop_boundaries_offset.zero(); int* cu_rle_unique_out, *cu_rle_num_runs; - CUDA_CHECK(cudaMalloc(&cu_rle_unique_out, this->num_bound_loops * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_rle_num_runs, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_rle_unique_out, this->num_bound_loops * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_rle_num_runs, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, num_bound_on_loop )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, num_bound_on_loop )); - CUDA_CHECK(cudaFree(cu_loop_bound_conn_comp_ids_sorted)); - CUDA_CHECK(cudaFree(cu_rle_unique_out)); - CUDA_CHECK(cudaFree(cu_rle_num_runs)); + CUDA_CHECK(hipFree(cu_loop_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_rle_unique_out)); + CUDA_CHECK(hipFree(cu_rle_num_runs)); // Scan loop boundaries offset temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 diff --git a/src/connectivity.hip b/src/connectivity.hip new file mode 100644 index 0000000..d5d878e --- /dev/null +++ b/src/connectivity.hip @@ -0,0 +1,1095 @@ +// !!! 
This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+#include "cumesh_hip.h"
+#include "shared_hip.h"
+
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
+#include <cub/cub.cuh>
+#endif
+
+
+namespace cumesh {
+
+/**
+ * Get count of neighboring faces for each vertex
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param F: the number of faces
+ * @param neighbor_face_cnt: the buffer for neighbor face count, shape (V+1)
+ */
+static __global__ void get_neighbor_face_cnt_kernel(
+    const int3* faces,
+    const int F,
+    int* neighbor_face_cnt
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+
+    int3 f = faces[tid];
+
+    atomicAdd(&neighbor_face_cnt[f.x], 1);
+    atomicAdd(&neighbor_face_cnt[f.y], 1);
+    atomicAdd(&neighbor_face_cnt[f.z], 1);
+}
+
+
+/**
+ * Fill the neighboring face ids for each vertex
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param F: the number of faces
+ * @param neighbor_face_ids: the buffer for neighbor face ids, shape (total_neighbor_face_cnt)
+ * @param neighbor_face_ids_offset: the buffer for neighbor face ids offset, shape (V+1)
+ * @param neighbor_face_cnt: the buffer for neighbor face count, shape (V+1)
+ */
+static __global__ void fill_neighbor_face_ids_kernel(
+    const int3* faces,
+    const int F,
+    int* neighbor_face_ids,
+    int* neighbor_face_ids_offset,
+    int* neighbor_face_cnt
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+
+    int3 f = faces[tid];
+
+    neighbor_face_ids[neighbor_face_ids_offset[f.x] + atomicAdd(&neighbor_face_cnt[f.x], 1)] = tid;
+    neighbor_face_ids[neighbor_face_ids_offset[f.y] + atomicAdd(&neighbor_face_cnt[f.y], 1)] = tid;
+    neighbor_face_ids[neighbor_face_ids_offset[f.z] + atomicAdd(&neighbor_face_cnt[f.z], 1)] = tid;
+}
+
+
+void CuMesh::get_vertex_face_adjacency() {
+    size_t F = this->faces.size;
+    size_t V = this->vertices.size;
+
+    // get neighboring face count for each vertex
+    this->vert2face_cnt.resize(V + 1);
+    this->vert2face_cnt.zero();
+    hipLaunchKernelGGL(( get_neighbor_face_cnt_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->faces.ptr, F, this->vert2face_cnt.ptr);
+    CUDA_CHECK(hipGetLastError());
+
+    // allocate memory for neighboring face ids
+    this->vert2face_offset.resize(V + 1);
+    size_t temp_storage_bytes = 0;
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        nullptr, temp_storage_bytes,
+        this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
+        V + 1
+    ));
+    this->cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        this->cub_temp_storage.ptr, temp_storage_bytes,
+        this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
+        V + 1
+    ));
+    this->vert2face.resize(F*3);
+
+    // fill neighboring face ids for each vertex
+    this->vert2face_cnt.zero();
+    hipLaunchKernelGGL(( fill_neighbor_face_ids_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->faces.ptr, F,
+        this->vert2face.ptr,
+        this->vert2face_offset.ptr,
+        this->vert2face_cnt.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+}
+
+
+/**
+ * Expand edges for each triangle face
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param F: the number of faces
+ * @param edges: the buffer for edges, shape (F*3)
+ */
+static __global__ void expand_edges_kernel(
+    const int3* faces,
+    const int F,
+    uint64_t *edges
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+
+    int base = tid * 3;
+    int3 f = faces[tid];
+
+    // expand edges
+    edges[base + 0] = ((uint64_t)min(f.x, f.y) << 32) | max(f.x, f.y);
+    edges[base + 1] = ((uint64_t)min(f.y, f.z) << 32) | max(f.y, f.z);
+    edges[base + 2] = ((uint64_t)min(f.z, f.x) << 32) | max(f.z, f.x);
+}
+
+
+void CuMesh::get_edges() {
+    size_t F = this->faces.size;
+    this->edges.resize(F * 3);
+    hipLaunchKernelGGL(( expand_edges_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->faces.ptr, F, this->edges.ptr);
+    CUDA_CHECK(hipGetLastError());
+
+    // sort edges
+    this->temp_storage.resize(F * 3 * sizeof(uint64_t));
+    size_t temp_storage_bytes = 0;
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
+        nullptr, temp_storage_bytes,
+        this->edges.ptr,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
+        F * 3
+    ));
+    this->cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
+        this->cub_temp_storage.ptr, temp_storage_bytes,
+        this->edges.ptr,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
+        F * 3
+    ));
+
+    // unique edges
+    int* num_edges;
+    CUDA_CHECK(hipMalloc(&num_edges, sizeof(int)));
+    this->edge2face_cnt.resize(F * 3);
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
+        nullptr, temp_storage_bytes,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
+        F * 3
+    ));
+    this->cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
+        this->cub_temp_storage.ptr, temp_storage_bytes,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
+        F * 3
+    ));
+    CUDA_CHECK(hipMemcpy(&this->edges.size, num_edges, sizeof(int), hipMemcpyDeviceToHost));
+    this->edge2face_cnt.size = this->edges.size;
+    CUDA_CHECK(hipFree(num_edges));
+}
+
+
+/**
+ * Get edge-face adjacency
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param edges: the buffer for edges, shape (E)
+ * @param edge2face_cnt: the buffer for edge duplication number, shape (E)
+ * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
+ * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
+ * @param edge2face_offset: the buffer for edge to face adjacency offset, shape (E+1)
+ * @param E: the number of edges
+ * @param edge2face: the buffer for edge to face adjacency, shape (total_edge_face_cnt)
+ * @param face2edge: the buffer for face to edge adjacency, shape (F*3)
+ */
+static __global__ void get_edge_face_adjacency_kernel(
+    const int3* faces,
+    const uint64_t* edges,
+    const int* edge2face_cnt,
+    const int* vert2face,
+    const int* vert2face_offset,
+    const int* edge2face_offset,
+    const int E,
+    int* edge2face,
+    int3* face2edge
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= E) return;
+
+    // get edge
+    uint64_t e = edges[tid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+
+    // assign connectivity
+    int ptr = edge2face_offset[tid];
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        int fid = vert2face[f];
+        int3 f_vids = faces[fid];
+        if (f_vids.x == e1 || f_vids.y == e1 || f_vids.z == e1) {
+            // this face contains the edge
+            edge2face[ptr] = fid;
+            ptr++;
+            // fill face2edge
+            if (f_vids.x == e0 && f_vids.y == e1 || f_vids.x == e1 && f_vids.y == e0) {
+                face2edge[fid].x = tid;
+            } else if (f_vids.y == e0 && f_vids.z == e1 || f_vids.y == e1 && f_vids.z == e0) {
+                face2edge[fid].y = tid;
+            } else if (f_vids.z == e0 && f_vids.x == e1 || f_vids.z == e1 && f_vids.x == e0) {
+                face2edge[fid].z = tid;
+            }
+        }
+    }
+}
+
+
+void
CuMesh::get_edge_face_adjacency() { + if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { + this->get_edges(); + } + if (this->vert2face.is_empty() || this->vert2face_offset.is_empty()) { + this->get_vertex_face_adjacency(); + } + size_t F = this->faces.size; + size_t E = this->edges.size; + + // allocate memory for edge2face_offset + this->edge2face_offset.resize(E + 1); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->edge2face_cnt.ptr, this->edge2face_offset.ptr, + E + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->edge2face_cnt.ptr, this->edge2face_offset.ptr, + E + 1 + )); + + // allocate memory for edge2face + int total_edge_face_cnt; + CUDA_CHECK(hipMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), hipMemcpyDeviceToHost)); + this->edge2face.resize(total_edge_face_cnt); + + // allocate memory for face2edge + this->face2edge.resize(F); + + // get edge-face adjacency + hipLaunchKernelGGL(( get_edge_face_adjacency_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->faces.ptr, + this->edges.ptr, + this->edge2face_cnt.ptr, + this->vert2face.ptr, + this->vert2face_offset.ptr, + this->edge2face_offset.ptr, + E, + this->edge2face.ptr, + this->face2edge.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +/** + * Get vertex adjacent edge number + * + * @param edges: the buffer for edges, shape (E) + * @param E: the number of edges + * @param vert2edge_cnt: the buffer for vertex adjacent edge number, shape (V) + */ +static __global__ void get_vertex_edge_cnt_kernel( + const uint64_t* edges, + const int E, + int* vert2edge_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t e = edges[tid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // count vertex adjacent edge number + atomicAdd(&vert2edge_cnt[e0], 1); + atomicAdd(&vert2edge_cnt[e1], 1); +} + + +/** + * Get vertex-edge adjacency + * + * @param edges: the buffer for edges, shape (E) + * @param E: the number of edges + * @param vert2edge: the buffer for vertex to edge adjacency, shape (total_vertex_edge_cnt) + * @param vert2edge_offset: the buffer for vertex to edge adjacency offset, shape (V+1) + * @param vert2edge_cnt: the buffer for vertex adjacent edge number, shape (V) + */ +static __global__ void get_vertex_edge_adjacency_kernel( + const uint64_t* edges, + const int E, + int* vert2edge, + int* vert2edge_offset, + int* vert2edge_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t e = edges[tid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // assign connectivity + vert2edge[vert2edge_offset[e0] + atomicAdd(&vert2edge_cnt[e0], 1)] = tid; + vert2edge[vert2edge_offset[e1] + atomicAdd(&vert2edge_cnt[e1], 1)] = tid; +} + + +void CuMesh::get_vertex_edge_adjacency() { + if (this->edges.is_empty()) { + this->get_edges(); + } + size_t E = this->edges.size; + size_t V = this->vertices.size; + + // get vertex adjacent edge number + this->vert2edge_cnt.resize(V + 1); + this->vert2edge_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_edge_cnt_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, E, this->vert2edge_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); + + // allocate memory for vert2edge_offset + 
this->vert2edge_offset.resize(V + 1); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, + V + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, + V + 1 + )); + + // get vertex-edge adjacency + this->vert2edge.resize(2 * E); + this->vert2edge_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_edge_adjacency_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, E, + this->vert2edge.ptr, + this->vert2edge_offset.ptr, + this->vert2edge_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +/** + * Set vertex boundary indicator + * + * @param edges: the buffer for edges, shape (E) + * @param boundaries: the buffer for boundary edges, shape (B) + * @param edge2face_cnt: the buffer for edge duplication number, shape (E) + * @param B: the number of boundary edges + * @param vert_is_boundary: the buffer for boundary vertex indicator, shape (V) + */ +static __global__ void set_boundary_vertex_kernel( + const uint64_t* edges, + const int* boundaries, + const int* edge2face_cnt, + const int B, + uint8_t* vert_is_boundary +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= B) return; + + int eid = boundaries[tid]; + + if (edge2face_cnt[eid] == 1) { + // get edge + uint64_t e = edges[eid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // set boundary vertex + vert_is_boundary[e0] = 1; + vert_is_boundary[e1] = 1; + } +} + + +struct is_boundary_edge { + const int* edge2face_cnt; + __host__ __device__ + bool operator()(const int& idx) const { + return edge2face_cnt[idx] == 1; + } +}; + + +void CuMesh::get_boundary_info() { + if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { + this->get_edges(); + } + size_t E = this->edges.size; + + // Select boundary edges + size_t temp_storage_bytes = 0; + int *cu_num_boundary, *cu_edge_idx; + CUDA_CHECK(hipMalloc(&cu_num_boundary, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); + this->boundaries.resize(E); + hipLaunchKernelGGL(( arange_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_edge_idx, E); + CUDA_CHECK(hipcub::DeviceSelect::If( + nullptr, temp_storage_bytes, + cu_edge_idx, this->boundaries.ptr, cu_num_boundary, + E, + is_boundary_edge{this->edge2face_cnt.ptr} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::If( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_edge_idx, this->boundaries.ptr, cu_num_boundary, + E, + is_boundary_edge{this->edge2face_cnt.ptr} + )); + CUDA_CHECK(hipMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_boundary)); + CUDA_CHECK(hipFree(cu_edge_idx)); + + // Set vertex boundary indicator + this->vert_is_boundary.resize(this->vertices.size); + this->vert_is_boundary.zero(); + if (this->boundaries.size > 0) { + hipLaunchKernelGGL(( set_boundary_vertex_kernel), dim3((this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, this->boundaries.ptr, this->edge2face_cnt.ptr, + this->boundaries.size, this->vert_is_boundary.ptr + ); + CUDA_CHECK(hipGetLastError()); + } +} + + +static __global__ void get_vertex_boundary_cnt_kernel( + const uint64_t* edges, + const int* boundaries, + const int B, + int* 
vert2bound_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= B) return; + + int eid = boundaries[tid]; + + // get edge + uint64_t e = edges[eid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // count vertex adjacent boundary number + atomicAdd(&vert2bound_cnt[e0], 1); + atomicAdd(&vert2bound_cnt[e1], 1); +} + + +/** + * Get vertex-boundary adjacency + * + * @param edges: the buffer for edges, shape (E) + * @param boundaries: the buffer for boundary edges, shape (B) + * @param B: the number of boundary edges + * @param vert2bound: the buffer for vertex to boundary adjacency, shape (total_vertex_boundary_cnt) + * @param vert2bound_offset: the buffer for vertex to boundary adjacency offset, shape (V+1) + * @param vert2bound_cnt: the buffer for vertex adjacent boundary number, shape (V) + */ +static __global__ void get_vertex_boundary_adjacency_kernel( + const uint64_t* edges, + const int* boundaries, + const int B, + int* vert2bound, + int* vert2bound_offset, + int* vert2bound_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= B) return; + + int eid = boundaries[tid]; + + // get edge + uint64_t e = edges[eid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // assign connectivity + vert2bound[vert2bound_offset[e0] + atomicAdd(&vert2bound_cnt[e0], 1)] = tid; + vert2bound[vert2bound_offset[e1] + atomicAdd(&vert2bound_cnt[e1], 1)] = tid; +} + + +void CuMesh::get_vertex_boundary_adjacency() { + if (this->edges.is_empty()) { + this->get_edges(); + } + if (this->boundaries.is_empty()) { + this->get_boundary_info(); + } + size_t V = this->vertices.size; + size_t B = this->boundaries.size; + + // Early return if no boundaries + if (B == 0) { + this->vert2bound_cnt.resize(V + 1); + this->vert2bound_cnt.zero(); + this->vert2bound_offset.resize(V + 1); + this->vert2bound_offset.zero(); + return; + } + + // get vertex adjacent boundary number + this->vert2bound_cnt.resize(V + 1); + this->vert2bound_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_boundary_cnt_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, this->boundaries.ptr, B, this->vert2bound_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); + + // allocate memory for vert2bound_offset + this->vert2bound_offset.resize(V + 1); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, + V + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, + V + 1 + )); + + // get vertex-boundary adjacency + this->vert2bound.resize(2 * B); + this->vert2bound_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_boundary_adjacency_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, this->boundaries.ptr, B, + this->vert2bound.ptr, + this->vert2bound_offset.ptr, + this->vert2bound_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +static __global__ void get_vertex_is_manifold_kernel( + const int* vert2edge, + const int* vert2edge_offset, + const int* edge2face_cnt, + const int V, + uint8_t* vert_is_manifold +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + + // traverse all edges of the vertex + int num_boundaries = 0; + bool is_manifold = true; + for (int i = vert2edge_offset[tid]; i < vert2edge_offset[tid+1]; i++) { 
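+        // A vertex is treated as manifold only if it touches at most two
+        // boundary edges and no edge shared by more than two faces.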
+ int eid = vert2edge[i]; + // boundary edge + if (edge2face_cnt[eid] == 1) { + num_boundaries++; + if (num_boundaries > 2) { + is_manifold = false; + break; + } + } + // non-manifold edge + else if (edge2face_cnt[eid] > 2) { + is_manifold = false; + break; + } + } + + vert_is_manifold[tid] = is_manifold ? 1 : 0; +} + + +void CuMesh::get_vertex_is_manifold() { + if (this->vert2edge.is_empty() || this->vert2edge_offset.is_empty()) { + this->get_vertex_edge_adjacency(); + } + if (this->edge2face_cnt.is_empty()) { + this->get_edges(); + } + size_t V = this->vertices.size; + + // get vertex is manifold + this->vert_is_manifold.resize(V); + hipLaunchKernelGGL(( get_vertex_is_manifold_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vert2edge.ptr, + this->vert2edge_offset.ptr, + this->edge2face_cnt.ptr, + V, + this->vert_is_manifold.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +/** + * Set manifold face adjacency + * + * @param manifold_edge_idx: the buffer for manifold edge index, shape (M) + * @param edge2face: the buffer for edge to face adjacency, shape (total_edge_face_cnt) + * @param edge2face_offset: the buffer for edge to face adjacency offset, shape (E+1) + * @param M: the number of manifold edges + * @param manifold_face_adj: the buffer for manifold face adjacency, shape (M) + */ +static __global__ void set_manifold_face_adj_kernel( + const int* manifold_edge_idx, + const int* edge2face, + const int* edge2face_offset, + const int M, + int2* manifold_face_adj +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get edge + int edge_idx = manifold_edge_idx[tid]; + + // get adjacent faces + int start = edge2face_offset[edge_idx]; + int end = edge2face_offset[edge_idx+1]; + if (end - start != 2) return; // if not a manifold edge + int f0 = edge2face[start]; + int f1 = edge2face[start + 1]; + + manifold_face_adj[tid] = {f0, f1}; +} + + +struct is_manifold_edge { + const int* edge2face_cnt; + __host__ __device__ + bool operator()(const int& idx) const { + return edge2face_cnt[idx] == 2; + } +}; + + +void CuMesh::get_manifold_face_adjacency() { + if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { + this->get_edge_face_adjacency(); + } + size_t E = this->edges.size; + + // Select manifold edges + size_t temp_storage_bytes = 0; + int *cu_num_manifold_edges, *cu_edge_idx, *cu_manifold_edge_idx; + CUDA_CHECK(hipMalloc(&cu_num_manifold_edges, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_edge_idx, E * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_edge_idx, E); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( + nullptr, temp_storage_bytes, + cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, + E, + is_manifold_edge{this->edge2face_cnt.ptr} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::If( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, + E, + is_manifold_edge{this->edge2face_cnt.ptr} + )); + int manifold_edge_count; + CUDA_CHECK(hipMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_edges)); + CUDA_CHECK(hipFree(cu_edge_idx)); + + // set manifold_face_adj + this->manifold_face_adj.resize(manifold_edge_count); + hipLaunchKernelGGL(( set_manifold_face_adj_kernel), 
dim3((manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_manifold_edge_idx, + this->edge2face.ptr, + this->edge2face_offset.ptr, + manifold_edge_count, + this->manifold_face_adj.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_manifold_edge_idx)); +} + + +static __global__ void set_manifold_bound_adj_kernel( + const int* manifold_boundary_verts_idx, + const int* vert2bound, + const int* vert2bound_offset, + const size_t MBV, + int2* manifold_bound_adj +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= MBV) return; + + // get vertex + int vert_idx = manifold_boundary_verts_idx[tid]; + + // get adjacent boundaries + int b0 = vert2bound[vert2bound_offset[vert_idx]]; + int b1 = vert2bound[vert2bound_offset[vert_idx] + 1]; + + manifold_bound_adj[tid] = {b0, b1}; +} + + +struct is_manifold_boundary_vertex { + const uint8_t* vert_is_manifold; + const uint8_t* vert_is_boundary; + __host__ __device__ + bool operator()(const int& idx) const { + return vert_is_manifold[idx] && vert_is_boundary[idx]; + } +}; + + +void CuMesh::get_manifold_boundary_adjacency() { + if (this->vert2bound.is_empty() || this->vert2bound_offset.is_empty()) { + this->get_vertex_boundary_adjacency(); + } + if (this->vert_is_manifold.is_empty()) { + this->get_vertex_is_manifold(); + } + size_t V = this->vertices.size; + + // Select manifold boundary vertices + size_t temp_storage_bytes = 0; + int *cu_num_manifold_boundary_verts, *cu_vert_idx, *cu_manifold_vert_idx; + CUDA_CHECK(hipMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vert_idx, V * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_vert_idx, V * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_vert_idx, V); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( + nullptr, temp_storage_bytes, + cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, + V, + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::If( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, + V, + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} + )); + int manifold_boundary_vert_count; + CUDA_CHECK(hipMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_boundary_verts)); + CUDA_CHECK(hipFree(cu_vert_idx)); + + // Early return if no manifold boundary vertices + if (manifold_boundary_vert_count == 0) { + CUDA_CHECK(hipFree(cu_manifold_vert_idx)); + return; + } + + // set manifold_bound_adj + this->manifold_bound_adj.resize(manifold_boundary_vert_count); + hipLaunchKernelGGL(( set_manifold_bound_adj_kernel), dim3((manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_manifold_vert_idx, + this->vert2bound.ptr, + this->vert2bound_offset.ptr, + manifold_boundary_vert_count, + this->manifold_bound_adj.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +void CuMesh::get_connected_components() { + if (this->manifold_face_adj.is_empty()) { + this->get_manifold_face_adjacency(); + } + + size_t M = this->manifold_face_adj.size; + size_t F = this->faces.size; + + // Iterative Hook and Compress + this->conn_comp_ids.resize(F); + hipLaunchKernelGGL(( arange_kernel), 
dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->conn_comp_ids.ptr, F);
+    CUDA_CHECK(hipGetLastError());
+    int* cu_end_flag; int h_end_flag;
+    CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int)));
+    do {
+        h_end_flag = 1;
+        CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice));
+
+        // Hook
+        hipLaunchKernelGGL(( hook_edges_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->manifold_face_adj.ptr,
+            M,
+            this->conn_comp_ids.ptr,
+            cu_end_flag
+        );
+        CUDA_CHECK(hipGetLastError());
+
+        // Compress
+        hipLaunchKernelGGL(( compress_components_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->conn_comp_ids.ptr,
+            F
+        );
+        CUDA_CHECK(hipGetLastError());
+        CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost));
+    } while (h_end_flag == 0);
+    CUDA_CHECK(hipFree(cu_end_flag));
+
+    // Compress connected component ids
+    this->num_conn_comps = compress_ids(this->conn_comp_ids.ptr, F, this->cub_temp_storage);
+}
+
+
+void CuMesh::get_boundary_connected_components() {
+    if (this->manifold_bound_adj.is_empty()) {
+        this->get_manifold_boundary_adjacency();
+    }
+    size_t M = this->manifold_bound_adj.size;
+    size_t B = this->boundaries.size;
+
+    // Early return if no boundaries
+    if (B == 0) {
+        this->num_bound_conn_comps = 0;
+        return;
+    }
+
+    // Iterative Hook and Compress
+    this->bound_conn_comp_ids.resize(B);
+    hipLaunchKernelGGL(( arange_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->bound_conn_comp_ids.ptr, B);
+    CUDA_CHECK(hipGetLastError());
+    int* cu_end_flag; int h_end_flag;
+    CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int)));
+    do {
+        h_end_flag = 1;
+        CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice));
+
+        // Hook
+        hipLaunchKernelGGL(( hook_edges_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->manifold_bound_adj.ptr,
+            M,
+            this->bound_conn_comp_ids.ptr,
+            cu_end_flag
+        );
+        CUDA_CHECK(hipGetLastError());
+
+        // Compress
+        hipLaunchKernelGGL(( compress_components_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->bound_conn_comp_ids.ptr,
+            B
+        );
+        CUDA_CHECK(hipGetLastError());
+        CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost));
+    } while (h_end_flag == 0);
+    CUDA_CHECK(hipFree(cu_end_flag));
+
+    // Compress boundary connected component ids
+    this->num_bound_conn_comps = compress_ids(this->bound_conn_comp_ids.ptr, B, this->cub_temp_storage);
+}
+
+
+static __global__ void is_bound_conn_comp_loop_kernel(
+    const uint64_t* edges,
+    const int* boundaries,
+    const int* bound_conn_comp_ids,
+    const int* vert2bound,
+    const int* vert2bound_offset,
+    const int B,
+    int* is_bound_conn_comp_loop
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= B) return;
+
+    // get edge
+    int eid = boundaries[tid];
+    uint64_t e = edges[eid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+
+    int self_comp_id = bound_conn_comp_ids[tid];
+
+    // check if both vertices are connected to another boundary with the same connected component id
+    int cnt = 0;
+    for (int i = vert2bound_offset[e0]; i < vert2bound_offset[e0+1]; i++) {
+        int b = vert2bound[i];
+        if (b == tid) continue; // skip self
+        int comp_id = bound_conn_comp_ids[b];
+        if (comp_id == self_comp_id) cnt++;
+    }
+    if (cnt == 0) {
+        is_bound_conn_comp_loop[self_comp_id] = 0; // no loop
+        return;
+    }
+    cnt = 0;
+    for (int i = vert2bound_offset[e1]; i < vert2bound_offset[e1+1]; i++) {
+
int b = vert2bound[i]; + if (b == tid) continue; // skip self + int comp_id = bound_conn_comp_ids[b]; + if (comp_id == self_comp_id) cnt++; + } + if (cnt == 0) { + is_bound_conn_comp_loop[self_comp_id] = 0; // no loop + return; + } +} + + +void CuMesh::get_boundary_loops() { + if (this->bound_conn_comp_ids.is_empty()) { + this->get_boundary_connected_components(); + } + + size_t B = this->boundaries.size; + + // Early return if no boundaries or boundary components + if (B == 0 || this->num_bound_conn_comps == 0) { + this->num_bound_loops = 0; + return; + } + + // Check if boundary components are loops + int* cu_is_bound_conn_comp_loop; + CUDA_CHECK(hipMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); + hipLaunchKernelGGL(( fill_kernel), dim3((this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_is_bound_conn_comp_loop, + this->num_bound_conn_comps, + 1 + ); + CUDA_CHECK(hipGetLastError()); + hipLaunchKernelGGL(( is_bound_conn_comp_loop_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, + this->boundaries.ptr, + this->bound_conn_comp_ids.ptr, + this->vert2bound.ptr, + this->vert2bound_offset.ptr, + B, + cu_is_bound_conn_comp_loop + ); + CUDA_CHECK(hipGetLastError()); + int* cu_num_bound_loops; + CUDA_CHECK(hipMalloc(&cu_num_bound_loops, sizeof(int))); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, + cu_is_bound_conn_comp_loop, + cu_num_bound_loops, + this->num_bound_conn_comps + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_is_bound_conn_comp_loop, + cu_num_bound_loops, + this->num_bound_conn_comps + )); + CUDA_CHECK(hipMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_bound_loops)); + if (this->num_bound_loops == 0) { + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); + return; + } + + // Sort boundaries by connected component ids + int *cu_bound_sorted, *cu_bound_conn_comp_ids_sorted; + CUDA_CHECK(hipMalloc(&cu_bound_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_conn_comp_ids_sorted, B * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, + this->boundaries.ptr, cu_bound_sorted, + B + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, + this->boundaries.ptr, cu_bound_sorted, + B + )); + + // Select loops + int* cu_bound_is_on_loop; + CUDA_CHECK(hipMalloc(&cu_bound_is_on_loop, B * sizeof(int))); + hipLaunchKernelGGL(( index_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_is_bound_conn_comp_loop, + cu_bound_conn_comp_ids_sorted, + B, + cu_bound_is_on_loop + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); + this->loop_boundaries.resize(B); + int *cu_loop_bound_conn_comp_ids_sorted, *cu_num_bound_on_loop; + CUDA_CHECK(hipMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_bound_on_loop, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_bound_sorted, cu_bound_is_on_loop, 
this->loop_boundaries.ptr, cu_num_bound_on_loop, + B + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, + B + )); + int num_bound_on_loop; + CUDA_CHECK(hipMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_sorted)); + this->loop_boundaries.resize(num_bound_on_loop); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, + B + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, + B + )); + CUDA_CHECK(hipFree(cu_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_bound_is_on_loop)); + CUDA_CHECK(hipFree(cu_num_bound_on_loop)); + + // RLE + this->loop_boundaries_offset.resize(this->num_bound_loops + 1); + this->loop_boundaries_offset.zero(); + int* cu_rle_unique_out, *cu_rle_num_runs; + CUDA_CHECK(hipMalloc(&cu_rle_unique_out, this->num_bound_loops * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_rle_num_runs, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + cu_loop_bound_conn_comp_ids_sorted, + cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, + num_bound_on_loop + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_loop_bound_conn_comp_ids_sorted, + cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, + num_bound_on_loop + )); + CUDA_CHECK(hipFree(cu_loop_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_rle_unique_out)); + CUDA_CHECK(hipFree(cu_rle_num_runs)); + + // Scan loop boundaries offset + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->loop_boundaries_offset.ptr, + this->num_bound_loops + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->loop_boundaries_offset.ptr, + this->num_bound_loops + 1 + )); +} + + +} // namespace cumesh diff --git a/src/cumesh.h b/src/cumesh.h index 01a073b..e0da9ca 100644 --- a/src/cumesh.h +++ b/src/cumesh.h @@ -1,7 +1,11 @@ #pragma once +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif #include #include "utils.h" diff --git a/src/cumesh.hip b/src/cumesh.hip new file mode 100644 index 0000000..d5860db --- /dev/null +++ b/src/cumesh.hip @@ -0,0 +1,143 @@ +// !!! This is a file automatically generated by hipify!!! 
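+//
+// Implements CuMesh construction, destruction, size queries, and cache
+// invalidation. Note the asymmetry below: ~CuMesh() frees every buffer,
+// including `vertices` and `faces`, while clear_cache() keeps those two and
+// frees only derived data, so adjacency and atlas buffers are rebuilt lazily
+// by the next query that needs them.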
+#include "cumesh_hip.h" + + +namespace cumesh { + +CuMesh::CuMesh() {} + +CuMesh::~CuMesh() { + vertices.free(); + faces.free(); + face_areas.free(); + face_normals.free(); + vertex_normals.free(); + edges.free(); + boundaries.free(); + vert_is_boundary.free(); + vert_is_manifold.free(); + vert2edge.free(); + vert2edge_cnt.free(); + vert2edge_offset.free(); + vert2bound.free(); + vert2bound_cnt.free(); + vert2bound_offset.free(); + edge2face.free(); + edge2face_cnt.free(); + edge2face_offset.free(); + face2edge.free(); + vert2face.free(); + vert2face_cnt.free(); + vert2face_offset.free(); + manifold_face_adj.free(); + manifold_bound_adj.free(); + conn_comp_ids.free(); + bound_conn_comp_ids.free(); + loop_boundaries.free(); + loop_boundaries_offset.free(); + vertices_map.free(); + faces_map.free(); + edge_collapse_costs.free(); + propagated_costs.free(); + + atlas_chart_ids.free(); + atlas_chart_vertex_map.free(); + atlas_chart_faces.free(); + atlas_chart_faces_offset.free(); + atlas_chart_vertex_offset.free(); + atlas_chart_uvs.free(); + + atlas_chart_normal_cones.free(); + atlas_chart_adj.free(); + atlas_chart_adj_length.free(); + atlas_chart_perims.free(); + atlas_chart_areas.free(); + atlas_chart2edge.free(); + atlas_chart2edge_cnt.free(); + atlas_chart2edge_offset.free(); + + temp_storage.free(); + cub_temp_storage.free(); +} + +int CuMesh::num_vertices() const { + return vertices.size; +} + +int CuMesh::num_faces() const { + return faces.size; +} + +int CuMesh::num_edges() const { + return edges.size; +} + +int CuMesh::num_boundaries() const { + return boundaries.size; +} + +int CuMesh::num_conneted_components() const { + return num_conn_comps; +} + +int CuMesh::num_boundary_conneted_components() const { + return num_bound_conn_comps; +} + +int CuMesh::num_boundary_loops() const { + return num_bound_loops; +} + +void CuMesh::clear_cache() { + face_areas.free(); + face_normals.free(); + vertex_normals.free(); + edges.free(); + boundaries.free(); + vert_is_boundary.free(); + vert_is_manifold.free(); + vert2edge.free(); + vert2edge_cnt.free(); + vert2edge_offset.free(); + vert2bound.free(); + vert2bound_cnt.free(); + vert2bound_offset.free(); + edge2face.free(); + edge2face_cnt.free(); + edge2face_offset.free(); + face2edge.free(); + vert2face.free(); + vert2face_cnt.free(); + vert2face_offset.free(); + manifold_face_adj.free(); + manifold_bound_adj.free(); + conn_comp_ids.free(); + bound_conn_comp_ids.free(); + loop_boundaries.free(); + loop_boundaries_offset.free(); + vertices_map.free(); + faces_map.free(); + edge_collapse_costs.free(); + propagated_costs.free(); + + atlas_chart_ids.free(); + atlas_chart_vertex_map.free(); + atlas_chart_faces.free(); + atlas_chart_faces_offset.free(); + atlas_chart_vertex_offset.free(); + atlas_chart_uvs.free(); + + atlas_chart_normal_cones.free(); + atlas_chart_adj.free(); + atlas_chart_adj_length.free(); + atlas_chart_perims.free(); + atlas_chart_areas.free(); + atlas_chart2edge.free(); + atlas_chart2edge_cnt.free(); + atlas_chart2edge_offset.free(); + + temp_storage.free(); + cub_temp_storage.free(); +} + +} // namespace cumesh diff --git a/src/cumesh_hip.h b/src/cumesh_hip.h new file mode 100644 index 0000000..243e27d --- /dev/null +++ b/src/cumesh_hip.h @@ -0,0 +1,509 @@ +// !!! This is a file automatically generated by hipify!!! 
+#pragma once
+
+#ifdef __HIP_PLATFORM_AMD__
+#include
+#else
+#include
+#include
+#endif
+#include
+
+#include "utils_hip.h"
+
+
+#define BLOCK_SIZE 256
+
+
+namespace cumesh {
+
+class CuMesh {
+public:
+    Buffer vertices;
+    Buffer faces;
+
+    // Geometric properties
+    Buffer face_areas;
+    Buffer face_normals;
+    Buffer vertex_normals;
+
+    // Connectivity
+    Buffer edges;
+    Buffer boundaries;
+    Buffer vert_is_boundary;
+    Buffer vert_is_manifold;
+    Buffer vert2edge;
+    Buffer vert2edge_cnt;
+    Buffer vert2edge_offset;
+    Buffer vert2bound;
+    Buffer vert2bound_cnt;
+    Buffer vert2bound_offset;
+    Buffer edge2face;
+    Buffer edge2face_cnt;
+    Buffer edge2face_offset;
+    Buffer face2edge;
+    Buffer vert2face;
+    Buffer vert2face_cnt;
+    Buffer vert2face_offset;
+    Buffer manifold_face_adj;
+    Buffer manifold_bound_adj;
+    Buffer conn_comp_ids;
+    Buffer bound_conn_comp_ids;
+    Buffer loop_boundaries;
+    Buffer loop_boundaries_offset;
+    int num_conn_comps;
+    int num_bound_conn_comps;
+    int num_bound_loops;
+
+    // Cleanup
+    Buffer vertices_map;
+    Buffer faces_map;
+
+    // Simplification
+    Buffer edge_collapse_costs;
+    Buffer propagated_costs;
+
+    // Atlasing
+    int atlas_num_charts;
+    Buffer atlas_chart_ids;
+    Buffer atlas_chart_vertex_map;
+    Buffer atlas_chart_faces;
+    Buffer atlas_chart_faces_offset;
+    Buffer atlas_chart_vertex_offset;
+    Buffer atlas_chart_uvs;
+
+    Buffer atlas_chart_normal_cones;
+    Buffer atlas_chart_adj;
+    Buffer atlas_chart_adj_length;
+    Buffer atlas_chart_perims;
+    Buffer atlas_chart_areas;
+    Buffer atlas_chart2edge;
+    Buffer atlas_chart2edge_cnt;
+    Buffer atlas_chart2edge_offset;
+
+    // Temporary storage
+    Buffer temp_storage;
+    Buffer cub_temp_storage;
+
+    CuMesh();
+
+    ~CuMesh();
+
+    int num_vertices() const;
+
+    int num_faces() const;
+
+    int num_edges() const;
+
+    int num_boundaries() const;
+
+    int num_conneted_components() const;
+
+    int num_boundary_conneted_components() const;
+
+    int num_boundary_loops() const;
+
+    void clear_cache();
+
+    /**
+     * Initialize mesh
+     *
+     * @param vertices The vertex positions as a [V, 3] tensor.
+     * @param faces The triangle faces as an [F, 3] tensor.
+     */
+    void init(const torch::Tensor& vertices, const torch::Tensor& faces);
+
+    /**
+     * Get the mesh.
+     *
+     * @return A tuple of the vertex positions and the triangle faces.
+     */
+    std::tuple read();
+
+    /**
+     * Get the face normals.
+     *
+     * @return The face normals as an [F, 3] tensor.
+     */
+    torch::Tensor read_face_normals();
+
+    /**
+     * Get the normals of the vertices.
+     *
+     * @return The vertex normals as a [V, 3] tensor.
+     */
+    torch::Tensor read_vertex_normals();
+
+    /**
+     * Get the edges of the mesh.
+     *
+     * @return The edges as an [E, 2] tensor.
+     */
+    torch::Tensor read_edges();
+
+    /**
+     * Get the boundaries of the mesh.
+     *
+     * @return The boundaries as a [B] tensor.
+     * Each element is the index of a boundary edge.
+     */
+    torch::Tensor read_boundaries();
+
+    /**
+     * Get the manifold face adjacency.
+     *
+     * @return The manifold face adjacency as an [M, 2] tensor.
+     */
+    torch::Tensor read_manifold_face_adjacency();
+
+    /**
+     * Get the manifold boundary adjacency.
+     *
+     * @return The manifold boundary adjacency as an [M, 2] tensor.
+     */
+    torch::Tensor read_manifold_boundary_adjacency();
+
+    /**
+     * Get the connected components of the mesh.
+     *
+     * @return A tuple of:
+     * - The number of connected components.
+     * - The connected component ids as an [F] tensor.
+     */
+    std::tuple read_connected_components();
+
+    /**
+     * Get the connected components of the mesh boundaries.
+     *
+     * @return A tuple of:
+     * - The number of boundary connected components.
+     * - The boundary connected component ids as a [B] tensor.
+     */
+    std::tuple read_boundary_connected_components();
+
+    /**
+     * Get the boundary loops of the mesh.
+     *
+     * @return A tuple of:
+     * - The number of boundary loops.
+     * - The boundary loops as an [L] tensor.
+     * - The boundary loop offsets as an [L+1] tensor.
+     */
+    std::tuple read_boundary_loops();
+
+    /**
+     * Get all cached data.
+     *
+     * @return A dictionary of all cached data.
+     */
+    std::unordered_map read_all_cache();
+
+
+    // Geometric functions
+
+    /**
+     * Compute face areas.
+     * This function refreshes:
+     * - face_areas
+     */
+    void compute_face_areas();
+
+    /**
+     * Compute face normals.
+     * This function refreshes:
+     * - face_normals
+     */
+    void compute_face_normals();
+
+    /**
+     * Compute vertex normals.
+     * This function requires:
+     * - vert2face
+     * - vert2face_offset
+     * This function refreshes:
+     * - vertex_normals
+     */
+    void compute_vertex_normals();
+
+
+    // Connectivity functions
+
+    /**
+     * Get the vertex to face adjacency.
+     * This function refreshes:
+     * - vert2face
+     * - vert2face_cnt
+     * - vert2face_offset
+     */
+    void get_vertex_face_adjacency();
+
+    /**
+     * Get the edges of the mesh.
+     * This function refreshes:
+     * - edges
+     * - edge2face_cnt
+     */
+    void get_edges();
+
+    /**
+     * Get the edge to face adjacency of the mesh.
+     * This function requires:
+     * - edges
+     * - edge2face_cnt
+     * - vert2face
+     * - vert2face_offset
+     * This function refreshes:
+     * - edge2face
+     * - edge2face_offset
+     * - face2edge
+     */
+    void get_edge_face_adjacency();
+
+    /**
+     * Get the vertex to edge adjacency.
+     * This function requires:
+     * - edges
+     * This function refreshes:
+     * - vert2edge
+     * - vert2edge_cnt
+     * - vert2edge_offset
+     */
+    void get_vertex_edge_adjacency();
+
+    /**
+     * Get boundary information.
+     * This function requires:
+     * - edges
+     * - edge2face_cnt
+     * This function refreshes:
+     * - boundaries
+     * - vert_is_boundary
+     */
+    void get_boundary_info();
+
+    /**
+     * Get the vertex to boundary adjacency.
+     * This function requires:
+     * - edges
+     * - boundaries
+     * This function refreshes:
+     * - vert2bound
+     * - vert2bound_cnt
+     * - vert2bound_offset
+     */
+    void get_vertex_boundary_adjacency();
+
+    /**
+     * Get vertex manifold information.
+     * This function requires:
+     * - vert2edge
+     * - vert2edge_offset
+     * - edge2face_cnt
+     * This function refreshes:
+     * - vert_is_manifold
+     */
+    void get_vertex_is_manifold();
+
+    /**
+     * Get the face adjacency for manifold edges.
+     * This function requires:
+     * - edge2face
+     * - edge2face_offset
+     * This function refreshes:
+     * - manifold_face_adj
+     */
+    void get_manifold_face_adjacency();
+
+    /**
+     * Get the boundary adjacency for manifold boundary vertices.
+     * This function requires:
+     * - vert_is_manifold
+     * - vert2bound
+     * - vert2bound_offset
+     * This function refreshes:
+     * - manifold_bound_adj
+     */
+    void get_manifold_boundary_adjacency();
+
+    /**
+     * Get the connected components of the mesh.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - conn_comp_ids
+     */
+    void get_connected_components();
+
+    /**
+     * Get the boundary connected components of the mesh.
+     * This function requires:
+     * - manifold_bound_adj
+     * This function refreshes:
+     * - bound_conn_comp_ids
+     */
+    void get_boundary_connected_components();
+
+    /**
+     * Get the boundary loops of the mesh.
+     * This function requires:
+     * - vert2bound
+     * - vert2bound_offset
+     * - vert_is_boundary
+     * - bound_conn_comp_ids
+     * This function refreshes:
+     * - loop_boundaries
+     * - loop_boundaries_offset
+     */
+    void get_boundary_loops();
+
+
+    // Cleanup functions
+
+    /**
+     * Remove faces.
+     */
+    void remove_faces(torch::Tensor& face_mask);
+    void _remove_faces(uint8_t* face_mask);
+
+    /**
+     * Remove unreferenced vertices.
+     */
+    void remove_unreferenced_vertices();
+
+    /**
+     * Remove duplicate faces.
+     */
+    void remove_duplicate_faces();
+
+    /**
+     * Remove degenerate faces.
+     */
+    void remove_degenerate_faces(float abs_thresh, float rel_thresh);
+
+    /**
+     * Fill holes.
+     * This function requires:
+     * - loop_boundaries
+     * - loop_boundaries_offset
+     *
+     * @param max_hole_perimeter The maximum perimeter of a hole to be filled.
+     */
+    void fill_holes(float max_hole_perimeter);
+
+    /**
+     * Repair non-manifold edges by splitting edges.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     */
+    void repair_non_manifold_edges();
+
+    /**
+     * Remove faces on non-manifold edges.
+     * For each non-manifold edge (shared by >2 faces), only keep the first 2 faces.
+     * This repairs non-manifold edges by deleting faces instead of splitting vertices.
+     * This function requires:
+     * - edge2face
+     * - edge2face_offset
+     * - edge2face_cnt
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     */
+    void remove_non_manifold_faces();
+
+    /**
+     * Remove small connected components.
+     * This function requires:
+     * - conn_comp_ids
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     *
+     * @param min_area The minimum area of the connected components to be kept.
+     */
+    void remove_small_connected_components(float min_area);
+
+    /**
+     * Unify face orientations.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - faces
+     */
+    void unify_face_orientations();
+
+
+    // Simplification functions
+
+    /**
+     * Run the edge collapse algorithm.
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     *
+     * @param lambda_edge_length The weight for the edge length term.
+     * @param lambda_skinny The weight for the skinny term.
+     * @param threshold The threshold for edge collapse cost.
+     * @return A tuple of the number of vertices and the number of faces after simplification.
+     */
+    std::tuple simplify_step(float lambda_edge_length, float lambda_skinny, float threshold, bool timing=false);
+
+
+    // Atlasing functions
+
+    /**
+     * Compute charts for atlasing.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - atlas_chart_ids
+     * - atlas_chart_vertex_map
+     * - atlas_chart_faces
+     * - atlas_chart_faces_offset
+     *
+     * @param threshold_cone_half_angle_rad The threshold for the cone half angle in radians.
+     * @param refine_iterations The number of refinement iterations.
+     * @param global_iterations The number of global iterations.
+     * @param smooth_strength The strength of the smoothing.
+     * @param area_penalty_weight Coefficient for chart size penalty. Cost += Area * weight.
+     * Prevents charts from becoming too large if > 0,
+     * or encourages larger charts if < 0 (though usually used to penalize size variance).
+ * @param perimeter_area_ratio_weight Coefficient for shape irregularity (long-strip) penalty. + * Cost += (Perimeter / Area) * weight. + * Higher values penalize long strips and encourage circular/compact shapes. + */ + void compute_charts( + float threshold_cone_half_angle_rad, + int refine_iterations, + int global_iterations, + float smooth_strength, + float area_penalty_weight, + float perimeter_area_ratio_weight + ); + + /** + * Read the atlas charts. + * + * @return A tuple of: + * - The number of charts. + * - The chart ids as an [F] tensor. + * - The chart vertex map as an [V] tensor. + * - The chart faces as an [F, 3] tensor. + * - The chart vertices offset as an [C+1] tensor. + * - The chart faces offset as an [C+1] tensor. + */ + std::tuple read_atlas_charts(); +}; + +} // namespace cumesh diff --git a/src/dtypes.cuh b/src/dtypes.cuh index bff560c..ddc31ea 100644 --- a/src/dtypes.cuh +++ b/src/dtypes.cuh @@ -1,7 +1,11 @@ #pragma once +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif namespace cumesh { @@ -13,9 +17,9 @@ namespace cumesh { struct __align__(16) Vec3f { float x, y, z; - __device__ __forceinline__ Vec3f(); - __device__ __forceinline__ Vec3f(float x, float y, float z); - __device__ __forceinline__ Vec3f(float3 v); + __host__ __device__ __forceinline__ Vec3f(); + __host__ __device__ __forceinline__ Vec3f(float x, float y, float z); + __host__ __device__ __forceinline__ Vec3f(float3 v); __device__ __forceinline__ Vec3f operator+(const Vec3f& o) const; __device__ __forceinline__ Vec3f& operator+=(const Vec3f& o); __device__ __forceinline__ Vec3f operator-(const Vec3f& o) const; @@ -55,19 +59,19 @@ struct __align__(16) QEM }; -__device__ __forceinline__ Vec3f::Vec3f() { +__host__ __device__ __forceinline__ Vec3f::Vec3f() { x = 0.0f; y = 0.0f; z = 0.0f; } -__device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) { +__host__ __device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) { this->x = x; this->y = y; this->z = z; } -__device__ __forceinline__ Vec3f::Vec3f(float3 v) { +__host__ __device__ __forceinline__ Vec3f::Vec3f(float3 v) { x = v.x; y = v.y; z = v.z; diff --git a/src/dtypes_hip.cuh b/src/dtypes_hip.cuh new file mode 100644 index 0000000..c46201d --- /dev/null +++ b/src/dtypes_hip.cuh @@ -0,0 +1,329 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once + +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#include +#endif + + +namespace cumesh { + + +/** + * A 3D vector class with overloaded operators and methods. 
+ */ +struct __align__(16) Vec3f { + float x, y, z; + + __host__ __device__ __forceinline__ Vec3f(); + __host__ __device__ __forceinline__ Vec3f(float x, float y, float z); + __host__ __device__ __forceinline__ Vec3f(float3 v); + __device__ __forceinline__ Vec3f operator+(const Vec3f& o) const; + __device__ __forceinline__ Vec3f& operator+=(const Vec3f& o); + __device__ __forceinline__ Vec3f operator-(const Vec3f& o) const; + __device__ __forceinline__ Vec3f& operator-=(const Vec3f& o); + __device__ __forceinline__ Vec3f operator*(float s) const; + __device__ __forceinline__ Vec3f& operator*=(float s); + __device__ __forceinline__ Vec3f operator/(float s) const; + __device__ __forceinline__ Vec3f& operator/=(float s); + __device__ __forceinline__ float dot(const Vec3f& o) const; + __device__ __forceinline__ float norm() const; + __device__ __forceinline__ float norm2() const; + __device__ __forceinline__ Vec3f normalized() const; + __device__ __forceinline__ void normalize(); + __device__ __forceinline__ Vec3f cross(const Vec3f& o) const; + __device__ __forceinline__ Vec3f slerp(const Vec3f& o, float t) const; +}; + + +/** + * QEM (Quadric Error Metric) class for mesh simplification. + */ +struct __align__(16) QEM +{ + // store upper triangle of symmetric 4x4 matrix: + // e = [ 00, 01, 02, 03, 11, 12, 13, 22, 23, 33 ] + float e[10]; + + __device__ __forceinline__ QEM(); + __device__ __forceinline__ QEM operator+(const QEM& o) const; + __device__ __forceinline__ QEM& operator+=(const QEM& o); + __device__ __forceinline__ QEM operator-(const QEM& o) const; + __device__ __forceinline__ QEM& operator-=(const QEM& o); + __device__ __forceinline__ void zero(); + __device__ __forceinline__ void add_plane(float4 p); + __device__ __forceinline__ float evaluate(const Vec3f& p) const; + __device__ __forceinline__ bool solve_optimal(float3 &out, float &err) const; +}; + + +__host__ __device__ __forceinline__ Vec3f::Vec3f() { + x = 0.0f; + y = 0.0f; + z = 0.0f; +} + +__host__ __device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) { + this->x = x; + this->y = y; + this->z = z; +} + +__host__ __device__ __forceinline__ Vec3f::Vec3f(float3 v) { + x = v.x; + y = v.y; + z = v.z; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator+(const Vec3f& o) const { + return Vec3f(x + o.x, y + o.y, z + o.z); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator+=(const Vec3f& o) { + x += o.x; + y += o.y; + z += o.z; + return *this; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator-(const Vec3f& o) const { + return Vec3f(x - o.x, y - o.y, z - o.z); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator-=(const Vec3f& o) { + x -= o.x; + y -= o.y; + z -= o.z; + return *this; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator*(float s) const { + return Vec3f(x * s, y * s, z * s); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator*=(float s) { + x *= s; + y *= s; + z *= s; + return *this; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator/(float s) const { + return Vec3f(x / s, y / s, z / s); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator/=(float s) { + x /= s; + y /= s; + z /= s; + return *this; +} + + +__device__ __forceinline__ float Vec3f::dot(const Vec3f& o) const { + return x * o.x + y * o.y + z * o.z; +} + + +__device__ __forceinline__ float Vec3f::norm() const { + return sqrtf(x * x + y * y + z * z); +} + + +__device__ __forceinline__ float Vec3f::norm2() const { + return x * x + y * y + z * z; +} + + +__device__ __forceinline__ Vec3f Vec3f::normalized() 
const { + float inv_norm = rsqrtf(x * x + y * y + z * z); + return Vec3f(x * inv_norm, y * inv_norm, z * inv_norm); +} + + +__device__ __forceinline__ void Vec3f::normalize() { + float inv_norm = rsqrtf(x * x + y * y + z * z); + x *= inv_norm; + y *= inv_norm; + z *= inv_norm; +} + + +__device__ __forceinline__ Vec3f Vec3f::cross(const Vec3f& o) const { + return Vec3f(y * o.z - z * o.y, z * o.x - x * o.z, x * o.y - y * o.x); +} + + +__device__ __forceinline__ Vec3f Vec3f::slerp(const Vec3f& o, float t) const { + float dot_prod = this->dot(o); + dot_prod = fmaxf(fminf(dot_prod, 1.0f), -1.0f); // Clamp to [-1, 1] + float theta = acosf(dot_prod) * t; + Vec3f relative_vec = (o - (*this) * dot_prod).normalized(); + return (*this) * cosf(theta) + relative_vec * sinf(theta); +} + + +__device__ __forceinline__ QEM::QEM() { + zero(); +} + + +__device__ __forceinline__ QEM QEM::operator+(const QEM& o) const { + QEM res; + #pragma unroll + for (int i = 0; i < 10; ++i) res.e[i] = e[i] + o.e[i]; + return res; +} + + +__device__ __forceinline__ QEM& QEM::operator+=(const QEM& o) { + #pragma unroll + for (int i = 0; i < 10; ++i) e[i] += o.e[i]; + return *this; +} + + +__device__ __forceinline__ QEM QEM::operator-(const QEM& o) const { + QEM res; + #pragma unroll + for (int i = 0; i < 10; ++i) res.e[i] = e[i] - o.e[i]; + return res; +} + + +__device__ __forceinline__ QEM& QEM::operator-=(const QEM& o) { + #pragma unroll + for (int i = 0; i < 10; ++i) e[i] -= o.e[i]; + return *this; +} + +__device__ __forceinline__ void QEM::zero() { + #pragma unroll + for (int i = 0; i < 10; ++i) e[i] = 0.0f; +} + + +// Add plane p = (a,b,c,d) as outer product p * p^T +__device__ __forceinline__ void QEM::add_plane(float4 p) { + // upper triangle indices mapping: + // (0,0)->e[0] + // (0,1)->e[1] + // (0,2)->e[2] + // (0,3)->e[3] + // (1,1)->e[4] + // (1,2)->e[5] + // (1,3)->e[6] + // (2,2)->e[7] + // (2,3)->e[8] + // (3,3)->e[9] + float a = p.x, b = p.y, c = p.z, d = p.w; + e[0] += a * a; + e[1] += a * b; + e[2] += a * c; + e[3] += a * d; + e[4] += b * b; + e[5] += b * c; + e[6] += b * d; + e[7] += c * c; + e[8] += c * d; + e[9] += d * d; +} + + +// Evaluate v^T * Q * v for v = (x,y,z,1) +__device__ __forceinline__ float QEM::evaluate(const Vec3f& p) const { + // compute v = [x,y,z,1] + float x = p.x, y = p.y, z = p.z, w = 1.0f; + // expand symmetric multiplication using stored upper triangular + // result = sum_{i<=j} M_ij * v_i * v_j * (1 if i==j else 2) + float res = 0.0f; + // (0,0) + res += e[0] * x * x; + // (0,1) and (1,0) + res += 2.0f * e[1] * x * y; + // (0,2) + res += 2.0f * e[2] * x * z; + // (0,3) + res += 2.0f * e[3] * x * w; + // (1,1) + res += e[4] * y * y; + // (1,2) + res += 2.0f * e[5] * y * z; + // (1,3) + res += 2.0f * e[6] * y * w; + // (2,2) + res += e[7] * z * z; + // (2,3) + res += 2.0f * e[8] * z * w; + // (3,3) + res += e[9] * w * w; + return res; +} + + +// Try to solve for optimal point minimizing v^T Q v with constraint v = (x,y,z,1) +// Solve the linear system: A * [x y z]^T = -b, where +// A = top-left 3x3 of Q, b = [e03, e13, e23] (note signs) +// Return true if solved (matrix invertible), false otherwise. err returns the error at the solution. 
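+// (Derivation: with v = (p, 1), v^T Q v = p^T A p + 2 b^T p + e[9], so the
+// minimizer satisfies the gradient condition 2 (A p + b) = 0, i.e. A p = -b,
+// which is the system solved below; at that optimum the error reduces to
+// dot(b, p) + e[9].)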
+__device__ __forceinline__ bool QEM::solve_optimal(float3 &out, float &err) const { + // Build A (symmetric) + float A00 = e[0]; + float A01 = e[1]; + float A02 = e[2]; + float A11 = e[4]; + float A12 = e[5]; + float A22 = e[7]; + // b = (e03, e13, e23) where e03=e[3], e13=e[6], e23=e[8] + float b0 = e[3]; + float b1 = e[6]; + float b2 = e[8]; + + // Solve A * x = -b + // Use analytic inverse for 3x3 symmetric matrix (compute determinant) + // Compute determinant + float det = + A00 * (A11 * A22 - A12 * A12) - + A01 * (A01 * A22 - A12 * A02) + + A02 * (A01 * A12 - A11 * A02); + + if (fabsf(det) < 1e-12f) { + // singular - fall back: pick minimal among corners (or average 0) + // Here choose to put out as (0,0,0) + out = make_float3(0.0f, 0.0f, 0.0f); + err = evaluate(out); + return false; + } + + float invDet = 1.0f / det; + + // Compute inverse(A) via adjugate + float inv00 = (A11 * A22 - A12 * A12) * invDet; + float inv01 = -(A01 * A22 - A12 * A02) * invDet; + float inv02 = (A01 * A12 - A11 * A02) * invDet; + float inv11 = (A00 * A22 - A02 * A02) * invDet; + float inv12 = -(A00 * A12 - A01 * A02) * invDet; + float inv22 = (A00 * A11 - A01 * A01) * invDet; + + // x = -inv(A) * b + float x = -(inv00 * b0 + inv01 * b1 + inv02 * b2); + float y = -(inv01 * b0 + inv11 * b1 + inv12 * b2); + float z = -(inv02 * b0 + inv12 * b1 + inv22 * b2); + + out = make_float3(x, y, z); + err = evaluate(out); + return true; +} + + +} // namespace cumesh diff --git a/src/ext_hip.cpp b/src/ext_hip.cpp new file mode 100644 index 0000000..e24577d --- /dev/null +++ b/src/ext_hip.cpp @@ -0,0 +1,69 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#include "hash/api.h" +#include "cumesh_hip.h" +#include "remesh/api.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + // Hash functions + m.def("hashmap_insert_cuda", &cumesh::hashmap_insert_cuda); + m.def("hashmap_lookup_cuda", &cumesh::hashmap_lookup_cuda); + m.def("hashmap_insert_3d_cuda", &cumesh::hashmap_insert_3d_cuda); + m.def("hashmap_lookup_3d_cuda", &cumesh::hashmap_lookup_3d_cuda); + m.def("hashmap_insert_3d_idx_as_val_cuda", &cumesh::hashmap_insert_3d_idx_as_val_cuda); + + /* CUMESH */ + py::class_(m, "CuMesh") + .def(py::init<>()) + .def("num_vertices", &cumesh::CuMesh::num_vertices) + .def("num_faces", &cumesh::CuMesh::num_faces) + .def("num_edges", &cumesh::CuMesh::num_edges) + .def("num_boundaries", &cumesh::CuMesh::num_boundaries) + .def("num_conneted_components", &cumesh::CuMesh::num_conneted_components) + .def("num_boundary_conneted_components", &cumesh::CuMesh::num_boundary_conneted_components) + .def("num_boundary_loops", &cumesh::CuMesh::num_boundary_loops) + .def("clear_cache", &cumesh::CuMesh::clear_cache) + .def("init", &cumesh::CuMesh::init) + .def("read", &cumesh::CuMesh::read) + .def("read_face_normals", &cumesh::CuMesh::read_face_normals) + .def("read_vertex_normals", &cumesh::CuMesh::read_vertex_normals) + .def("read_edges", &cumesh::CuMesh::read_edges) + .def("read_boundaries", &cumesh::CuMesh::read_boundaries) + .def("read_manifold_face_adjacency", &cumesh::CuMesh::read_manifold_face_adjacency) + .def("read_manifold_boundary_adjacency", &cumesh::CuMesh::read_manifold_boundary_adjacency) + .def("read_connected_components", &cumesh::CuMesh::read_connected_components) + .def("read_boundary_connected_components", &cumesh::CuMesh::read_boundary_connected_components) + .def("read_boundary_loops", &cumesh::CuMesh::read_boundary_loops) + .def("read_all_cache", &cumesh::CuMesh::read_all_cache) + 
.def("compute_face_normals", &cumesh::CuMesh::compute_face_normals) + .def("compute_vertex_normals", &cumesh::CuMesh::compute_vertex_normals) + .def("get_vertex_face_adjacency", &cumesh::CuMesh::get_vertex_face_adjacency) + .def("get_edges", &cumesh::CuMesh::get_edges) + .def("get_edge_face_adjacency", &cumesh::CuMesh::get_edge_face_adjacency) + .def("get_vertex_edge_adjacency", &cumesh::CuMesh::get_vertex_edge_adjacency) + .def("get_boundary_info", &cumesh::CuMesh::get_boundary_info) + .def("get_vertex_boundary_adjacency", &cumesh::CuMesh::get_vertex_boundary_adjacency) + .def("get_vertex_is_manifold", &cumesh::CuMesh::get_vertex_is_manifold) + .def("get_manifold_face_adjacency", &cumesh::CuMesh::get_manifold_face_adjacency) + .def("get_manifold_boundary_adjacency", &cumesh::CuMesh::get_manifold_boundary_adjacency) + .def("get_connected_components", &cumesh::CuMesh::get_connected_components) + .def("get_boundary_connected_components", &cumesh::CuMesh::get_boundary_connected_components) + .def("get_boundary_loops", &cumesh::CuMesh::get_boundary_loops) + .def("remove_faces", &cumesh::CuMesh::remove_faces) + .def("remove_unreferenced_vertices", &cumesh::CuMesh::remove_unreferenced_vertices) + .def("remove_duplicate_faces", &cumesh::CuMesh::remove_duplicate_faces) + .def("remove_degenerate_faces", &cumesh::CuMesh::remove_degenerate_faces) + .def("fill_holes", &cumesh::CuMesh::fill_holes) + .def("repair_non_manifold_edges", &cumesh::CuMesh::repair_non_manifold_edges) + .def("remove_non_manifold_faces", &cumesh::CuMesh::remove_non_manifold_faces) + .def("remove_small_connected_components", &cumesh::CuMesh::remove_small_connected_components) + .def("unify_face_orientations", &cumesh::CuMesh::unify_face_orientations) + .def("simplify_step", &cumesh::CuMesh::simplify_step) + .def("compute_charts", &cumesh::CuMesh::compute_charts) + .def("read_atlas_charts", &cumesh::CuMesh::read_atlas_charts); + + // Remeshing functions + m.def("get_sparse_voxel_grid_active_vertices", &cumesh::get_sparse_voxel_grid_active_vertices); + m.def("simple_dual_contour", &cumesh::simple_dual_contour); +} \ No newline at end of file diff --git a/src/geometry.cu b/src/geometry.cu index 0e493ba..8866ee4 100644 --- a/src/geometry.cu +++ b/src/geometry.cu @@ -1,7 +1,11 @@ #include "cumesh.h" #include "dtypes.cuh" #include "shared.h" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace cumesh { @@ -32,7 +36,7 @@ void CuMesh::compute_face_areas() { F, this->face_areas.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -65,7 +69,7 @@ void CuMesh::compute_face_normals() { F, this->face_normals.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -123,7 +127,7 @@ void CuMesh::compute_vertex_normals() { V, this->vertex_normals.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } diff --git a/src/geometry.hip b/src/geometry.hip new file mode 100644 index 0000000..5cbb9ce --- /dev/null +++ b/src/geometry.hip @@ -0,0 +1,136 @@ +// !!! This is a file automatically generated by hipify!!! 
+#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#include "shared_hip.h" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#endif + + +namespace cumesh { + + +static __global__ void compute_face_areas_kernel( + const float3* vertices, + const int3* faces, + const size_t F, + float* face_areas +) { + const int fid = blockIdx.x * blockDim.x + threadIdx.x; + if (fid >= F) return; + int3 face = faces[fid]; + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + face_areas[fid] = 0.5 * (v1 - v0).cross(v2 - v0).norm(); +} + + +void CuMesh::compute_face_areas() { + size_t F = this->faces.size; + this->face_areas.resize(F); + hipLaunchKernelGGL(( compute_face_areas_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + F, + this->face_areas.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +static __global__ void compute_face_normals_kernel( + const float3* vertices, + const int3* faces, + const size_t F, + float3* face_normals +) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int3 face = faces[tid]; + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + + Vec3f normal = (v1 - v0).cross(v2 - v0); + normal.normalize(); + face_normals[tid] = make_float3(normal.x, normal.y, normal.z); +} + + +void CuMesh::compute_face_normals() { + size_t F = this->faces.size; + this->face_normals.resize(F); + hipLaunchKernelGGL(( compute_face_normals_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + F, + this->face_normals.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +static __global__ void compute_vertex_normals_kernel( + const float3* vertices, + const int3* faces, + const int* vert2face, + const int* vert2face_offset, + const size_t V, + float3* vertex_normals +) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + + int start = vert2face_offset[tid]; + int end = vert2face_offset[tid + 1]; + + Vec3f normal(0.0f, 0.0f, 0.0f); + Vec3f first_face_normal; + for (int i = start; i < end; i++) { + int fid = vert2face[i]; + int3 face = faces[fid]; + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + + Vec3f face_normal = (v1 - v0).cross(v2 - v0); + normal += face_normal; + if (i == start) { + first_face_normal = face_normal; + } + } + + normal.normalize(); + // if NAN, fallback to first face normal + if (isnan(normal.x)) { + normal = first_face_normal; + } + vertex_normals[tid] = make_float3(normal.x, normal.y, normal.z); +} + + +void CuMesh::compute_vertex_normals() { + if (this->vert2face.is_empty() || this->vert2face_offset.is_empty()) { + this->get_vertex_face_adjacency(); + } + + size_t V = this->vertices.size; + this->vertex_normals.resize(V); + hipLaunchKernelGGL(( compute_vertex_normals_kernel), dim3((V + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + this->vert2face.ptr, + this->vert2face_offset.ptr, + V, + this->vertex_normals.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +} // namespace cumesh \ No newline at end of file diff --git a/src/hash/hash.cu b/src/hash/hash.cu index a9b1c23..9fd20c9 100644 --- a/src/hash/hash.cu +++ b/src/hash/hash.cu @@ -1,6 +1,10 @@ #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif #include 
"api.h" #include "hash.cuh" diff --git a/src/hash/hash.hip b/src/hash/hash.hip new file mode 100644 index 0000000..0e02def --- /dev/null +++ b/src/hash/hash.hip @@ -0,0 +1,451 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#include +#endif + +#include "api.h" +#include "hash.cuh" + + +template +static __global__ void hashmap_insert_cuda_kernel( + const size_t N, + const size_t M, + K* __restrict__ hashmap_keys, + V* __restrict__ hashmap_values, + const K* __restrict__ keys, + const V* __restrict__ values +) { + size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) + { + K key = keys[thread_id]; + V value = values[thread_id]; + linear_probing_insert(hashmap_keys, hashmap_values, key, value, N); + } +} + + +template +static void dispatch_hashmap_insert_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& keys, + const torch::Tensor& values +) { + hipLaunchKernelGGL(( hashmap_insert_cuda_kernel), + dim3((keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + keys.size(0), + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + keys.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Insert keys into the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param keys [M] uint32/uint64 tensor containing the keys to be inserted + * @param values [M] uint32/uint64 tensor containing the values to be inserted + */ +void cumesh::hashmap_insert_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& keys, + const torch::Tensor& values +) { + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } +} + + +template +static __global__ void hashmap_lookup_cuda_kernel( + const size_t N, + const size_t M, + const K * __restrict__ hashmap_keys, + const V * __restrict__ hashmap_values, + const K * __restrict__ keys, + V * __restrict__ values +) { + size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + K key = keys[thread_id]; + 
values[thread_id] = linear_probing_lookup(hashmap_keys, hashmap_values, key, N); + } +} + + +template +static void dispatch_hashmap_lookup_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& keys, + torch::Tensor& values +) { + hipLaunchKernelGGL(( hashmap_lookup_cuda_kernel), + dim3((keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + keys.size(0), + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + keys.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Lookup keys in the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param keys [M] uint32/uint64 tensor containing the keys to be looked up + * @return [M] uint32/uint64 tensor containing the values of the keys + */ +torch::Tensor cumesh::hashmap_lookup_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& keys +) { + // Allocate output tensor + auto output = torch::empty({keys.size(0)}, torch::dtype(hashmap_values.dtype()).device(hashmap_values.device())); + + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(output.dtype() == torch::kUInt32, "Output must be uint32"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(output.dtype() == torch::kUInt64, "Output must be uint64"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(output.dtype() == torch::kUInt32, "Output must be uint32"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(output.dtype() == torch::kUInt64, "Output must be uint64"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } + + return output; +} + + +template +static __global__ void hashmap_insert_3d_cuda_kernel( + const size_t N, + const size_t M, + const int W, + const int H, + const int D, + K* __restrict__ hashmap_keys, + V* __restrict__ hashmap_values, + const int32_t* __restrict__ coords, + const V* __restrict__ values +) { + size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + int4 coord = reinterpret_cast(coords)[thread_id]; + int b = coord.x; + int x = coord.y; + int y = coord.z; + int z = coord.w; + size_t flat_idx = (size_t)b * W * H * D + (size_t)x * H * D + (size_t)y * D + z; + K key = static_cast(flat_idx); + V value = values[thread_id]; + linear_probing_insert(hashmap_keys, hashmap_values, key, value, N); + } +} + + +template +static void dispatch_hashmap_insert_3d_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + const torch::Tensor& values, + int W, int H, int D +) { + 
hipLaunchKernelGGL(( hashmap_insert_3d_cuda_kernel), + dim3((coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + coords.size(0), + W, H, D, + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + coords.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Insert 3D coordinates into the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param coords [M, 4] int32 tensor containing the keys to be inserted + * @param values [M] uint32/uint64 tensor containing the values to be inserted + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + */ +void cumesh::hashmap_insert_3d_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + const torch::Tensor& values, + int W, + int H, + int D +) { + TORCH_CHECK(coords.dtype() == torch::kInt32, "Coords must be int32"); + + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } +} + + +template +static __global__ void hashmap_lookup_3d_cuda_kernel( + const size_t N, + const size_t M, + const int W, + const int H, + const int D, + const K* __restrict__ hashmap_keys, + const V* __restrict__ hashmap_values, + const int32_t* __restrict__ coords, + V* __restrict__ values +) { + const size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + int4 coord = reinterpret_cast(coords)[thread_id]; + int b = coord.x; + int x = coord.y; + int y = coord.z; + int z = coord.w; + if (x < 0 || x >= W || y < 0 || y >= H || z < 0 || z >= D) { + values[thread_id] = std::numeric_limits::max(); + return; + } + size_t flat_idx = (size_t)b * W * H * D + (size_t)x * H * D + (size_t)y * D + z; + K key = static_cast(flat_idx); + values[thread_id] = linear_probing_lookup(hashmap_keys, hashmap_values, key, N); + } +} + + +template +static void dispatch_hashmap_lookup_3d_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& coords, + torch::Tensor& values, + int W, int H, int D +) { + hipLaunchKernelGGL(( hashmap_lookup_3d_cuda_kernel), + dim3((coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + coords.size(0), + W, H, D, + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + 
coords.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Lookup 3D coordinates in the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param coords [M, 4] int32 tensor containing the keys to be looked up + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + * + * @return [M] uint32/uint64 tensor containing the values of the keys + */ +torch::Tensor cumesh::hashmap_lookup_3d_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& coords, + int W, + int H, + int D +) { + // Allocate output tensor + auto output = torch::empty({coords.size(0)}, torch::dtype(hashmap_values.dtype()).device(hashmap_values.device())); + + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } + + return output; +} + + +template +static __global__ void hashmap_insert_3d_idx_as_val_cuda_kernel( + const size_t N, + const size_t M, + const int W, + const int H, + const int D, + K* __restrict__ hashmap, + V* __restrict__ values, + const int32_t* __restrict__ coords +) { + const size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + int4 coord = reinterpret_cast(coords)[thread_id]; + int b = coord.x; + int x = coord.y; + int y = coord.z; + int z = coord.w; + size_t flat_idx = (size_t)b * W * H * D + (size_t)x * H * D + (size_t)y * D + z; + K key = static_cast(flat_idx); + V value = static_cast(thread_id); + linear_probing_insert(hashmap, values, key, value, N); + } +} + + +template +static void dispatch_hashmap_insert_3d_idx_as_val_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + int W, int H, int D +) { + hipLaunchKernelGGL(( hashmap_insert_3d_idx_as_val_cuda_kernel), + dim3((coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + coords.size(0), + W, H, D, + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + coords.data_ptr() + ); +} + + +/** + * Insert 3D coordinates into the hashmap using index as value + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param coords [M, 4] int32 tensor containing the keys to be inserted + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + */ +void cumesh::hashmap_insert_3d_idx_as_val_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + int W, + int H, + int D +) { + 
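+    // No values tensor here: the kernel stores each coordinate's row index
+    // in `coords` as its value, so a later lookup of a flattened
+    // (b, x, y, z) key recovers the row of that voxel in the input array.
+    // The dtype check below mirrors the one in hashmap_insert_3d_cuda.
+    TORCH_CHECK(coords.dtype() == torch::kInt32, "Coords must be int32");
+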
// Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } +} \ No newline at end of file diff --git a/src/io.cu b/src/io.cu index 579cf16..aa12883 100644 --- a/src/io.cu +++ b/src/io.cu @@ -85,11 +85,11 @@ static torch::Tensor buffer_to_tensor(const Buffer buffer) { static constexpr int dst_bytes = Mapping::sizeof_scalar * Mapping::channels; if (sizeof(T) == dst_bytes) { - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( tensor.data_ptr(), buffer.ptr, count * sizeof(T), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); } else { CUDA_CHECK(cudaMemcpy2D( @@ -99,7 +99,7 @@ static torch::Tensor buffer_to_tensor(const Buffer buffer) { sizeof(T), dst_bytes, count, - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); } @@ -119,7 +119,7 @@ void CuMesh::init(const torch::Tensor& vertices, const torch::Tensor& faces) { sizeof(float) * 3, sizeof(float) * 3, num_vertices, - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); CUDA_CHECK(cudaMemcpy2D( this->faces.ptr, @@ -128,7 +128,7 @@ void CuMesh::init(const torch::Tensor& vertices, const torch::Tensor& faces) { sizeof(int) * 3, sizeof(int) * 3, num_faces, - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); } diff --git a/src/io.hip b/src/io.hip new file mode 100644 index 0000000..0043191 --- /dev/null +++ b/src/io.hip @@ -0,0 +1,251 @@ +// !!! This is a file automatically generated by hipify!!! 
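+//
+// Reading note: buffer_to_tensor below is driven by the TorchTypeMapping
+// trait table, which maps each device element type (int2, float3, ...) to a
+// torch scalar type, a scalar size, and a channel count. When sizeof(T)
+// equals the packed row size the copy is a single hipMemcpy; otherwise
+// hipMemcpy2D strides over the buffer so that only the packed payload of
+// each element lands in the tensor.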
+#include "cumesh_hip.h" + + +namespace cumesh { + + +template +struct TorchTypeMapping; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt8; + static constexpr int sizeof_scalar = 1; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kUInt8; + static constexpr int sizeof_scalar = 1; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 2; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 2; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 3; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 2; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 3; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 4; +}; + + +template +static torch::Tensor buffer_to_tensor(const Buffer buffer) { + using Mapping = TorchTypeMapping; + + int64_t count = static_cast(buffer.size); + std::vector shape; + if (Mapping::channels == 1) { + shape = {count}; // 1D Tensor + } else { + shape = {count, Mapping::channels}; // 2D Tensor [N, C] + } + + auto options = torch::dtype(Mapping::scalar_type).device(torch::kCUDA); + auto tensor = torch::empty(shape, options); + + static constexpr int dst_bytes = Mapping::sizeof_scalar * Mapping::channels; + if (sizeof(T) == dst_bytes) { + CUDA_CHECK(hipMemcpy( + tensor.data_ptr(), + buffer.ptr, + count * sizeof(T), + hipMemcpyDeviceToDevice + )); + } else { + CUDA_CHECK(hipMemcpy2D( + tensor.data_ptr(), + dst_bytes, + buffer.ptr, + sizeof(T), + dst_bytes, + count, + hipMemcpyDeviceToDevice + )); + } + + return tensor; +} + + +void CuMesh::init(const torch::Tensor& vertices, const torch::Tensor& faces) { + size_t num_vertices = vertices.size(0); + size_t num_faces = faces.size(0); + this->vertices.resize(num_vertices); + this->faces.resize(num_faces); + CUDA_CHECK(hipMemcpy2D( + this->vertices.ptr, + sizeof(float3), + vertices.data_ptr(), + sizeof(float) * 3, + sizeof(float) * 3, + num_vertices, + hipMemcpyDeviceToDevice + )); + CUDA_CHECK(hipMemcpy2D( + this->faces.ptr, + sizeof(int3), + faces.data_ptr(), + sizeof(int) * 3, + sizeof(int) * 3, + num_faces, + hipMemcpyDeviceToDevice + )); +} + + +std::tuple CuMesh::read() { + auto vertices = buffer_to_tensor(this->vertices); + auto faces = buffer_to_tensor(this->faces); + return std::make_tuple(vertices, faces); +} + + +torch::Tensor CuMesh::read_face_normals() { + auto face_normals = 
buffer_to_tensor(this->face_normals); + return face_normals; +} + + +torch::Tensor CuMesh::read_vertex_normals() { + auto vertex_normals = buffer_to_tensor(this->vertex_normals); + return vertex_normals; +} + + +torch::Tensor CuMesh::read_edges() { + auto edges = buffer_to_tensor(this->edges); + return edges; +} + + +torch::Tensor CuMesh::read_boundaries() { + auto boundaries = buffer_to_tensor(this->boundaries); + return boundaries; +} + + +torch::Tensor CuMesh::read_manifold_face_adjacency() { + auto manifold_face_adj = buffer_to_tensor(this->manifold_face_adj); + return manifold_face_adj; +} + + +torch::Tensor CuMesh::read_manifold_boundary_adjacency() { + auto manifold_bound_adj = buffer_to_tensor(this->manifold_bound_adj); + return manifold_bound_adj; +} + + +std::tuple CuMesh::read_connected_components() { + auto conn_comp_ids_tensor = buffer_to_tensor(this->conn_comp_ids); + return std::make_tuple(this->num_conn_comps, conn_comp_ids_tensor); +} + + +std::tuple CuMesh::read_boundary_connected_components() { + auto bound_conn_comp_ids_tensor = buffer_to_tensor(this->bound_conn_comp_ids); + return std::make_tuple(this->num_bound_conn_comps, bound_conn_comp_ids_tensor); +} + + +std::tuple CuMesh::read_boundary_loops() { + auto loop_boundaries_tensor = buffer_to_tensor(this->loop_boundaries); + auto loop_boundaries_offset_tensor = buffer_to_tensor(this->loop_boundaries_offset); + return std::make_tuple(this->num_bound_loops, loop_boundaries_tensor, loop_boundaries_offset_tensor); +} + + +std::unordered_map CuMesh::read_all_cache() { + std::unordered_map cache; + cache["face_areas"] = buffer_to_tensor(this->face_areas); + cache["face_normals"] = buffer_to_tensor(this->face_normals); + cache["vertex_normals"] = buffer_to_tensor(this->vertex_normals); + cache["edges"] = buffer_to_tensor(this->edges); + cache["boundaries"] = buffer_to_tensor(this->boundaries); + cache["vert_is_boundary"] = buffer_to_tensor(this->vert_is_boundary); + cache["vert_is_manifold"] = buffer_to_tensor(this->vert_is_manifold); + cache["vert2edge"] = buffer_to_tensor(this->vert2edge); + cache["vert2edge_cnt"] = buffer_to_tensor(this->vert2edge_cnt); + cache["vert2edge_offset"] = buffer_to_tensor(this->vert2edge_offset); + cache["vert2bound"] = buffer_to_tensor(this->vert2bound); + cache["vert2bound_cnt"] = buffer_to_tensor(this->vert2bound_cnt); + cache["vert2bound_offset"] = buffer_to_tensor(this->vert2bound_offset); + cache["edge2face"] = buffer_to_tensor(this->edge2face); + cache["edge2face_cnt"] = buffer_to_tensor(this->edge2face_cnt); + cache["edge2face_offset"] = buffer_to_tensor(this->edge2face_offset); + cache["face2edge"] = buffer_to_tensor(this->face2edge); + cache["vert2face"] = buffer_to_tensor(this->vert2face); + cache["vert2face_cnt"] = buffer_to_tensor(this->vert2face_cnt); + cache["vert2face_offset"] = buffer_to_tensor(this->vert2face_offset); + cache["manifold_face_adj"] = buffer_to_tensor(this->manifold_face_adj); + cache["manifold_bound_adj"] = buffer_to_tensor(this->manifold_bound_adj); + cache["conn_comp_ids"] = buffer_to_tensor(this->conn_comp_ids); + cache["bound_conn_comp_ids"] = buffer_to_tensor(this->bound_conn_comp_ids); + cache["loop_boundaries"] = buffer_to_tensor(this->loop_boundaries); + cache["loop_boundaries_offset"] = buffer_to_tensor(this->loop_boundaries_offset); + cache["vertices_map"] = buffer_to_tensor(this->vertices_map); + cache["faces_map"] = buffer_to_tensor(this->faces_map); + cache["edge_collapse_costs"] = buffer_to_tensor(this->edge_collapse_costs); + 
cache["propagated_costs"] = buffer_to_tensor(this->propagated_costs); + cache["atlas_chart_ids"] = buffer_to_tensor(this->atlas_chart_ids); + cache["atlas_chart_vertex_map"] = buffer_to_tensor(this->atlas_chart_vertex_map); + cache["atlas_chart_faces"] = buffer_to_tensor(this->atlas_chart_faces); + cache["atlas_chart_faces_offset"] = buffer_to_tensor(this->atlas_chart_faces_offset); + cache["atlas_chart_vertex_offset"] = buffer_to_tensor(this->atlas_chart_vertex_offset); + cache["atlas_chart_uvs"] = buffer_to_tensor(this->atlas_chart_uvs); + cache["atlas_chart_normal_cones"] = buffer_to_tensor(this->atlas_chart_normal_cones); + cache["atlas_chart_adj"] = buffer_to_tensor(this->atlas_chart_adj); + cache["atlas_chart_adj_length"] = buffer_to_tensor(this->atlas_chart_adj_length); + cache["atlas_chart_perims"] = buffer_to_tensor(this->atlas_chart_perims); + cache["atlas_chart_areas"] = buffer_to_tensor(this->atlas_chart_areas); + cache["atlas_chart2edge"] = buffer_to_tensor(this->atlas_chart2edge); + cache["atlas_chart2edge_cnt"] = buffer_to_tensor(this->atlas_chart2edge_cnt); + cache["atlas_chart2edge_offset"] = buffer_to_tensor(this->atlas_chart2edge_offset); + cache["temp_storage"] = buffer_to_tensor(this->temp_storage); + cache["cub_temp_storage"] = buffer_to_tensor(this->cub_temp_storage); + return cache; +} + + +} // namespace cumesh diff --git a/src/remesh/simple_dual_contour.cu b/src/remesh/simple_dual_contour.cu index e064ee3..81c6782 100644 --- a/src/remesh/simple_dual_contour.cu +++ b/src/remesh/simple_dual_contour.cu @@ -1,6 +1,10 @@ #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif #include #include "api.h" @@ -218,6 +222,6 @@ std::tuple cumesh::simple_dual_contour( TORCH_CHECK(false, "Unsupported hashmap data type"); } - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); return std::make_tuple(vertices, intersected); } diff --git a/src/remesh/simple_dual_contour.hip b/src/remesh/simple_dual_contour.hip new file mode 100644 index 0000000..6bbe049 --- /dev/null +++ b/src/remesh/simple_dual_contour.hip @@ -0,0 +1,228 @@ +// !!! This is a file automatically generated by hipify!!! 
+#include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#include <tuple>
+
+#include "api.h"
+#include "../utils_hip.h"
+#include "../hash/hash.cuh"
+
+
+template <typename T>
+__device__ __forceinline__ float get_vertex_val(
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const float* __restrict__ udf,
+    const size_t N_vert,
+    int x, int y, int z,
+    int W, int H, int D
+) {
+    size_t flat_idx = (size_t)x * H * D + (size_t)y * D + z;
+    T key = static_cast<T>(flat_idx);
+    uint32_t idx = linear_probing_lookup(hashmap_keys, hashmap_vals, key, N_vert);
+    return udf[idx];
+}
+
+
+template <typename T>
+static __global__ void simple_dual_contour_kernel(
+    const size_t N_vert,
+    const size_t M,
+    const int W,
+    const int H,
+    const int D,
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const int32_t* __restrict__ coords,
+    const float* __restrict__ udf,
+    float* __restrict__ out_vertices,
+    int32_t* __restrict__ out_intersected
+) {
+    size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_id >= M) return;
+
+    int vx = coords[thread_id * 3 + 0];
+    int vy = coords[thread_id * 3 + 1];
+    int vz = coords[thread_id * 3 + 2];
+
+    float3 intersection_sum = make_float3(0.0f, 0.0f, 0.0f);
+    int intersection_count = 0;
+
+    // Traverse the 12 edges of the voxel
+    // Axis X
+    #pragma unroll
+    for (int u = 0; u <= 1; ++u) {
+        #pragma unroll
+        for (int v = 0; v <= 1; ++v) {
+            float val1 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx, vy + u, vz + v, W, H, D);
+            float val2 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + 1, vy + u, vz + v, W, H, D);
+
+            // Calculate the intersection point
+            if ((val1 < 0 && val2 >= 0) || (val1 >= 0 && val2 < 0)) {
+                float t = -val1 / (val2 - val1);
+                // P = P1 + t * (P2 - P1)
+                intersection_sum.x += (float)vx + t;
+                intersection_sum.y += (float)(vy + u);
+                intersection_sum.z += (float)(vz + v);
+                intersection_count++;
+            }
+
+            if (u == 1 && v == 1) {
+                if (val1 < 0 && val2 >= 0) {
+                    out_intersected[thread_id * 3 + 0] = 1;
+                }
+                else if (val1 >= 0 && val2 < 0) {
+                    out_intersected[thread_id * 3 + 0] = -1;
+                }
+                else {
+                    out_intersected[thread_id * 3 + 0] = 0;
+                }
+            }
+        }
+    }
+
+    // Axis Y
+    #pragma unroll
+    for (int u = 0; u <= 1; ++u) {
+        #pragma unroll
+        for (int v = 0; v <= 1; ++v) {
+            float val1 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy, vz + v, W, H, D);
+            float val2 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy + 1, vz + v, W, H, D);
+
+            if ((val1 < 0 && val2 >= 0) || (val1 >= 0 && val2 < 0)) {
+                float t = -val1 / (val2 - val1);
+                intersection_sum.x += (float)(vx + u);
+                intersection_sum.y += (float)vy + t;
+                intersection_sum.z += (float)(vz + v);
+                intersection_count++;
+            }
+
+            if (u == 1 && v == 1) {
+                if (val1 < 0 && val2 >= 0) {
+                    out_intersected[thread_id * 3 + 1] = 1;
+                }
+                else if (val1 >= 0 && val2 < 0) {
+                    out_intersected[thread_id * 3 + 1] = -1;
+                }
+                else {
+                    out_intersected[thread_id * 3 + 1] = 0;
+                }
+            }
+        }
+    }
+
+    // Axis Z
+    #pragma unroll
+    for (int u = 0; u <= 1; ++u) {
+        #pragma unroll
+        for (int v = 0; v <= 1; ++v) {
+            float val1 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy + v, vz, W, H, D);
+            float val2 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy + v, vz + 1, W, H, D);
+
+            if ((val1 < 0 && val2 >= 0) || (val1 >= 0 && val2 < 0)) {
+                float t = -val1 / (val2 - val1);
+                intersection_sum.x += (float)(vx + u);
+                intersection_sum.y += (float)(vy + v);
+
intersection_sum.z += (float)vz + t; + intersection_count++; + } + + if (u == 1 && v == 1) { + if (val1 < 0 && val2 >= 0) { + out_intersected[thread_id * 3 + 2] = 1; + } + else if (val1 >= 0 && val2 < 0) { + out_intersected[thread_id * 3 + 2] = -1; + } + else { + out_intersected[thread_id * 3 + 2] = 0; + } + } + } + } + + // Calculate the mean intersection point + if (intersection_count > 0) { + out_vertices[thread_id * 3 + 0] = intersection_sum.x / intersection_count; + out_vertices[thread_id * 3 + 1] = intersection_sum.y / intersection_count; + out_vertices[thread_id * 3 + 2] = intersection_sum.z / intersection_count; + } else { + // Fallback: Voxel Center + out_vertices[thread_id * 3 + 0] = (float)vx + 0.5f; + out_vertices[thread_id * 3 + 1] = (float)vy + 0.5f; + out_vertices[thread_id * 3 + 2] = (float)vz + 0.5f; + } +} + + +/** + * Isosurfacing a volume defined on vertices of a sparse voxel grid using a simple dual contouring algorithm. + * Dual vertices are computed by mean of edge intersections. + * + * @param hashmap_keys [Nvert] uint32/uint64 hashmap of the vertices keys + * @param hashmap_vals [Nvert] uint32 tensor containing the hashmap values as vertex indices + * @param coords [Mvox, 3] int32 tensor containing the coordinates of the active voxels + * @param udf [Mvert] float tensor containing the UDF/SDF values at the vertices + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + * + * @return [L, 3] float tensor containing the active vertices (Dual Vertices) + [L, 3] int32 tensor containing the intersected edges (1: intersected, 0: not intersected) + */ +std::tuple cumesh::simple_dual_contour( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_vals, + const torch::Tensor& coords, + const torch::Tensor& udf, + int W, + int H, + int D +) { + const size_t M = coords.size(0); + const size_t N_vert = hashmap_keys.size(0); + + auto vertices = torch::empty({(long)M, 3}, torch::dtype(torch::kFloat32).device(coords.device())); + auto intersected = torch::empty({(long)M, 3}, torch::dtype(torch::kInt32).device(coords.device())); + + dim3 threads(BLOCK_SIZE); + dim3 blocks((M + BLOCK_SIZE - 1) / BLOCK_SIZE); + + if (hashmap_keys.dtype() == torch::kUInt32) { + hipLaunchKernelGGL(( simple_dual_contour_kernel), dim3(blocks), dim3(threads), 0, 0, + N_vert, + M, + W, H, D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + udf.data_ptr(), + vertices.data_ptr(), + intersected.data_ptr() + ); + } + else if (hashmap_keys.dtype() == torch::kUInt64) { + hipLaunchKernelGGL(( simple_dual_contour_kernel), dim3(blocks), dim3(threads), 0, 0, + N_vert, + M, + W, H, D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + udf.data_ptr(), + vertices.data_ptr(), + intersected.data_ptr() + ); + } + else { + TORCH_CHECK(false, "Unsupported hashmap data type"); + } + + CUDA_CHECK(hipGetLastError()); + return std::make_tuple(vertices, intersected); +} diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu index 6f1d517..63f1ee3 100644 --- a/src/remesh/svox2vert.cu +++ b/src/remesh/svox2vert.cu @@ -1,7 +1,15 @@ #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include "api.h" #include "../utils.h" @@ -148,7 +156,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Get the number of active vertices for each voxel size_t N = hashmap_keys.size(0); int* 
num_vertices; - CUDA_CHECK(cudaMalloc(&num_vertices, (M + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&num_vertices, (M + 1) * sizeof(int))); if (hashmap_keys.dtype() == torch::kUInt32) { get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( N, @@ -176,17 +184,17 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( } else { TORCH_CHECK(false, "Unsupported data type"); } - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compute the offset size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + hipcub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); void* d_temp_storage = nullptr; - CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); - CUDA_CHECK(cudaFree(d_temp_storage)); + CUDA_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); + CUDA_CHECK(hipFree(d_temp_storage)); int total_vertices; - CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&total_vertices, num_vertices + M, sizeof(int), hipMemcpyDeviceToHost)); // Set the active vertices for each voxel auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); @@ -218,10 +226,10 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( vertices.data_ptr() ); } - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Free the temporary memory - CUDA_CHECK(cudaFree(num_vertices)); + CUDA_CHECK(hipFree(num_vertices)); return vertices; } diff --git a/src/remesh/svox2vert.hip b/src/remesh/svox2vert.hip new file mode 100644 index 0000000..dde7050 --- /dev/null +++ b/src/remesh/svox2vert.hip @@ -0,0 +1,236 @@ +// !!! This is a file automatically generated by hipify!!! 
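+//
+// Two-pass allocation pattern: get_vertex_num counts, per active voxel, its
+// own origin corner plus every neighbouring corner whose owning voxel is
+// absent from the hashmap (or out of range); hipcub's ExclusiveSum over the
+// counts then gives each voxel a contiguous write offset, and set_vertex
+// repeats the same walk to emit the coordinates. A corner shared by several
+// active voxels can be emitted more than once here, so callers presumably
+// deduplicate downstream.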
+#include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
+#include <cub/cub.cuh>
+#endif
+
+#include "api.h"
+#include "../utils_hip.h"
+#include "../hash/api.h"
+#include "../hash/hash.cuh"
+
+
+template <typename T>
+static __global__ void get_vertex_num(
+    const size_t N,
+    const size_t M,
+    const int W,
+    const int H,
+    const int D,
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const int32_t* __restrict__ coords,
+    int* __restrict__ num_vertices
+) {
+    size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_id >= M) return;
+
+    int num = 1; // include the current voxel
+
+    int x = coords[3 * thread_id + 0];
+    int y = coords[3 * thread_id + 1];
+    int z = coords[3 * thread_id + 2];
+
+    size_t flat_idx;
+    T key;
+
+    #pragma unroll
+    for (int i = 0; i <= 1; i++) {
+        #pragma unroll
+        for (int j = 0; j <= 1; j++) {
+            #pragma unroll
+            for (int k = 0; k <= 1; k++) {
+                if (i == 0 && j == 0 && k == 0) continue;
+                int xx = x + i;
+                int yy = y + j;
+                int zz = z + k;
+                if (xx >= W || yy >= H || zz >= D) {
+                    num++;
+                    continue;
+                }
+                flat_idx = (size_t)(xx * H + yy) * D + zz;
+                key = static_cast<T>(flat_idx);
+                if (linear_probing_lookup(hashmap_keys, hashmap_vals, key, N) == std::numeric_limits<uint32_t>::max()) {
+                    num++;
+                }
+            }
+        }
+    }
+
+    num_vertices[thread_id] = num;
+}
+
+
+template <typename T>
+static __global__ void set_vertex(
+    const size_t N,
+    const size_t M,
+    const int W,
+    const int H,
+    const int D,
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const int32_t* __restrict__ coords,
+    const int* __restrict__ vertices_offset,
+    int* __restrict__ vertices
+) {
+    size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_id >= M) return;
+
+    int x = coords[3 * thread_id + 0];
+    int y = coords[3 * thread_id + 1];
+    int z = coords[3 * thread_id + 2];
+    int ptr_start = vertices_offset[thread_id];
+    vertices[3 * ptr_start + 0] = x;
+    vertices[3 * ptr_start + 1] = y;
+    vertices[3 * ptr_start + 2] = z;
+    ptr_start++;
+
+    size_t flat_idx;
+    T key;
+
+    #pragma unroll
+    for (int i = 0; i <= 1; i++) {
+        #pragma unroll
+        for (int j = 0; j <= 1; j++) {
+            #pragma unroll
+            for (int k = 0; k <= 1; k++) {
+                if (i == 0 && j == 0 && k == 0) continue;
+                int xx = x + i;
+                int yy = y + j;
+                int zz = z + k;
+                if (xx >= W || yy >= H || zz >= D) {
+                    vertices[3 * ptr_start + 0] = xx;
+                    vertices[3 * ptr_start + 1] = yy;
+                    vertices[3 * ptr_start + 2] = zz;
+                    ptr_start++;
+                    continue;
+                }
+                flat_idx = (size_t)(xx * H + yy) * D + zz;
+                key = static_cast<T>(flat_idx);
+                if (linear_probing_lookup(hashmap_keys, hashmap_vals, key, N) == std::numeric_limits<uint32_t>::max()) {
+                    vertices[3 * ptr_start + 0] = xx;
+                    vertices[3 * ptr_start + 1] = yy;
+                    vertices[3 * ptr_start + 2] = zz;
+                    ptr_start++;
+                }
+            }
+        }
+    }
+}
+
+
+/**
+ * Get the active vertices of a sparse voxel grid
+ *
+ * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys
+ * @param hashmap_vals [N] uint32 tensor containing the hashmap values as voxel indices
+ * @param coords [M, 3] int32 tensor containing the coordinates of the active voxels
+ * @param W the number of width dimensions
+ * @param H the number of height dimensions
+ * @param D the number of depth dimensions
+ *
+ * @return [L, 3] int32 tensor containing the active vertices
+ */
+torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
+    torch::Tensor& hashmap_keys,
+    torch::Tensor& hashmap_vals,
+    const torch::Tensor& coords,
+    const int W,
+    const int H,
+
const int D +) { + // Handle empty input - return early to avoid launching kernels with 0 blocks + size_t M = coords.size(0); + if (M == 0) { + return torch::empty({0, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); + } + + // Get the number of active vertices for each voxel + size_t N = hashmap_keys.size(0); + int* num_vertices; + CUDA_CHECK(hipMalloc(&num_vertices, (M + 1) * sizeof(int))); + if (hashmap_keys.dtype() == torch::kUInt32) { + hipLaunchKernelGGL(( get_vertex_num), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices + ); + } else if (hashmap_keys.dtype() == torch::kUInt64) { + hipLaunchKernelGGL(( get_vertex_num), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices + ); + } else { + TORCH_CHECK(false, "Unsupported data type"); + } + CUDA_CHECK(hipGetLastError()); + + // Compute the offset + size_t temp_storage_bytes = 0; + hipcub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + void* d_temp_storage = nullptr; + CUDA_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); + CUDA_CHECK(hipFree(d_temp_storage)); + int total_vertices; + CUDA_CHECK(hipMemcpy(&total_vertices, num_vertices + M, sizeof(int), hipMemcpyDeviceToHost)); + + // Set the active vertices for each voxel + auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); + if (hashmap_keys.dtype() == torch::kUInt32) { + hipLaunchKernelGGL(( set_vertex), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices, + vertices.data_ptr() + ); + } + else if (hashmap_keys.dtype() == torch::kUInt64) { + hipLaunchKernelGGL(( set_vertex), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices, + vertices.data_ptr() + ); + } + CUDA_CHECK(hipGetLastError()); + + // Free the temporary memory + CUDA_CHECK(hipFree(num_vertices)); + + return vertices; +} diff --git a/src/shared.h b/src/shared.h index 66ecac7..e6bf04c 100644 --- a/src/shared.h +++ b/src/shared.h @@ -1,8 +1,16 @@ #pragma once +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include "utils.h" #include "cumesh.h" @@ -160,66 +168,66 @@ template int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nullptr) { int *cu_indices, *cu_indices_argsorted; T *cu_ids_sorted; - CUDA_CHECK(cudaMalloc(&cu_indices, N * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_indices_argsorted, N * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_ids_sorted, N * sizeof(T))); + CUDA_CHECK(hipMalloc(&cu_indices, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_indices_argsorted, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_ids_sorted, N * sizeof(T))); arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_indices, N); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, 
temp_storage_bytes, ids, cu_ids_sorted, cu_indices, cu_indices_argsorted, N )); cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( cub_temp_storage.ptr, temp_storage_bytes, ids, cu_ids_sorted, cu_indices, cu_indices_argsorted, N )); - CUDA_CHECK(cudaFree(cu_indices)); + CUDA_CHECK(hipFree(cu_indices)); // get diff T* cu_new_ids; - CUDA_CHECK(cudaMalloc(&cu_new_ids, N * sizeof(T))); + CUDA_CHECK(hipMalloc(&cu_new_ids, N * sizeof(T))); get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_ids_sorted, cu_new_ids, N ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // inverse if (inverse) { int* cu_num; - CUDA_CHECK(cudaMalloc(&cu_num, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_ids_sorted, cu_new_ids, inverse, cu_num, N )); cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( cub_temp_storage.ptr, temp_storage_bytes, cu_ids_sorted, cu_new_ids, inverse, cu_num, N )); - CUDA_CHECK(cudaFree(cu_num)); + CUDA_CHECK(hipFree(cu_num)); } - CUDA_CHECK(cudaFree(cu_ids_sorted)); + CUDA_CHECK(hipFree(cu_ids_sorted)); // scan diff temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_new_ids, N )); cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( cub_temp_storage.ptr, temp_storage_bytes, cu_new_ids, N @@ -232,12 +240,12 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu N, ids ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); T num_components; - CUDA_CHECK(cudaMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), hipMemcpyDeviceToHost)); num_components += 1; - CUDA_CHECK(cudaFree(cu_new_ids)); - CUDA_CHECK(cudaFree(cu_indices_argsorted)); + CUDA_CHECK(hipFree(cu_new_ids)); + CUDA_CHECK(hipFree(cu_indices_argsorted)); return static_cast(num_components); } @@ -248,33 +256,33 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu template void print_max_val(T* ptr, size_t size) { T* dbg_cu_max_val; - CUDA_CHECK(cudaMalloc(&dbg_cu_max_val, sizeof(T))); + CUDA_CHECK(hipMalloc(&dbg_cu_max_val, sizeof(T))); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceReduce::Max( + CUDA_CHECK(hipcub::DeviceReduce::Max( nullptr, temp_storage_bytes, ptr, dbg_cu_max_val, size )); char* temp_storage; - CUDA_CHECK(cudaMalloc(&temp_storage, temp_storage_bytes)); - CUDA_CHECK(cub::DeviceReduce::Max( + CUDA_CHECK(hipMalloc(&temp_storage, temp_storage_bytes)); + CUDA_CHECK(hipcub::DeviceReduce::Max( temp_storage, temp_storage_bytes, ptr, dbg_cu_max_val, size )); T h_max_val; - CUDA_CHECK(cudaMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), hipMemcpyDeviceToHost)); std::cout << "Max value: " << h_max_val << std::endl; - CUDA_CHECK(cudaFree(dbg_cu_max_val)); - CUDA_CHECK(cudaFree(temp_storage)); + CUDA_CHECK(hipFree(dbg_cu_max_val)); + CUDA_CHECK(hipFree(temp_storage)); } template void print_val(T* ptr, size_t size) { T h_ptr[size]; - 
CUDA_CHECK(cudaMemcpy(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(h_ptr, ptr, size * sizeof(T), hipMemcpyDeviceToHost)); for (size_t i = 0; i < size; i++) { std::cout << h_ptr[i] << " "; } diff --git a/src/shared.hip b/src/shared.hip new file mode 100644 index 0000000..9d74d4c --- /dev/null +++ b/src/shared.hip @@ -0,0 +1,69 @@ +// !!! This is a file automatically generated by hipify!!! +#include "hip/hip_runtime.h" +#include "shared_hip.h" + + +namespace cumesh { + + +/** + * Hook edges + * @param adj: the buffer for adjacency, shape (M) + * @param M: the number of adjacency + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param end_flag: flag to indicate if any union operation happened + */ +__global__ void hook_edges_kernel( + const int2* adj, + const int M, + int* conn_comp_ids, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get adjacent faces + int f0 = adj[tid].x; + int f1 = adj[tid].y; + + // union + // find roots + int root0 = conn_comp_ids[f0]; + while (root0 != conn_comp_ids[root0]) { + root0 = conn_comp_ids[root0]; + } + int root1 = conn_comp_ids[f1]; + while (root1 != conn_comp_ids[root1]) { + root1 = conn_comp_ids[root1]; + } + + if (root0 == root1) return; + + int high = max(root0, root1); + int low = min(root0, root1); + atomicMin(&conn_comp_ids[high], low); + *end_flag = 0; +} + + +/** + * Compress connected components + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param F: the number of faces + */ +__global__ void compress_components_kernel( + int* conn_comp_ids, + const int F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int p = conn_comp_ids[tid]; + while (p != conn_comp_ids[p]) { + p = conn_comp_ids[p]; + } + conn_comp_ids[tid] = p; +} + + +} // namespace cumesh \ No newline at end of file diff --git a/src/shared_hip.h b/src/shared_hip.h new file mode 100644 index 0000000..a82e1b4 --- /dev/null +++ b/src/shared_hip.h @@ -0,0 +1,294 @@ +// !!! This is a file automatically generated by hipify!!! 
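+//
+// Header-only device utilities shared by the .hip translation units: small
+// elementwise kernels (arange / cast / fill / scatter / index / diff / flag
+// / compare), declarations of the union-find kernels defined in shared.hip,
+// and compress_ids, which renumbers an arbitrary id array into a dense
+// 0..K-1 range via sort, adjacent-diff, exclusive scan, and scatter.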
+#pragma once
+
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
+#include <cub/cub.cuh>
+#endif
+#include "utils_hip.h"
+#include "cumesh_hip.h"
+
+
+namespace cumesh {
+
+
+template <typename T>
+__global__ void arange_kernel(T* array, int N, int stride=1) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    array[tid] = static_cast<T>(tid * stride);
+}
+
+
+template <typename T1, typename T2>
+__global__ void cast_kernel(T1* input, T2* output, int N) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[tid] = static_cast<T2>(input[tid]);
+}
+
+
+template <typename T>
+__global__ void fill_kernel(T* array, int N, T value) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    array[tid] = value;
+}
+
+
+template <typename T>
+__global__ void scatter_kernel(
+    const int* indices,
+    const T* values,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[indices[tid]] = values[tid];
+}
+
+
+template <typename T>
+__global__ void index_kernel(
+    const T* values,
+    const int* indices,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[tid] = values[indices[tid]];
+}
+
+
+template <typename T>
+__global__ void diff_kernel(
+    const T* values,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[tid] = values[tid+1] - values[tid];
+}
+
+
+template <typename T>
+__global__ void set_flag_kernel(
+    const int* indices,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[indices[tid]] = static_cast<T>(1);
+}
+
+
+template <typename CompT, typename FlagT, typename Comparator>
+__global__ void compare_kernel(
+    const CompT* values,
+    const CompT threshold,
+    const size_t N,
+    Comparator op,
+    FlagT* flag
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    flag[tid] = op(values[tid], threshold) ?
static_cast(1) : static_cast(0); +} + + +template +__global__ void inplace_div_kernel( + Ta* a, + const Tb* b, + const size_t N +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + a[tid] = a[tid] / static_cast(b[tid]); +} + + +/** + * Hook edges + * @param adj: the buffer for adjacency, shape (M) + * @param M: the number of adjacency + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param end_flag: flag to indicate if any union operation happened + */ +__global__ void hook_edges_kernel( + const int2* adj, + const int M, + int* conn_comp_ids, + int* end_flag +); + + +/** + * Compress connected components + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param F: the number of faces + */ +__global__ void compress_components_kernel( + int* conn_comp_ids, + const int F +); + + +template +__global__ void get_diff_kernel( + const T* ids_sorted, + T* ids_diff, + const int N +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + if (tid == N-1) { + ids_diff[tid] = 1; + return; + } + if (ids_sorted[tid] != ids_sorted[tid+1]) { + ids_diff[tid] = 1; + } else { + ids_diff[tid] = 0; + } +} + + +template +int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nullptr) { + int *cu_indices, *cu_indices_argsorted; + T *cu_ids_sorted; + CUDA_CHECK(hipMalloc(&cu_indices, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_indices_argsorted, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_ids_sorted, N * sizeof(T))); + hipLaunchKernelGGL(( arange_kernel), dim3((N+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_indices, N); + CUDA_CHECK(hipGetLastError()); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + ids, cu_ids_sorted, + cu_indices, cu_indices_argsorted, + N + )); + cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + cub_temp_storage.ptr, temp_storage_bytes, + ids, cu_ids_sorted, + cu_indices, cu_indices_argsorted, + N + )); + CUDA_CHECK(hipFree(cu_indices)); + + // get diff + T* cu_new_ids; + CUDA_CHECK(hipMalloc(&cu_new_ids, N * sizeof(T))); + hipLaunchKernelGGL(( get_diff_kernel), dim3((N+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_ids_sorted, + cu_new_ids, + N + ); + CUDA_CHECK(hipGetLastError()); + + // inverse + if (inverse) { + int* cu_num; + CUDA_CHECK(hipMalloc(&cu_num, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_ids_sorted, cu_new_ids, inverse, cu_num, + N + )); + cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + cub_temp_storage.ptr, temp_storage_bytes, + cu_ids_sorted, cu_new_ids, inverse, cu_num, + N + )); + CUDA_CHECK(hipFree(cu_num)); + } + CUDA_CHECK(hipFree(cu_ids_sorted)); + + // scan diff + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_new_ids, + N + )); + cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + cub_temp_storage.ptr, temp_storage_bytes, + cu_new_ids, + N + )); + + // scatter + hipLaunchKernelGGL(( scatter_kernel), dim3((N+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_indices_argsorted, + cu_new_ids, + N, + ids + ); + CUDA_CHECK(hipGetLastError()); + T num_components; + CUDA_CHECK(hipMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), hipMemcpyDeviceToHost)); + num_components += 1; + 
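+    // cu_new_ids holds the exclusive prefix sum of the "a new id starts
+    // here" flags; get_diff_kernel always flags the last slot, so the value
+    // at N-1 counts every group boundary before it, and the +1 above
+    // accounts for the final group.
+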
CUDA_CHECK(hipFree(cu_new_ids)); + CUDA_CHECK(hipFree(cu_indices_argsorted)); + + return static_cast(num_components); +} + + +// DEBUG + +template +void print_max_val(T* ptr, size_t size) { + T* dbg_cu_max_val; + CUDA_CHECK(hipMalloc(&dbg_cu_max_val, sizeof(T))); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceReduce::Max( + nullptr, temp_storage_bytes, + ptr, + dbg_cu_max_val, + size + )); + char* temp_storage; + CUDA_CHECK(hipMalloc(&temp_storage, temp_storage_bytes)); + CUDA_CHECK(hipcub::DeviceReduce::Max( + temp_storage, temp_storage_bytes, + ptr, + dbg_cu_max_val, + size + )); + T h_max_val; + CUDA_CHECK(hipMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), hipMemcpyDeviceToHost)); + std::cout << "Max value: " << h_max_val << std::endl; + CUDA_CHECK(hipFree(dbg_cu_max_val)); + CUDA_CHECK(hipFree(temp_storage)); +} + +template +void print_val(T* ptr, size_t size) { + T h_ptr[size]; + CUDA_CHECK(hipMemcpy(h_ptr, ptr, size * sizeof(T), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + std::cout << h_ptr[i] << " "; + } + std::cout << std::endl; +} + + +} // namespace cumesh \ No newline at end of file diff --git a/src/simplify.cu b/src/simplify.cu index 9efde9e..50eb5b6 100644 --- a/src/simplify.cu +++ b/src/simplify.cu @@ -1,6 +1,10 @@ #include "cumesh.h" #include "dtypes.cuh" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace cumesh { @@ -77,7 +81,7 @@ void get_qem( V, F, reinterpret_cast(ctx.temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -246,7 +250,7 @@ void get_edge_collapse_cost( lambda_edge_length, lambda_skinny, ctx.edge_collapse_costs.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -311,7 +315,7 @@ void propagate_cost( V, F, E, ctx.propagated_costs.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -466,22 +470,22 @@ void collapse_edges( ctx.vertices_map.ptr, ctx.faces_map.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // update vertices buffer // get vertices map size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, ctx.vertices_map.ptr, V+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, ctx.vertices_map.ptr, V+1 )); int new_num_vertices; - CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), hipMemcpyDeviceToHost)); // compress vertices ctx.temp_storage.resize(new_num_vertices * sizeof(float3)); compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -490,22 +494,22 @@ void collapse_edges( V, reinterpret_cast(ctx.temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(ctx.temp_storage, ctx.vertices); // update faces buffer // get faces map - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, ctx.faces_map.ptr, F+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, ctx.faces_map.ptr, F+1 )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, 
sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), hipMemcpyDeviceToHost)); // compress faces ctx.temp_storage.resize(new_num_faces * sizeof(int3)); compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -515,7 +519,7 @@ void collapse_edges( F, reinterpret_cast(ctx.temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(ctx.temp_storage, ctx.faces); } @@ -526,7 +530,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); this->get_vertex_face_adjacency(); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_vertex_face_adjacency: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -535,7 +539,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd this->get_edges(); this->get_boundary_info(); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_edges: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -543,7 +547,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); get_qem(*this); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_qem: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -551,7 +555,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_edge_collapse_cost: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -559,7 +563,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); propagate_cost(*this); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "propagate_cost: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -567,7 +571,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); collapse_edges(*this, threshold); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "collapse_edges: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } diff --git a/src/simplify.hip b/src/simplify.hip new file mode 100644 index 0000000..3748bd6 --- /dev/null +++ b/src/simplify.hip @@ -0,0 +1,588 @@ +// !!! This is a file automatically generated by hipify!!! 
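+//
+// Cheapest-edge selection below packs (edge id, cost) into one uint64 --
+// float cost bits in the high word, edge index in the low word -- so a
+// single 64-bit atomicMin per face picks the lowest-cost incident edge.
+// This works because the costs are non-negative: the IEEE-754 bit patterns
+// of non-negative floats compare in the same order as the floats
+// themselves, hence the "_positive" suffix on the pack/unpack helpers.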
+#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#endif + + +namespace cumesh { + + +__device__ inline uint64_t pack_key_value_positive(int key, float value) { + unsigned int v = __float_as_uint(value); + return (static_cast(v) << 32) | + static_cast(key); +} + + +__device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, float& value) { + key = static_cast(key_value & 0xffffffffu); + value = __uint_as_float(static_cast(key_value >> 32)); +} + + +/** + * Get the QEM for each vertex + * + * @param vertices: the vertices of the mesh, shape (V) + * @param faces: the faces of the mesh, shape (F) + * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) + * @param vert2face_offset: the buffer for neighbor face ids offset, shape (V+1) + * @param V: the number of vertices + * @param F: the number of faces + * @param qems: the buffer for QEMs, shape (V) + */ +static __global__ void get_qem_kernel( + const float3* vertices, + const int3* faces, + const int* vert2face, + const int* vert2face_offset, + const int V, + const int F, + QEM* qems +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + + // compute QEM + QEM v_qem; + for (int f = vert2face_offset[tid]; f < vert2face_offset[tid+1]; f++) { + int3 f_vids = faces[vert2face[f]]; + Vec3f f_v0(vertices[f_vids.x]); + Vec3f e1(vertices[f_vids.y]); + Vec3f e2(vertices[f_vids.z]); + e1 -= f_v0; + e2 -= f_v0; + Vec3f n = e1.cross(e2); + n.normalize(); + float d = -(n.dot(f_v0)); + v_qem.add_plane({ n.x, n.y, n.z, d }); + } + qems[tid] = v_qem; +} + + +/** + * Get the QEM for each vertex + */ +void get_qem( + CuMesh& ctx +) { + size_t V = ctx.vertices.size; + size_t F = ctx.faces.size; + ctx.temp_storage.resize(V * sizeof(QEM)); + hipLaunchKernelGGL(( get_qem_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + ctx.vertices.ptr, + ctx.faces.ptr, + ctx.vert2face.ptr, + ctx.vert2face_offset.ptr, + V, F, + reinterpret_cast(ctx.temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); +} + + +inline __device__ bool process_incident_tri( + int tri_idx, + int collapse_keep_vert, // the vertex we keep (e0 or e1) + int collapse_other_vert, // the other one (the one removed) + const float3* vertices, + const int3* faces, + const Vec3f& v_new, // midpoint + float& skinny_cost, + int& num_tri +) { + const float EPS = 1e-12f; + int3 f_vids = faces[tri_idx]; + + // If this triangle contains the other vertex (the edge), it will be removed, skip it + if (f_vids.x == collapse_other_vert || f_vids.y == collapse_other_vert || f_vids.z == collapse_other_vert) + return true; // skip, not an error + + // get old positions + Vec3f a(vertices[f_vids.x]); + Vec3f b(vertices[f_vids.y]); + Vec3f c(vertices[f_vids.z]); + + // build new positions: replace occurrences of collapse_keep_vert with v_new + Vec3f na = (f_vids.x == collapse_keep_vert) ? v_new : a; + Vec3f nb = (f_vids.y == collapse_keep_vert) ? v_new : b; + Vec3f nc = (f_vids.z == collapse_keep_vert) ? 
v_new : c; + + // compute old edge vectors (for old normal) + Vec3f old_e1 = b - a; + Vec3f old_e2 = c - a; + Vec3f old_normal = old_e1.cross(old_e2); + float old_area = 0.5f * old_normal.norm(); + + // compute new edge vectors consistently: e1 = nb - na, e2 = nc - na + Vec3f new_e1 = nb - na; + Vec3f new_e2 = nc - na; + Vec3f new_normal = new_e1.cross(new_e2); + float new_area = 0.5f * new_normal.norm(); + + // check flipping + if (old_normal.dot(new_normal) < 0.0f) { + return false; // invalid (flipped) + } + + // compute side lengths squared for shape metric + Vec3f new_e0 = nc - nb; + float denom = new_e0.norm2() + new_e1.norm2() + new_e2.norm2(); + if (denom < EPS) denom = EPS; + float shapeMetric = 4.0f * sqrtf(3.0f) * new_area / denom; + float term = 1.0f - fminf(fmaxf(shapeMetric, 0.0f), 1.0f); + skinny_cost += term; + num_tri += 1; + return true; +} + + +/** + * Get the cost for each edge collapse + * + * @param vertices: the vertices of the mesh, shape (V) + * @param faces: the faces of the mesh, shape (F) + * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) + * @param vert2face_offset: the buffer for neighbor face ids offset, shape (V+1) + * @param edges: the buffer for edges, shape (E) + * @param vert_is_boundary: the buffer for boundary vertex indicator, shape (V) + * @param qems: the buffer for QEMs, shape (V) + * @param V: the number of vertices + * @param F: the number of faces + * @param E: the number of edges + * @param edge_collapse_costs: the buffer for edge collapse costs, shape (E) + */ +static __global__ void get_edge_collapse_cost_kernel( + const float3* vertices, + const int3* faces, + const int* vert2face, + const int* vert2face_offset, + const uint64_t* edges, + const uint8_t * vert_is_boundary, + const QEM* qems, + const int V, + const int F, + const int E, + const float lambda_edge_length, + const float lambda_skinny, + float* edge_collapse_costs +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t e = edges[tid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // get edge vertices + Vec3f v0(vertices[e0]); + Vec3f v1(vertices[e1]); + uint8_t v0_is_bound = vert_is_boundary[e0]; + uint8_t v1_is_bound = vert_is_boundary[e1]; + float w0 = 0.5; + if (v0_is_bound && !v1_is_bound) w0 = 1.0; + else if (!v0_is_bound && v1_is_bound) w0 = 0.0; + Vec3f v = v0 * w0 + v1 * (1.0f - w0); + + float cost = 0.0f; + + // QEM cost + QEM edge_qem = qems[e0] + qems[e1]; + float qem_cost = edge_qem.evaluate(v); + cost += qem_cost; + + // edge length cost + float edge_length2 = (v1 - v0).norm2(); + cost += lambda_edge_length * edge_length2; + + // skinny cost + float skinny_cost = 0.0f; + int num_tri = 0; + for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) { + int tri_idx = vert2face[f]; + if (!process_incident_tri(tri_idx, e0, e1, vertices, faces, v, skinny_cost, num_tri)) { + edge_collapse_costs[tid] = INFINITY; + return; + } + } + for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) { + int tri_idx = vert2face[f]; + if (!process_incident_tri(tri_idx, e1, e0, vertices, faces, v, skinny_cost, num_tri)) { + edge_collapse_costs[tid] = INFINITY; + return; + } + } + if (num_tri > 0) { + skinny_cost /= num_tri; + } + cost += lambda_skinny * skinny_cost * edge_length2; + + edge_collapse_costs[tid] = cost; +} + + +/** + * Get the cost for each edge collapse + */ +void get_edge_collapse_cost( + CuMesh& ctx, + float lambda_edge_length, + float 
+
+
+/**
+ * Get the cost for each edge collapse
+ */
+void get_edge_collapse_cost(
+    CuMesh& ctx,
+    float lambda_edge_length,
+    float lambda_skinny
+) {
+    size_t V = ctx.vertices.size;
+    size_t F = ctx.faces.size;
+    size_t E = ctx.edges.size;
+    ctx.edge_collapse_costs.resize(E);
+    hipLaunchKernelGGL(( get_edge_collapse_cost_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.vertices.ptr,
+        ctx.faces.ptr,
+        ctx.vert2face.ptr,
+        ctx.vert2face_offset.ptr,
+        ctx.edges.ptr,
+        ctx.vert_is_boundary.ptr,
+        reinterpret_cast<QEM*>(ctx.temp_storage.ptr),
+        V, F, E,
+        lambda_edge_length, lambda_skinny,
+        ctx.edge_collapse_costs.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+}
+
+
+/**
+ * Propagate cost to neighboring faces
+ *
+ * @param edges: the buffer for edges, shape (E)
+ * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
+ * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
+ * @param edge_collapse_costs: the buffer for edge collapse costs, shape (E)
+ * @param V: the number of vertices
+ * @param F: the number of faces
+ * @param E: the number of edges
+ * @param propagated_costs: the buffer for edge collapse costs propagated, shape (F)
+ */
+static __global__ void propagate_cost_kernel(
+    const uint64_t* edges,
+    const int* vert2face,
+    const int* vert2face_offset,
+    const float* edge_collapse_costs,
+    const int V,
+    const int F,
+    const int E,
+    uint64_t* propagated_costs
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= E) return;
+
+    // get edge
+    uint64_t e = edges[tid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+
+    uint64_t cost = pack_key_value_positive(tid, edge_collapse_costs[tid]);
+
+    // propagate cost to neighboring faces
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        atomicMin(reinterpret_cast<unsigned long long*>(&propagated_costs[vert2face[f]]), static_cast<unsigned long long>(cost));
+    }
+    for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) {
+        atomicMin(reinterpret_cast<unsigned long long*>(&propagated_costs[vert2face[f]]), static_cast<unsigned long long>(cost));
+    }
+}
+
+
+/**
+ * Propagate cost to neighboring faces
+ */
+void propagate_cost(
+    CuMesh& ctx
+) {
+    size_t V = ctx.vertices.size;
+    size_t F = ctx.faces.size;
+    size_t E = ctx.edges.size;
+    ctx.propagated_costs.resize(F);
+    ctx.propagated_costs.fill(std::numeric_limits<uint64_t>::max());
+    hipLaunchKernelGGL(( propagate_cost_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.edges.ptr,
+        ctx.vert2face.ptr,
+        ctx.vert2face_offset.ptr,
+        ctx.edge_collapse_costs.ptr,
+        V, F, E,
+        ctx.propagated_costs.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+}
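+
+
+// Why pack-then-atomicMin works (illustrative): for finite non-negative floats
+// the IEEE-754 bit pattern is monotonically increasing, so comparing the packed
+// integers (value << 32 | key) compares costs first and breaks ties by edge id.
+// E.g. __float_as_uint(0.25f) = 0x3e800000 < __float_as_uint(0.5f) = 0x3f000000,
+// hence pack(7, 0.25f) < pack(3, 0.5f). Each face therefore ends up holding the
+// id of its cheapest incident edge, which makes the parallel collapses below
+// conflict-free.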
+
+
+/**
+ * Collapse edges in parallel
+ *
+ * @param vertices: the vertices of the mesh, shape (V)
+ * @param faces: the faces of the mesh, shape (F)
+ * @param edges: the buffer for edges, shape (E)
+ * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
+ * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
+ * @param edge_collapse_costs: the buffer for edge collapse costs, shape (E)
+ * @param propagated_costs: the buffer for edge collapse costs propagated, shape (F)
+ * @param vert_is_boundary: the buffer for boundary vertex indicator, shape (V)
+ * @param V: the number of vertices
+ * @param F: the number of faces
+ * @param E: the number of edges
+ * @param collapse_thresh: the threshold for cost collapse
+ * @param vertices_kept: the flag for vertices kept, shape (V)
+ * @param faces_kept: the flag for faces kept, shape (F)
+ */
+static __global__ void collapse_edges_kernel(
+    float3* vertices,
+    int3* faces,
+    uint64_t* edges,
+    const int* vert2face,
+    const int* vert2face_offset,
+    const float* edge_collapse_costs,
+    const uint64_t* propagated_costs,
+    const uint8_t* vert_is_boundary,
+    const int V,
+    const int F,
+    const int E,
+    const float collapse_thresh,
+    int* vertices_kept,
+    int* faces_kept
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= E) return;
+
+    float cost = edge_collapse_costs[tid];
+    if (cost > collapse_thresh) return;
+
+    // get edge
+    uint64_t e = edges[tid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+    uint64_t pack = pack_key_value_positive(tid, cost);
+
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        if (propagated_costs[vert2face[f]] != pack) return;
+    }
+    for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) {
+        if (propagated_costs[vert2face[f]] != pack) return;
+    }
+
+    // collapse edge
+    Vec3f v0(vertices[e0]);
+    Vec3f v1(vertices[e1]);
+    uint8_t v0_is_bound = vert_is_boundary[e0];
+    uint8_t v1_is_bound = vert_is_boundary[e1];
+    float w0 = 0.5f;
+    if (v0_is_bound && !v1_is_bound) w0 = 1.0f;
+    else if (!v0_is_bound && v1_is_bound) w0 = 0.0f;
+    Vec3f v_new = v0 * w0 + v1 * (1.0f - w0);
+    vertices[e0] = { v_new.x, v_new.y, v_new.z };
+    vertices_kept[e1] = 0;
+    // delete shared faces
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        int fid = vert2face[f];
+        int3 f_vids = faces[fid];
+        if (f_vids.x == e1 || f_vids.y == e1 || f_vids.z == e1) {
+            faces_kept[fid] = 0;
+        }
+    }
+    // update faces
+    for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) {
+        int fid = vert2face[f];
+        int3 f_vids = faces[fid];
+        if (f_vids.x == e1) {
+            f_vids.x = e0;
+        } else if (f_vids.y == e1) {
+            f_vids.y = e0;
+        } else if (f_vids.z == e1) {
+            f_vids.z = e0;
+        }
+        faces[fid] = f_vids;
+    }
+}
+
+
+static __global__ void compress_vertices_kernel(
+    const int* vertices_map,
+    const float3* old_vertices,
+    const int V,
+    float3* new_vertices
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= V) return;
+    int new_id = vertices_map[tid];
+    int is_kept = vertices_map[tid + 1] == new_id + 1;
+    if (is_kept) {
+        new_vertices[new_id] = old_vertices[tid];
+    }
+}
+
+
+static __global__ void compress_faces_kernel(
+    const int* faces_map,
+    const int* vertices_map,
+    const int3* old_faces,
+    const int F,
+    int3* new_faces
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+    int new_id = faces_map[tid];
+    int is_kept = faces_map[tid + 1] == new_id + 1;
+    if (is_kept) {
+        new_faces[new_id].x = vertices_map[old_faces[tid].x];
+        new_faces[new_id].y = vertices_map[old_faces[tid].y];
+        new_faces[new_id].z = vertices_map[old_faces[tid].z];
+    }
+}
+
+
+/**
+ * Collapse edges in parallel
+ */
+void collapse_edges(
+    CuMesh& ctx,
+    float collapse_thresh
+) {
+    size_t V = ctx.vertices.size;
+    size_t F = ctx.faces.size;
+    size_t E = ctx.edges.size;
+    ctx.vertices_map.resize(V + 1);
+    ctx.faces_map.resize(F + 1);
+    ctx.vertices_map.fill(1);
+    ctx.faces_map.fill(1);
+    hipLaunchKernelGGL(( collapse_edges_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.vertices.ptr,
+        ctx.faces.ptr,
+        ctx.edges.ptr,
+        ctx.vert2face.ptr,
+        ctx.vert2face_offset.ptr,
+        ctx.edge_collapse_costs.ptr,
+        ctx.propagated_costs.ptr,
+        ctx.vert_is_boundary.ptr,
+        V, F, E,
+        collapse_thresh,
+        ctx.vertices_map.ptr,
+        ctx.faces_map.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+
+    // update vertices buffer
+    // get vertices map
+    size_t temp_storage_bytes = 0;
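+    // Compaction sketch (illustrative): vertices_map holds keep flags such as
+    // [1,0,1,1] plus one padding slot, so [1,0,1,1,1]; after ExclusiveSum it
+    // becomes [0,1,1,2,3], so kept vertex i moves to slot vertices_map[i] and
+    // the last element is the new vertex count.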
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        nullptr, temp_storage_bytes,
+        ctx.vertices_map.ptr, V+1
+    ));
+    ctx.cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        ctx.cub_temp_storage.ptr, temp_storage_bytes,
+        ctx.vertices_map.ptr, V+1
+    ));
+    int new_num_vertices;
+    CUDA_CHECK(hipMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), hipMemcpyDeviceToHost));
+    // compress vertices
+    ctx.temp_storage.resize(new_num_vertices * sizeof(float3));
+    hipLaunchKernelGGL(( compress_vertices_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.vertices_map.ptr,
+        ctx.vertices.ptr,
+        V,
+        reinterpret_cast<float3*>(ctx.temp_storage.ptr)
+    );
+    CUDA_CHECK(hipGetLastError());
+    swap_buffers(ctx.temp_storage, ctx.vertices);
+
+    // update faces buffer
+    // get faces map
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        nullptr, temp_storage_bytes,
+        ctx.faces_map.ptr, F+1
+    ));
+    ctx.cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        ctx.cub_temp_storage.ptr, temp_storage_bytes,
+        ctx.faces_map.ptr, F+1
+    ));
+    int new_num_faces;
+    CUDA_CHECK(hipMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), hipMemcpyDeviceToHost));
+    // compress faces
+    ctx.temp_storage.resize(new_num_faces * sizeof(int3));
+    hipLaunchKernelGGL(( compress_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.faces_map.ptr,
+        ctx.vertices_map.ptr,
+        ctx.faces.ptr,
+        F,
+        reinterpret_cast<int3*>(ctx.temp_storage.ptr)
+    );
+    CUDA_CHECK(hipGetLastError());
+    swap_buffers(ctx.temp_storage, ctx.faces);
+}
+
+
+std::tuple<size_t, size_t> CuMesh::simplify_step(float lambda_edge_length, float lambda_skinny, float threshold, bool timing) {
+    std::chrono::high_resolution_clock::time_point start, end;
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    this->get_vertex_face_adjacency();
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_vertex_face_adjacency: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    this->get_edges();
+    this->get_boundary_info();
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    get_qem(*this);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_qem: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_edge_collapse_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    propagate_cost(*this);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "propagate_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    collapse_edges(*this, threshold);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "collapse_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    // Delete all cached info since mesh has changed
+    this->clear_cache();
+
+    return std::make_tuple(this->vertices.size, this->faces.size);
+}
+
+
+} // namespace cumesh
diff --git a/src/utils.h b/src/utils.h
index f15823b..8757bba 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -1,21 +1,25 @@
 #pragma once
 
 #include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 #include <vector>
 
 #define CUDA_CHECK(call) \
 do { \
-    const cudaError_t error_code = call; \
-    if (error_code != cudaSuccess) { \
+    const hipError_t error_code = call; \
+    if (error_code != hipSuccess) { \
         TORCH_CHECK(false, \
-            "[CuMesh] CUDA error:\n", \
+            "[CuMesh] HIP error:\n", \
             " File: ", __FILE__, "\n", \
             " Line: ", __LINE__, "\n", \
             " Error code: ", error_code, "\n", \
             " Error text: ", \
-            cudaGetErrorString(error_code), "\n"); \
+            hipGetErrorString(error_code), "\n"); \
     } \
 } while (0)
@@ -39,11 +43,11 @@ struct Buffer {
 
     void init(size_t capacity) {
         this->capacity = capacity;
-        CUDA_CHECK(cudaMalloc(&ptr, capacity * sizeof(T)));
+        CUDA_CHECK(hipMalloc(&ptr, capacity * sizeof(T)));
     }
 
     void free() {
-        if (ptr != nullptr) CUDA_CHECK(cudaFree(ptr));
+        if (ptr != nullptr) CUDA_CHECK(hipFree(ptr));
         ptr = nullptr;
         size = 0;
         capacity = 0;
@@ -61,9 +65,9 @@ struct Buffer {
         size_t new_size = size + this->size;
         if (new_size > capacity) {
             T* new_ptr;
-            CUDA_CHECK(cudaMalloc(&new_ptr, new_size * sizeof(T)));
-            CUDA_CHECK(cudaMemcpy(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice));
-            CUDA_CHECK(cudaFree(ptr));
+            CUDA_CHECK(hipMalloc(&new_ptr, new_size * sizeof(T)));
+            CUDA_CHECK(hipMemcpy(new_ptr, ptr, this->size * sizeof(T), hipMemcpyDeviceToDevice));
+            CUDA_CHECK(hipFree(ptr));
             ptr = new_ptr;
             this->capacity = new_size;
         }
@@ -71,12 +75,12 @@ struct Buffer {
     }
 
     void zero() {
-        CUDA_CHECK(cudaMemset(ptr, 0, size * sizeof(T)));
+        CUDA_CHECK(hipMemset(ptr, 0, size * sizeof(T)));
     }
 
     void fill(T val) {
         std::vector<T> tmp(size, val);
-        CUDA_CHECK(cudaMemcpy(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice));
+        CUDA_CHECK(hipMemcpy(ptr, tmp.data(), size * sizeof(T), hipMemcpyHostToDevice));
     }
 };
 
diff --git a/src/utils_hip.h b/src/utils_hip.h
new file mode 100644
index 0000000..e5ed6b8
--- /dev/null
+++ b/src/utils_hip.h
@@ -0,0 +1,110 @@
+// !!! This is a file automatically generated by hipify!!!
+#pragma once
+
+#include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#include <vector>
+
+#define CUDA_CHECK(call) \
+do { \
+    const hipError_t error_code = call; \
+    if (error_code != hipSuccess) { \
+        TORCH_CHECK(false, \
+            "[CuMesh] HIP error:\n", \
+            " File: ", __FILE__, "\n", \
+            " Line: ", __LINE__, "\n", \
+            " Error code: ", error_code, "\n", \
+            " Error text: ", \
+            hipGetErrorString(error_code), "\n"); \
+    } \
+} while (0)
+
+namespace cumesh {
+
+
+/**
+ * A GPU buffer class that manages device memory.
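+ *
+ * Note (added for clarity): resize() only reallocates when the requested size
+ * exceeds the current capacity, and reallocation does not preserve contents;
+ * extend() is the growth path that copies the old data over.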
+ */
+template <typename T>
+struct Buffer {
+    T* ptr;
+    size_t size;
+    size_t capacity;
+
+    Buffer() : ptr(nullptr), size(0), capacity(0) {}
+
+    bool is_empty() const {
+        return size == 0;
+    }
+
+    void init(size_t capacity) {
+        this->capacity = capacity;
+        CUDA_CHECK(hipMalloc(&ptr, capacity * sizeof(T)));
+    }
+
+    void free() {
+        if (ptr != nullptr) CUDA_CHECK(hipFree(ptr));
+        ptr = nullptr;
+        size = 0;
+        capacity = 0;
+    }
+
+    void resize(size_t size) {
+        if (size > capacity) {
+            free();
+            init(size);
+        }
+        this->size = size;
+    }
+
+    void extend(size_t size) {
+        size_t new_size = size + this->size;
+        if (new_size > capacity) {
+            T* new_ptr;
+            CUDA_CHECK(hipMalloc(&new_ptr, new_size * sizeof(T)));
+            CUDA_CHECK(hipMemcpy(new_ptr, ptr, this->size * sizeof(T), hipMemcpyDeviceToDevice));
+            CUDA_CHECK(hipFree(ptr));
+            ptr = new_ptr;
+            this->capacity = new_size;
+        }
+        this->size = new_size;
+    }
+
+    void zero() {
+        CUDA_CHECK(hipMemset(ptr, 0, size * sizeof(T)));
+    }
+
+    void fill(T val) {
+        std::vector<T> tmp(size, val);
+        CUDA_CHECK(hipMemcpy(ptr, tmp.data(), size * sizeof(T), hipMemcpyHostToDevice));
+    }
+};
+
+
+/**
+ * Swap the contents of two buffers.
+ */
+template <typename T1, typename T2>
+void swap_buffers(Buffer<T1>& b1, Buffer<T2>& b2) {
+    void* b1_ptr = reinterpret_cast<void*>(b1.ptr);
+    void* b2_ptr = reinterpret_cast<void*>(b2.ptr);
+    size_t b1_capacity_bytes = b1.capacity * sizeof(T1);
+    size_t b2_capacity_bytes = b2.capacity * sizeof(T2);
+    size_t b1_size_bytes = b1.size * sizeof(T1);
+    size_t b2_size_bytes = b2.size * sizeof(T2);
+
+    b1.ptr = reinterpret_cast<T1*>(b2_ptr);
+    b2.ptr = reinterpret_cast<T2*>(b1_ptr);
+    b1.capacity = b2_capacity_bytes / sizeof(T1);
+    b2.capacity = b1_capacity_bytes / sizeof(T2);
+    b1.size = b2_size_bytes / sizeof(T1);
+    b2.size = b1_size_bytes / sizeof(T2);
+}
+
+
+} // namespace cumesh