From 2dad46c836864da45a61bf41a986bc6587905106 Mon Sep 17 00:00:00 2001 From: Y-L-LIU Date: Mon, 29 Dec 2025 09:58:33 +0000 Subject: [PATCH 1/2] fix unsupported operations --- src/clean_up.cu | 159 +++++++++++++++++++++++----------------- src/connectivity.cu | 4 +- src/remesh/svox2vert.cu | 4 +- src/shared.h | 4 +- src/simplify.cu | 8 +- 5 files changed, 100 insertions(+), 79 deletions(-) diff --git a/src/clean_up.cu b/src/clean_up.cu index 3c12cd7..a250dc0 100644 --- a/src/clean_up.cu +++ b/src/clean_up.cu @@ -2,10 +2,56 @@ #include "dtypes.cuh" #include "shared.h" #include - +#include +#include +#include +#include namespace cumesh { + // Marks faces as 1 (keep) or 0 (remove) by comparing adjacent sorted faces +__global__ void mark_duplicates_from_indices_kernel( + const int* sorted_indices, + const int3* faces, + uint8_t* mask_original, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + int current_original_idx = sorted_indices[idx]; + + // The first element in the sorted list is always unique + if (idx == 0) { + mask_original[current_original_idx] = 1; + return; + } + + // Compare with the previous element in the sorted list + int prev_original_idx = sorted_indices[idx - 1]; + + int3 curr_f = faces[current_original_idx]; + int3 prev_f = faces[prev_original_idx]; + + // If identical to previous, it's a duplicate -> mark 0 (remove) + // Otherwise -> mark 1 (keep) + bool is_duplicate = (curr_f.x == prev_f.x && curr_f.y == prev_f.y && curr_f.z == prev_f.z); + mask_original[current_original_idx] = is_duplicate ? 0 : 1; +} + +// Comparator for Thrust to sort indices based on face values +struct FaceComparator { + const int3* faces; + FaceComparator(const int3* f) : faces(f) {} + + __device__ bool operator()(int i, int j) const { + const int3& a = faces[i]; + const int3& b = faces[j]; + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return a.z < b.z; + } +}; static __global__ void copy_vec3f_to_float3_kernel( const Vec3f* vec3f, @@ -152,12 +198,12 @@ void CuMesh::remove_unreferenced_vertices() { size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced,cu_vertex_is_referenced, V+1 )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced,cu_vertex_is_referenced, V+1 )); int new_num_vertices; CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost)); @@ -226,12 +272,14 @@ static __global__ void select_first_in_each_group_kernel( } } - -struct int3_decomposer +struct int3_comparator { - __host__ __device__ ::cuda::std::tuple operator()(int3& key) const + __host__ __device__ bool operator()(const int3& a, const int3& b) const { - return {key.x, key.y, key.z}; + // Lexicographical comparison: check x, then y, then z + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return a.z < b.z; } }; @@ -239,81 +287,54 @@ struct int3_decomposer void CuMesh::remove_duplicate_faces() { size_t F = this->faces.size; - // Create a temporary sorted copy of faces for duplicate detection - // Do NOT modify the original faces to preserve vertex order and normals + // 1. Create a temporary copy of faces for canonicalization int3 *cu_sorted_faces; CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3))); CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice)); - // Sort vertices within each face (in the temporary copy) - sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( - cu_sorted_faces, - F - ); + // 2. Sort vertices within each face (canonical form) + // (This ensures that face [0,1,2] is treated same as [2,0,1]) + sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_faces, F); CUDA_CHECK(cudaGetLastError()); - // Sort all faces globally by their sorted vertex indices - size_t temp_storage_bytes = 0; + // 3. Create indices [0, 1, 2, ... F-1] int *cu_sorted_face_indices; CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); - arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F); - CUDA_CHECK(cudaGetLastError()); - - int *cu_sorted_indices_output; - int3 *cu_sorted_faces_output; - CUDA_CHECK(cudaMalloc(&cu_sorted_indices_output, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_sorted_faces_output, F * sizeof(int3))); - - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( - nullptr, temp_storage_bytes, - cu_sorted_faces, cu_sorted_faces_output, - cu_sorted_face_indices, cu_sorted_indices_output, - F, - int3_decomposer{} - )); - this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( - this->cub_temp_storage.ptr, temp_storage_bytes, - cu_sorted_faces, cu_sorted_faces_output, - cu_sorted_face_indices, cu_sorted_indices_output, - F, - int3_decomposer{} - )); - CUDA_CHECK(cudaFree(cu_sorted_faces)); - CUDA_CHECK(cudaFree(cu_sorted_face_indices)); - - // Select first in each group of duplicate faces (based on sorted faces) - uint8_t* cu_face_mask_sorted; - CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); - select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( - cu_sorted_faces_output, - F, - cu_face_mask_sorted - ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_faces_output)); - - // Map the mask back to original face order using scatter - // scatter: output[indices[i]] = values[i] - // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] + + // Use Thrust to generate sequence + thrust::sequence(thrust::device, + thrust::device_pointer_cast(cu_sorted_face_indices), + thrust::device_pointer_cast(cu_sorted_face_indices + F)); + + // 4. Sort the INDICES based on the face values using Thrust + // This groups identical faces together in the index list + thrust::sort(thrust::device, + thrust::device_pointer_cast(cu_sorted_face_indices), + thrust::device_pointer_cast(cu_sorted_face_indices + F), + FaceComparator(cu_sorted_faces)); + + // 5. Mark duplicates + // We traverse the sorted indices. If neighbors are equal, mark original index for removal. uint8_t* cu_face_mask_original; CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); - scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( - cu_sorted_indices_output, // indices: sorted_position -> original_idx - cu_face_mask_sorted, // values: mask at sorted_position - F, - cu_face_mask_original // output: mask at original position + + mark_duplicates_from_indices_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + cu_sorted_face_indices, + cu_sorted_faces, + cu_face_mask_original, + (int)F ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_face_mask_sorted)); - CUDA_CHECK(cudaFree(cu_sorted_indices_output)); - // Select faces to keep (preserving original vertex order) + // 6. Cleanup temporary memory + CUDA_CHECK(cudaFree(cu_sorted_faces)); + CUDA_CHECK(cudaFree(cu_sorted_face_indices)); + + // 7. Remove faces using the generated mask this->_remove_faces(cu_face_mask_original); CUDA_CHECK(cudaFree(cu_face_mask_original)); } - static __global__ void mark_degenerate_faces_kernel( const float3* vertices, const int3* faces, @@ -542,13 +563,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) { temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, - cu_loop_bound_loop_ids, + cu_loop_bound_loop_ids,cu_loop_bound_loop_ids, E )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_loop_bound_loop_ids, + cu_loop_bound_loop_ids,cu_loop_bound_loop_ids, E )); @@ -614,13 +635,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) { temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, - cu_new_loop_bound_loop_ids, + cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids, new_num_loop_boundaries )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_new_loop_bound_loop_ids, + cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids, new_num_loop_boundaries )); diff --git a/src/connectivity.cu b/src/connectivity.cu index 6e2f5fe..c061163 100644 --- a/src/connectivity.cu +++ b/src/connectivity.cu @@ -1074,13 +1074,13 @@ void CuMesh::get_boundary_loops() { temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 )); } diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu index 43123d3..ae57430 100644 --- a/src/remesh/svox2vert.cu +++ b/src/remesh/svox2vert.cu @@ -175,10 +175,10 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Compute the offset size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices,num_vertices, M + 1); void* d_temp_storage = nullptr; CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices,num_vertices, M + 1); CUDA_CHECK(cudaFree(d_temp_storage)); int total_vertices; CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost)); diff --git a/src/shared.h b/src/shared.h index 66ecac7..039eb88 100644 --- a/src/shared.h +++ b/src/shared.h @@ -215,13 +215,13 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - cu_new_ids, + cu_new_ids,cu_new_ids, N )); cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( cub_temp_storage.ptr, temp_storage_bytes, - cu_new_ids, + cu_new_ids,cu_new_ids, N )); diff --git a/src/simplify.cu b/src/simplify.cu index 9efde9e..b83fc38 100644 --- a/src/simplify.cu +++ b/src/simplify.cu @@ -473,12 +473,12 @@ void collapse_edges( size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - ctx.vertices_map.ptr, V+1 + ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, - ctx.vertices_map.ptr, V+1 + ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1 )); int new_num_vertices; CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost)); @@ -497,12 +497,12 @@ void collapse_edges( // get faces map CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - ctx.faces_map.ptr, F+1 + ctx.faces_map.ptr,ctx.faces_map.ptr,F+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, - ctx.faces_map.ptr, F+1 + ctx.faces_map.ptr,ctx.faces_map.ptr, F+1 )); int new_num_faces; CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost)); From e87dcd2d55ca6a59b2d63f136908f1ef32708a2f Mon Sep 17 00:00:00 2001 From: Y-L-LIU Date: Tue, 10 Feb 2026 20:18:51 +0800 Subject: [PATCH 2/2] Isolate Thrust duplicate-face path for CUDA < 12.4 --- src/clean_up.cu | 107 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 80 insertions(+), 27 deletions(-) diff --git a/src/clean_up.cu b/src/clean_up.cu index a250dc0..7deb921 100644 --- a/src/clean_up.cu +++ b/src/clean_up.cu @@ -2,14 +2,16 @@ #include "dtypes.cuh" #include "shared.h" #include -#include +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040) #include #include #include +#endif namespace cumesh { - // Marks faces as 1 (keep) or 0 (remove) by comparing adjacent sorted faces +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040) +// Marks faces as 1 (keep) or 0 (remove) by comparing adjacent sorted faces __global__ void mark_duplicates_from_indices_kernel( const int* sorted_indices, const int3* faces, @@ -52,6 +54,7 @@ struct FaceComparator { return a.z < b.z; } }; +#endif static __global__ void copy_vec3f_to_float3_kernel( const Vec3f* vec3f, @@ -272,14 +275,11 @@ static __global__ void select_first_in_each_group_kernel( } } -struct int3_comparator +struct int3_decomposer { - __host__ __device__ bool operator()(const int3& a, const int3& b) const + __host__ __device__ ::cuda::std::tuple operator()(int3& key) const { - // Lexicographical comparison: check x, then y, then z - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - return a.z < b.z; + return {key.x, key.y, key.z}; } }; @@ -287,34 +287,33 @@ struct int3_comparator void CuMesh::remove_duplicate_faces() { size_t F = this->faces.size; - // 1. Create a temporary copy of faces for canonicalization + // Create a temporary sorted copy of faces for duplicate detection + // Do NOT modify the original faces to preserve vertex order and normals int3 *cu_sorted_faces; CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3))); CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice)); - // 2. Sort vertices within each face (canonical form) - // (This ensures that face [0,1,2] is treated same as [2,0,1]) - sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_faces, F); + // Sort vertices within each face (in the temporary copy) + sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + cu_sorted_faces, + F + ); CUDA_CHECK(cudaGetLastError()); - // 3. Create indices [0, 1, 2, ... F-1] +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040) + // CUDA < 12.4: use Thrust implementation int *cu_sorted_face_indices; CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); - - // Use Thrust to generate sequence - thrust::sequence(thrust::device, - thrust::device_pointer_cast(cu_sorted_face_indices), + + thrust::sequence(thrust::device, + thrust::device_pointer_cast(cu_sorted_face_indices), thrust::device_pointer_cast(cu_sorted_face_indices + F)); - // 4. Sort the INDICES based on the face values using Thrust - // This groups identical faces together in the index list - thrust::sort(thrust::device, - thrust::device_pointer_cast(cu_sorted_face_indices), - thrust::device_pointer_cast(cu_sorted_face_indices + F), + thrust::sort(thrust::device, + thrust::device_pointer_cast(cu_sorted_face_indices), + thrust::device_pointer_cast(cu_sorted_face_indices + F), FaceComparator(cu_sorted_faces)); - // 5. Mark duplicates - // We traverse the sorted indices. If neighbors are equal, mark original index for removal. uint8_t* cu_face_mask_original; CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); @@ -326,13 +325,67 @@ void CuMesh::remove_duplicate_faces() { ); CUDA_CHECK(cudaGetLastError()); - // 6. Cleanup temporary memory CUDA_CHECK(cudaFree(cu_sorted_faces)); CUDA_CHECK(cudaFree(cu_sorted_face_indices)); - // 7. Remove faces using the generated mask this->_remove_faces(cu_face_mask_original); CUDA_CHECK(cudaFree(cu_face_mask_original)); +#else + // CUDA >= 12.4: keep existing CUB behavior + size_t temp_storage_bytes = 0; + int *cu_sorted_face_indices; + CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); + arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F); + CUDA_CHECK(cudaGetLastError()); + + int *cu_sorted_indices_output; + int3 *cu_sorted_faces_output; + CUDA_CHECK(cudaMalloc(&cu_sorted_indices_output, F * sizeof(int))); + CUDA_CHECK(cudaMalloc(&cu_sorted_faces_output, F * sizeof(int3))); + + CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + cu_sorted_faces, cu_sorted_faces_output, + cu_sorted_face_indices, cu_sorted_indices_output, + F, + int3_decomposer{} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_faces, cu_sorted_faces_output, + cu_sorted_face_indices, cu_sorted_indices_output, + F, + int3_decomposer{} + )); + CUDA_CHECK(cudaFree(cu_sorted_faces)); + CUDA_CHECK(cudaFree(cu_sorted_face_indices)); + + uint8_t* cu_face_mask_sorted; + CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); + select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + cu_sorted_faces_output, + F, + cu_face_mask_sorted + ); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaFree(cu_sorted_faces_output)); + + uint8_t* cu_face_mask_original; + CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); + scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + cu_sorted_indices_output, + cu_face_mask_sorted, + F, + cu_face_mask_original + ); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaFree(cu_face_mask_sorted)); + CUDA_CHECK(cudaFree(cu_sorted_indices_output)); + + this->_remove_faces(cu_face_mask_original); + CUDA_CHECK(cudaFree(cu_face_mask_original)); +#endif } static __global__ void mark_degenerate_faces_kernel( @@ -1237,4 +1290,4 @@ void CuMesh::unify_face_orientations() { } -} // namespace cumesh \ No newline at end of file +} // namespace cumesh