diff --git a/src/atlas.cu b/src/atlas.cu index 2d78d8c..d48031f 100644 --- a/src/atlas.cu +++ b/src/atlas.cu @@ -2,6 +2,7 @@ #include "dtypes.cuh" #include "shared.h" #include +#include namespace cumesh { @@ -39,7 +40,7 @@ __device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, f // ) { // const int tid = blockIdx.x * blockDim.x + threadIdx.x; // if (tid >= F) return; -// +// // float3 n = face_normals[tid]; // chart_normal_cones[tid] = make_float4(n.x, n.y, n.z, 0.0f); // half angle = 0 // } @@ -59,7 +60,7 @@ static __global__ void init_chart_adj_kernel( int f0 = face_adj[tid].x; int f1 = face_adj[tid].y; - + int c0 = chart_ids[f0]; int c1 = chart_ids[f1]; @@ -77,7 +78,7 @@ static __global__ void init_chart_adj_kernel( int3 tri1 = faces[f1]; int t0_indices[3] = {tri0.x, tri0.y, tri0.z}; - int common_v_indices[2]; + int common_v_indices[2]; int found_count = 0; #pragma unroll @@ -277,7 +278,8 @@ static __global__ void collapse_edges_kernel( static void get_chart_connectivity( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t M = mesh.manifold_face_adj.size; @@ -289,7 +291,7 @@ static void get_chart_connectivity( CUDA_CHECK(cudaMalloc(&cu_raw_lengths, M * sizeof(float))); CUDA_CHECK(cudaMalloc(&cu_sorted_lengths, M * sizeof(float))); - init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.vertices.ptr, mesh.faces.ptr, mesh.manifold_face_adj.ptr, @@ -309,7 +311,7 @@ static void get_chart_connectivity( reinterpret_cast(mesh.temp_storage.ptr), cu_raw_lengths, cu_sorted_lengths, - M + M, 0, sizeof(uint64_t) * 8, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( @@ -318,16 +320,15 @@ static void get_chart_connectivity( reinterpret_cast(mesh.temp_storage.ptr), cu_raw_lengths, cu_sorted_lengths, - M + M, 0, sizeof(uint64_t) * 8, stream )); - CUDA_CHECK(cudaFree(cu_raw_lengths)); - + #if CUDART_VERSION >= 12090 auto reduce_op = ::cuda::std::plus(); #else auto reduce_op = cub::Sum(); #endif - + // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths) int* cu_num_chart_adjs; @@ -340,8 +341,8 @@ static void get_chart_connectivity( cu_sorted_lengths, mesh.atlas_chart_adj_length.ptr, cu_num_chart_adjs, - reduce_op, - M + reduce_op, + M, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceReduce::ReduceByKey( @@ -351,19 +352,22 @@ static void get_chart_connectivity( cu_sorted_lengths, mesh.atlas_chart_adj_length.ptr, cu_num_chart_adjs, - reduce_op, - M + reduce_op, + M, stream )); - CUDA_CHECK(cudaMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); mesh.atlas_chart_adj_length.size = mesh.atlas_chart_adj.size; + CUDA_CHECK(cudaFree(cu_raw_lengths)); CUDA_CHECK(cudaFree(cu_sorted_lengths)); CUDA_CHECK(cudaFree(cu_num_chart_adjs)); // Remove invalid edge (UINT64_MAX) if present // Since we sorted, invalid edges are at the end. uint64_t last_key; if (mesh.atlas_chart_adj.size > 0) { - CUDA_CHECK(cudaMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - if (last_key == UINT64_MAX) { + CUDA_CHECK(cudaMemcpyAsync(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + if (last_key == UINT64_MAX) { mesh.atlas_chart_adj.size -= 1; mesh.atlas_chart_adj_length.size -= 1; } @@ -381,7 +385,7 @@ static void get_chart_connectivity( mesh.atlas_chart2edge_cnt.zero(); mesh.atlas_chart_perims.resize(C); mesh.atlas_chart_perims.zero(); - get_chart_edge_cnt_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_chart_edge_cnt_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.atlas_chart_adj.ptr, mesh.atlas_chart_adj_length.ptr, E, @@ -396,19 +400,19 @@ static void get_chart_connectivity( nullptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, - C + 1 + C + 1, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, - C + 1 + C + 1, stream )); // 2.3 Fill CSR format for chart-edge connectivity mesh.atlas_chart2edge.resize(2 * E); // each edge connects two charts mesh.atlas_chart2edge_cnt.zero(); - get_chart_edge_adjacency_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_chart_edge_adjacency_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.atlas_chart_adj.ptr, E, mesh.atlas_chart2edge.ptr, @@ -480,7 +484,8 @@ static __global__ void update_normal_cones_kernel( void compute_chart_normal_cones( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t C = mesh.atlas_num_charts; size_t F = mesh.faces.size; @@ -492,7 +497,7 @@ void compute_chart_normal_cones( CUDA_CHECK(cudaMalloc(&sorted_chart_ids, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&faces_ids, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&argsorted_faces_ids, F * sizeof(int))); - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( faces_ids, F ); @@ -502,17 +507,18 @@ void compute_chart_normal_cones( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, - F + F, 0, sizeof(int) * 8, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, - F + F, 0, sizeof(int) * 8, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(faces_ids)); - + // 2. Get CSR format for chart-face assignment int* cu_chart_size; int* cu_num_charts; @@ -523,14 +529,15 @@ void compute_chart_normal_cones( CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, - F + F, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, - F + F, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_charts)); CUDA_CHECK(cudaFree(cu_unique_chart_ids)); @@ -540,20 +547,21 @@ void compute_chart_normal_cones( CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, - C + 1 + C + 1, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, - C + 1 + C + 1, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_chart_size)); // 3. Compute chart normals and areas float* cu_sorted_face_areas; CUDA_CHECK(cudaMalloc(&cu_sorted_face_areas, F * sizeof(float))); - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.face_areas.ptr, argsorted_faces_ids, F, @@ -565,26 +573,30 @@ void compute_chart_normal_cones( nullptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_sorted_face_areas)); float3* cu_sorted_face_normals; CUDA_CHECK(cudaMalloc(&cu_sorted_face_normals, F * sizeof(float3))); - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.face_normals.ptr, argsorted_faces_ids, F, cu_sorted_face_normals ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(argsorted_faces_ids)); float3* cu_chart_normals; CUDA_CHECK(cudaMalloc(&cu_chart_normals, C * sizeof(float3))); @@ -594,7 +606,8 @@ void compute_chart_normal_cones( C, cu_chart_offsets, cu_chart_offsets + 1, Float3Add(), - make_float3(0.0f, 0.0f, 0.0f) + make_float3(0.0f, 0.0f, 0.0f), + stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Reduce( @@ -603,9 +616,10 @@ void compute_chart_normal_cones( C, cu_chart_offsets, cu_chart_offsets + 1, Float3Add(), - make_float3(0.0f, 0.0f, 0.0f) + make_float3(0.0f, 0.0f, 0.0f), + stream )); - normalize_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + normalize_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_chart_normals, C ); @@ -614,7 +628,7 @@ void compute_chart_normal_cones( // 4. Compute normal difference float* cu_normal_diff; CUDA_CHECK(cudaMalloc(&cu_normal_diff, F * sizeof(float))); - normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_chart_normals, cu_sorted_face_normals, sorted_chart_ids, @@ -622,6 +636,7 @@ void compute_chart_normal_cones( cu_normal_diff ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_sorted_face_normals)); CUDA_CHECK(cudaFree(sorted_chart_ids)); @@ -633,27 +648,31 @@ void compute_chart_normal_cones( nullptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Max( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_chart_offsets)); CUDA_CHECK(cudaFree(cu_normal_diff)); // 6. Update chart normal cones mesh.atlas_chart_normal_cones.resize(C); - update_normal_cones_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + update_normal_cones_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.atlas_chart_normal_cones.ptr, cu_chart_normals, cu_new_cone_half_angles, C ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_chart_normals)); CUDA_CHECK(cudaFree(cu_new_cone_half_angles)); } @@ -677,7 +696,7 @@ static __global__ void refine_charts_kernel( // 1. Load current face data int current_c = chart_ids[fid]; - Vec3f n(face_normals[fid]); + Vec3f n(face_normals[fid]); // local register cache for candidate list (triangle has at most 3 neighbors, plus self, max 4 candidates) int candidates[4]; @@ -691,7 +710,7 @@ static __global__ void refine_charts_kernel( // 2. Iterate over 3 edges to aggregate smooth scores int eids[3] = { face2edge[fid].x, face2edge[fid].y, face2edge[fid].z }; - + #pragma unroll for (int i = 0; i < 3; i++) { int eid = eids[i]; @@ -740,7 +759,7 @@ static __global__ void refine_charts_kernel( for (int i = 0; i < num_candidates; ++i) { int c = candidates[i]; - + // A. Geom score float4 cone = chart_normal_cones[c]; Vec3f axis(cone.x, cone.y, cone.z); @@ -769,7 +788,7 @@ static __global__ void refine_charts_kernel( // new best is significantly better than current best best_total_score = total_score; best_c = c; - } + } else if (abs(diff) <= epsilon) { // scores are very close, break tie by choosing smaller ID if (c < best_c) { @@ -822,13 +841,14 @@ __global__ void hook_edges_if_same_chart_kernel( static void reassign_chart_ids( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t F = mesh.faces.size; size_t M = mesh.manifold_face_adj.size; mesh.temp_storage.resize(F * sizeof(int)); // Use as parent for DSU - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( reinterpret_cast(mesh.temp_storage.ptr), F ); @@ -838,10 +858,11 @@ static void reassign_chart_ids( CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.manifold_face_adj.ptr, mesh.atlas_chart_ids.ptr, M, @@ -851,15 +872,16 @@ static void reassign_chart_ids( CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( reinterpret_cast(mesh.temp_storage.ptr), F ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); - + swap_buffers(mesh.atlas_chart_ids, mesh.temp_storage); mesh.atlas_num_charts = compress_ids(mesh.atlas_chart_ids.ptr, F, mesh.cub_temp_storage); } @@ -930,7 +952,8 @@ static __global__ void unpack_vertex_ids_kernel( void construct_chart_mesh( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t F = mesh.faces.size; @@ -943,7 +966,7 @@ void construct_chart_mesh( CUDA_CHECK(cudaMalloc(&cu_sorted_chart_ids, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_face_idx, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_sorted_face_idx, F * sizeof(int))); - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_face_idx, F ); @@ -953,15 +976,16 @@ void construct_chart_mesh( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, - F + F, 0, sizeof(int) * 8, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, - F + F, 0, sizeof(int) * 8, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_face_idx)); // 2. RLE for chart size int* cu_chart_size; @@ -974,14 +998,15 @@ void construct_chart_mesh( CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, - F + F, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, - F + F, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_unique_chart_ids)); CUDA_CHECK(cudaFree(cu_num_chart)); // 3. Exclusive scan for chart face offset @@ -989,19 +1014,20 @@ void construct_chart_mesh( CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, - mesh.atlas_num_charts + 1 + mesh.atlas_num_charts + 1, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, - mesh.atlas_num_charts + 1 + mesh.atlas_num_charts + 1, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_chart_size)); // 4. Expand chart ids and vertex ids uint64_t* cu_pack; CUDA_CHECK(cudaMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); - expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_chart_ids, cu_sorted_face_idx, mesh.faces.ptr, @@ -1009,6 +1035,7 @@ void construct_chart_mesh( cu_pack ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_sorted_chart_ids)); CUDA_CHECK(cudaFree(cu_sorted_face_idx)); // 5. Compress pair to construct all maps @@ -1022,32 +1049,35 @@ void construct_chart_mesh( ); mesh.atlas_chart_vertex_map.resize(new_num_vertices); mesh.atlas_chart_vertex_offset.resize(mesh.atlas_num_charts + 1); - unpack_vertex_ids_kernel<<<(new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + unpack_vertex_ids_kernel<<<(new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_inverse_pack, new_num_vertices, mesh.atlas_chart_vertex_map.ptr, mesh.atlas_chart_vertex_offset.ptr ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_inverse_pack)); - unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_pack, F, mesh.atlas_chart_faces.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_inverse_pack)); CUDA_CHECK(cudaFree(cu_pack)); } void CuMesh::compute_charts( - float threshold_cone_half_angle_rad, - int refine_iterations, - int global_iterations, + float threshold_cone_half_angle_rad, + int refine_iterations, + int global_iterations, float smooth_strength, float area_penalty_weight, float perimeter_area_ratio_weight ) { + cudaStream_t stream = current_stream(); + if (this->manifold_face_adj.is_empty()) { this->get_manifold_face_adjacency(); } @@ -1062,7 +1092,7 @@ void CuMesh::compute_charts( size_t F = this->faces.size; this->atlas_chart_ids.resize(F); this->atlas_num_charts = F; - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_ids.ptr, F ); @@ -1074,19 +1104,20 @@ void CuMesh::compute_charts( for (int i = 0; i < global_iterations; i++) { while (true) { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // 1. Compute chart connectivity - get_chart_connectivity(*this); + get_chart_connectivity(*this, stream); if (this->atlas_chart_adj.size == 0) break; // 2. Compute normal cones - compute_chart_normal_cones(*this); + compute_chart_normal_cones(*this, stream); // 3. Compute chart adjacency cost size_t E = this->atlas_chart_adj.size; this->edge_collapse_costs.resize(E); - compute_chart_adjacency_cost_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_chart_adjacency_cost_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_adj.ptr, this->atlas_chart_normal_cones.ptr, this->atlas_chart_adj_length.ptr, @@ -1097,27 +1128,27 @@ void CuMesh::compute_charts( E, this->edge_collapse_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); // 4. Propagate costs size_t C = this->atlas_num_charts; this->propagated_costs.resize(C); - propagate_cost_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + propagate_cost_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart2edge.ptr, this->atlas_chart2edge_offset.ptr, this->edge_collapse_costs.ptr, C, this->propagated_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); // 5. Collapse edges this->vertices_map.resize(C); // store collapse map - arange_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices_map.ptr, C ); - collapse_edges_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + collapse_edges_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_adj.ptr, this->edge_collapse_costs.ptr, this->propagated_costs.ptr, @@ -1127,30 +1158,31 @@ void CuMesh::compute_charts( this->atlas_chart_normal_cones.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); // End of iteration - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); if (h_end_flag == 1) break; // 6. Compress chart ids this->atlas_num_charts = compress_ids(this->vertices_map.ptr, C, this->cub_temp_storage); this->temp_storage.resize(F * sizeof(int)); - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices_map.ptr, this->atlas_chart_ids.ptr, F, reinterpret_cast(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); swap_buffers(this->atlas_chart_ids, this->temp_storage); } // Refine charts for (int j = 0; j < refine_iterations; j++) { - compute_chart_normal_cones(*this); + compute_chart_normal_cones(*this, stream); this->temp_storage.resize(F * sizeof(int)); - refine_charts_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + refine_charts_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_normal_cones.ptr, this->face_normals.ptr, this->vertices.ptr, @@ -1169,53 +1201,55 @@ void CuMesh::compute_charts( } // After refinement, the chart may become disconnected, so we need to re-assign chart ids - reassign_chart_ids(*this); + reassign_chart_ids(*this, stream); } CUDA_CHECK(cudaFree(cu_end_flag)); // Finalizing: calculate vmap, chart face and chart face offset - construct_chart_mesh(*this); + construct_chart_mesh(*this, stream); } std::tuple CuMesh::read_atlas_charts() { + cudaStream_t stream = current_stream(); + auto chart_ids = torch::empty({ static_cast(this->faces.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_ids.data_ptr(), this->atlas_chart_ids.ptr, this->faces.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto vertex_map = torch::empty({ static_cast(this->atlas_chart_vertex_map.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( vertex_map.data_ptr(), this->atlas_chart_vertex_map.ptr, this->atlas_chart_vertex_map.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto chart_faces = torch::empty({ static_cast(this->atlas_chart_faces.size), 3 }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_faces.data_ptr(), this->atlas_chart_faces.ptr, this->atlas_chart_faces.size * 3 * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto chart_vertex_offset = torch::empty({ static_cast(this->atlas_chart_vertex_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_vertex_offset.data_ptr(), this->atlas_chart_vertex_offset.ptr, this->atlas_chart_vertex_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto chart_face_offset = torch::empty({ static_cast(this->atlas_chart_faces_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_face_offset.data_ptr(), this->atlas_chart_faces_offset.ptr, this->atlas_chart_faces_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); return std::make_tuple(this->atlas_num_charts, chart_ids, vertex_map, chart_faces, chart_vertex_offset, chart_face_offset); } -} // namespace cumesh \ No newline at end of file +} // namespace cumesh diff --git a/src/clean_up.cu b/src/clean_up.cu index 3c12cd7..a72b1d0 100644 --- a/src/clean_up.cu +++ b/src/clean_up.cu @@ -2,6 +2,7 @@ #include "dtypes.cuh" #include "shared.h" #include +#include namespace cumesh { @@ -31,6 +32,7 @@ static __global__ void copy_T_to_T3_kernel( void CuMesh::remove_faces(torch::Tensor& face_mask) { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; size_t temp_storage_bytes = 0; @@ -41,18 +43,20 @@ void CuMesh::remove_faces(torch::Tensor& face_mask) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->faces.ptr, face_mask.data_ptr(), cu_new_faces, cu_new_num_faces, - F + F, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->faces.ptr, face_mask.data_ptr(), cu_new_faces, cu_new_num_faces, - F + F, stream )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->faces.resize(new_num_faces); - CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_new_num_faces)); CUDA_CHECK(cudaFree(cu_new_faces)); @@ -61,6 +65,7 @@ void CuMesh::remove_faces(torch::Tensor& face_mask) { void CuMesh::_remove_faces(uint8_t* face_mask) { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; size_t temp_storage_bytes = 0; @@ -71,18 +76,20 @@ void CuMesh::_remove_faces(uint8_t* face_mask) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, - F + F, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, - F + F, stream )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->faces.resize(new_num_faces); - CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_new_num_faces)); CUDA_CHECK(cudaFree(cu_new_faces)); @@ -134,14 +141,15 @@ static __global__ void remap_faces_kernel( void CuMesh::remove_unreferenced_vertices() { + cudaStream_t stream = current_stream(); size_t V = this->vertices.size; size_t F = this->faces.size; // Mark referenced vertices int* cu_vertex_is_referenced; CUDA_CHECK(cudaMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); - set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + CUDA_CHECK(cudaMemsetAsync(cu_vertex_is_referenced, 0, (V+1) * sizeof(int), stream)); + set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->faces.ptr, F, cu_vertex_is_referenced @@ -152,19 +160,20 @@ void CuMesh::remove_unreferenced_vertices() { size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced, V+1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced, V+1, stream )); int new_num_vertices; - CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Compress vertices this->temp_storage.resize(new_num_vertices * sizeof(float3)); - compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_is_referenced, this->vertices.ptr, V, @@ -174,12 +183,13 @@ void CuMesh::remove_unreferenced_vertices() { swap_buffers(this->temp_storage, this->vertices); // Update faces - remap_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + remap_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_is_referenced, F, this->faces.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_vertex_is_referenced)); // Delete all cached info since mesh has changed @@ -237,16 +247,17 @@ struct int3_decomposer void CuMesh::remove_duplicate_faces() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; // Create a temporary sorted copy of faces for duplicate detection // Do NOT modify the original faces to preserve vertex order and normals int3 *cu_sorted_faces; CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3))); - CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice, stream)); // Sort vertices within each face (in the temporary copy) - sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_faces, F ); @@ -256,7 +267,7 @@ void CuMesh::remove_duplicate_faces() { size_t temp_storage_bytes = 0; int *cu_sorted_face_indices; CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); - arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F); + arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_sorted_face_indices, F); CUDA_CHECK(cudaGetLastError()); int *cu_sorted_indices_output; @@ -269,7 +280,8 @@ void CuMesh::remove_duplicate_faces() { cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, F, - int3_decomposer{} + int3_decomposer{}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( @@ -277,34 +289,36 @@ void CuMesh::remove_duplicate_faces() { cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, F, - int3_decomposer{} + int3_decomposer{}, + stream )); - CUDA_CHECK(cudaFree(cu_sorted_faces)); - CUDA_CHECK(cudaFree(cu_sorted_face_indices)); // Select first in each group of duplicate faces (based on sorted faces) uint8_t* cu_face_mask_sorted; CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); - select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_faces_output, F, cu_face_mask_sorted ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_faces_output)); // Map the mask back to original face order using scatter // scatter: output[indices[i]] = values[i] // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] uint8_t* cu_face_mask_original; CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); - scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_indices_output, // indices: sorted_position -> original_idx cu_face_mask_sorted, // values: mask at sorted_position F, cu_face_mask_original // output: mask at original position ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_sorted_faces)); + CUDA_CHECK(cudaFree(cu_sorted_face_indices)); + CUDA_CHECK(cudaFree(cu_sorted_faces_output)); CUDA_CHECK(cudaFree(cu_face_mask_sorted)); CUDA_CHECK(cudaFree(cu_sorted_indices_output)); @@ -434,6 +448,7 @@ struct LessThanOp { void CuMesh::fill_holes(float max_hole_perimeter) { + cudaStream_t stream = current_stream(); if (this->loop_boundaries.is_empty() || this->loop_boundaries_offset.is_empty()) { this->get_boundary_loops(); } @@ -451,7 +466,7 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Compute loop boundary lengths float* cu_loop_boundary_lengths; CUDA_CHECK(cudaMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); - compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->edges.ptr, this->loop_boundaries.ptr, @@ -469,7 +484,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, this->loop_boundaries_offset.ptr, - this->loop_boundaries_offset.ptr + 1 + this->loop_boundaries_offset.ptr + 1, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( @@ -477,14 +493,14 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, this->loop_boundaries_offset.ptr, - this->loop_boundaries_offset.ptr + 1 + this->loop_boundaries_offset.ptr + 1, + stream )); - CUDA_CHECK(cudaFree(cu_loop_boundary_lengths)); // Mask small loops uint8_t* cu_bound_loop_mask; CUDA_CHECK(cudaMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); - compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_bound_loop_perimeters, max_hole_perimeter, L, @@ -492,12 +508,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_bound_loop_mask ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_perimeters)); // Compress bound loops size int* cu_bound_loops_cnt; CUDA_CHECK(cudaMalloc(&cu_bound_loops_cnt, L * sizeof(int))); - diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->loop_boundaries_offset.ptr, L, cu_bound_loops_cnt @@ -510,16 +525,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, - L + L, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, - L + L, stream )); int new_num_bound_loops; - CUDA_CHECK(cudaMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_loop_boundary_lengths)); + CUDA_CHECK(cudaFree(cu_bound_loop_perimeters)); CUDA_CHECK(cudaFree(cu_bound_loops_cnt)); CUDA_CHECK(cudaFree(cu_new_num_bound_loops)); if (new_num_bound_loops == 0) { @@ -531,9 +549,9 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Get loop ids of loop boundaries int* cu_loop_bound_loop_ids; CUDA_CHECK(cudaMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); + CUDA_CHECK(cudaMemsetAsync(cu_loop_bound_loop_ids, 0, E * sizeof(int), stream)); if (L > 1) { - set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->loop_boundaries_offset.ptr + 1, L - 1, cu_loop_bound_loop_ids ); @@ -543,27 +561,25 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, cu_loop_bound_loop_ids, - E + E, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_loop_ids, - E + E, stream )); // Mask loop boundaries uint8_t* cu_loop_boundary_mask; CUDA_CHECK(cudaMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); - index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_bound_loop_mask, cu_loop_bound_loop_ids, E, cu_loop_boundary_mask ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_mask)); - CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids)); // Compress loop boundaries int *cu_new_loop_boundaries, *cu_new_num_loop_boundaries; @@ -573,16 +589,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, - E + E, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, - E + E, stream )); int new_num_loop_boundaries; - CUDA_CHECK(cudaMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_bound_loop_mask)); + CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids)); CUDA_CHECK(cudaFree(cu_new_num_loop_boundaries)); CUDA_CHECK(cudaFree(cu_loop_boundary_mask)); @@ -593,19 +612,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, - new_num_bound_loops + 1 + new_num_bound_loops + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, - new_num_bound_loops + 1 + new_num_bound_loops + 1, stream )); int* cu_new_loop_bound_loop_ids; CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(cudaMemsetAsync(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int), stream)); if (new_num_bound_loops > 1) { - set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_loop_boundaries_offset+1, new_num_bound_loops-1, cu_new_loop_bound_loop_ids ); @@ -615,19 +634,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, cu_new_loop_bound_loop_ids, - new_num_loop_boundaries + new_num_loop_boundaries, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_bound_loop_ids, - new_num_loop_boundaries + new_num_loop_boundaries, stream )); // Calculate new vertex positions as average of loop vertices Vec3f* cu_new_loop_bound_centers; CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); - compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->edges.ptr, cu_new_loop_boundaries, @@ -643,7 +662,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, cu_new_loop_boundaries_offset, - cu_new_loop_boundaries_offset + 1 + cu_new_loop_boundaries_offset + 1, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( @@ -651,29 +671,26 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, cu_new_loop_boundaries_offset, - cu_new_loop_boundaries_offset + 1 + cu_new_loop_boundaries_offset + 1, + stream )); - CUDA_CHECK(cudaFree(cu_new_loop_bound_centers)); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset)); - inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_vertices, cu_new_loop_boundaries_cnt, new_num_bound_loops ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); // Update mesh this->vertices.extend(new_num_bound_loops); this->faces.extend(new_num_loop_boundaries); - copy_vec3f_to_float3_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + copy_vec3f_to_float3_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_vertices, new_num_bound_loops, this->vertices.ptr + V ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices)); - connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, cu_new_loop_boundaries, cu_new_loop_bound_loop_ids, @@ -682,6 +699,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) { this->faces.ptr + F ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_new_loop_bound_centers)); + CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset)); + CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(cudaFree(cu_new_vertices)); CUDA_CHECK(cudaFree(cu_new_loop_boundaries)); CUDA_CHECK(cudaFree(cu_new_loop_bound_loop_ids)); @@ -763,6 +785,7 @@ static __global__ void index_vertice_kernel( void CuMesh::repair_non_manifold_edges(){ + cudaStream_t stream = current_stream(); // Always recompute manifold_face_adj to ensure it's up to date // especially after operations like simplify() that modify the mesh this->get_manifold_face_adjacency(); @@ -773,7 +796,7 @@ void CuMesh::repair_non_manifold_edges(){ // Construct vertex adjacency pairs with manifold edges int2* cu_vertex_adj_pairs; CUDA_CHECK(cudaMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); - construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, this->faces.ptr, cu_vertex_adj_pairs, @@ -784,16 +807,17 @@ void CuMesh::repair_non_manifold_edges(){ // Iterative Hook and Compress int* cu_vertex_ids; CUDA_CHECK(cudaMalloc(&cu_vertex_ids, 3 * F * sizeof(int))); - arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, 3 * F); + arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vertex_ids, 3 * F); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_adj_pairs, 2 * M, cu_vertex_ids, @@ -802,12 +826,13 @@ void CuMesh::repair_non_manifold_edges(){ CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_ids, 3 * F ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); CUDA_CHECK(cudaFree(cu_vertex_adj_pairs)); @@ -818,7 +843,7 @@ void CuMesh::repair_non_manifold_edges(){ int new_V = compress_ids(cu_vertex_ids, 3 * F, this->cub_temp_storage, cu_new_vertices_ids); float3* cu_new_vertices; CUDA_CHECK(cudaMalloc(&cu_new_vertices, new_V * sizeof(float3))); - index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_vertices_ids, this->faces.ptr, this->vertices.ptr, @@ -826,13 +851,14 @@ void CuMesh::repair_non_manifold_edges(){ cu_new_vertices ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices_ids)); this->vertices.resize(new_V); - CUDA_CHECK(cudaMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaFree(cu_new_vertices)); + CUDA_CHECK(cudaMemcpyAsync(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice, stream)); this->faces.resize(F); - copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, F, this->faces.ptr); + copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vertex_ids, F, this->faces.ptr); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_new_vertices_ids)); + CUDA_CHECK(cudaFree(cu_new_vertices)); CUDA_CHECK(cudaFree(cu_vertex_ids)); // Delete all cached info since mesh has changed @@ -874,6 +900,7 @@ static __global__ void mark_non_manifold_faces_kernel( void CuMesh::remove_non_manifold_faces() { + cudaStream_t stream = current_stream(); // Get edge-face adjacency information if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { this->get_edge_face_adjacency(); @@ -887,10 +914,10 @@ void CuMesh::remove_non_manifold_faces() { // Initialize face mask (1 = keep all faces initially) uint8_t* cu_face_keep_mask; CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); - CUDA_CHECK(cudaMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); + CUDA_CHECK(cudaMemsetAsync(cu_face_keep_mask, 1, F * sizeof(uint8_t), stream)); // Mark faces on non-manifold edges for removal - mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edge2face.ptr, this->edge2face_offset.ptr, this->edge2face_cnt.ptr, @@ -916,6 +943,7 @@ struct GreaterThanOrEqualToOp { void CuMesh::remove_small_connected_components(float min_area) { + cudaStream_t stream = current_stream(); if (this->conn_comp_ids.is_empty()) { this->get_connected_components(); } @@ -936,14 +964,14 @@ void CuMesh::remove_small_connected_components(float min_area) { nullptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, - F + F, 0, sizeof(int) * 8, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, - F + F, 0, sizeof(int) * 8, stream )); // 2. Find unique components and get the number of faces in each. @@ -957,17 +985,18 @@ void CuMesh::remove_small_connected_components(float min_area) { nullptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, - F + F, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, - F + F, stream )); int num_conn_comps; - CUDA_CHECK(cudaMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_conn_comps)); CUDA_CHECK(cudaFree(cu_sorted_conn_comp_ids)); CUDA_CHECK(cudaFree(cu_unique_conn_comp_ids)); @@ -979,15 +1008,14 @@ void CuMesh::remove_small_connected_components(float min_area) { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_conn_comp_num_faces, cu_conn_comp_offsets, - num_conn_comps + 1 + num_conn_comps + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_conn_comp_num_faces, cu_conn_comp_offsets, - num_conn_comps + 1 + num_conn_comps + 1, stream )); - CUDA_CHECK(cudaFree(cu_conn_comp_num_faces)); float *cu_conn_comp_areas; CUDA_CHECK(cudaMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); @@ -996,7 +1024,8 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, cu_conn_comp_offsets, - cu_conn_comp_offsets + 1 + cu_conn_comp_offsets + 1, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( @@ -1004,15 +1033,14 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, cu_conn_comp_offsets, - cu_conn_comp_offsets + 1 + cu_conn_comp_offsets + 1, + stream )); - CUDA_CHECK(cudaFree(cu_sorted_face_areas)); - CUDA_CHECK(cudaFree(cu_conn_comp_offsets)); // 4. Create a "keep" mask for components with area >= min_area. uint8_t* cu_comp_keep_mask; CUDA_CHECK(cudaMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); - compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_conn_comp_areas, min_area, num_conn_comps, @@ -1020,23 +1048,26 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_comp_keep_mask ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_conn_comp_areas)); // 5. Propagate the component "keep" mask to every face. uint8_t* cu_face_keep_mask; CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); // Use an index_kernel (gather operation) - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_comp_keep_mask, // Source array this->conn_comp_ids.ptr, // Indices to gather from F, cu_face_keep_mask // Destination array ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_comp_keep_mask)); // 6. Select the faces to keep and update the mesh. this->_remove_faces(cu_face_keep_mask); + CUDA_CHECK(cudaFree(cu_conn_comp_num_faces)); + CUDA_CHECK(cudaFree(cu_sorted_face_areas)); + CUDA_CHECK(cudaFree(cu_conn_comp_offsets)); + CUDA_CHECK(cudaFree(cu_conn_comp_areas)); + CUDA_CHECK(cudaFree(cu_comp_keep_mask)); CUDA_CHECK(cudaFree(cu_face_keep_mask)); } @@ -1158,6 +1189,7 @@ static __global__ void inplace_flip_faces_with_flags_kernel( void CuMesh::unify_face_orientations() { + cudaStream_t stream = current_stream(); if (this->manifold_face_adj.is_empty()) { this->get_manifold_face_adjacency(); } @@ -1165,7 +1197,7 @@ void CuMesh::unify_face_orientations() { // 1. Compute the flipped flag for each edge. uint8_t* cu_flipped; CUDA_CHECK(cudaMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t))); - get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, this->faces.ptr, this->manifold_face_adj.size, @@ -1176,16 +1208,17 @@ void CuMesh::unify_face_orientations() { // 2. Hook edges with flipped flag. int* conn_comp_with_flip; CUDA_CHECK(cudaMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int))); - arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(conn_comp_with_flip, this->faces.size, 2); + arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(conn_comp_with_flip, this->faces.size, 2); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, cu_flipped, this->manifold_face_adj.size, @@ -1195,25 +1228,27 @@ void CuMesh::unify_face_orientations() { CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( conn_comp_with_flip, this->faces.size ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); // 3. Flip the orientation of the faces. - inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->faces.ptr, conn_comp_with_flip, this->faces.size ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_flipped)); CUDA_CHECK(cudaFree(conn_comp_with_flip)); } -} // namespace cumesh \ No newline at end of file +} // namespace cumesh diff --git a/src/connectivity.cu b/src/connectivity.cu index 6e2f5fe..de0471e 100644 --- a/src/connectivity.cu +++ b/src/connectivity.cu @@ -1,6 +1,7 @@ #include "cumesh.h" #include "shared.h" +#include #include @@ -8,7 +9,7 @@ namespace cumesh { /** * Get count of neighboring faces for each vertex - * + * * @param faces: the faces of the mesh, shape (F) * @param F: the number of faces * @param neighbor_face_cnt: the buffer for neighbor face count, shape (V+1) @@ -31,7 +32,7 @@ static __global__ void get_neighbor_face_cnt_kernel( /** * Fill the neighboring face ids for each vertex - * + * * @param faces: the faces of the mesh, shape (F) * @param F: the number of faces * @param neighbor_face_ids: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) @@ -57,13 +58,14 @@ static __global__ void fill_neighbor_face_ids_kernel( void CuMesh::get_vertex_face_adjacency() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; size_t V = this->vertices.size; // get neighboring face count for each vertex this->vert2face_cnt.resize(V + 1); this->vert2face_cnt.zero(); - get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->vert2face_cnt.ptr); + get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->vert2face_cnt.ptr); CUDA_CHECK(cudaGetLastError()); // allocate memory for neighboring face ids @@ -72,19 +74,19 @@ void CuMesh::get_vertex_face_adjacency() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2face_cnt.ptr, this->vert2face_offset.ptr, - V + 1 + V + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2face_cnt.ptr, this->vert2face_offset.ptr, - V + 1 + V + 1, stream )); this->vert2face.resize(F*3); // fill neighboring face ids for each vertex this->vert2face_cnt.zero(); - fill_neighbor_face_ids_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, + fill_neighbor_face_ids_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->vert2face.ptr, this->vert2face_offset.ptr, this->vert2face_cnt.ptr @@ -95,7 +97,7 @@ void CuMesh::get_vertex_face_adjacency() { /** * Expand edges for each triangle face - * + * * @param faces: the faces of the mesh, shape (F) * @param F: the number of faces * @param edges: the buffer for edges, shape (F*3) @@ -110,7 +112,7 @@ static __global__ void expand_edges_kernel( int base = tid * 3; int3 f = faces[tid]; - + // expand edges edges[base + 0] = ((uint64_t)min(f.x, f.y) << 32) | max(f.x, f.y); edges[base + 1] = ((uint64_t)min(f.y, f.z) << 32) | max(f.y, f.z); @@ -119,9 +121,10 @@ static __global__ void expand_edges_kernel( void CuMesh::get_edges() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; this->edges.resize(F * 3); - expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->edges.ptr); + expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->edges.ptr); CUDA_CHECK(cudaGetLastError()); // sort edges @@ -131,14 +134,14 @@ void CuMesh::get_edges() { nullptr, temp_storage_bytes, this->edges.ptr, reinterpret_cast(this->temp_storage.ptr), - F * 3 + F * 3, 0, sizeof(uint64_t) * 8, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortKeys( this->cub_temp_storage.ptr, temp_storage_bytes, this->edges.ptr, reinterpret_cast(this->temp_storage.ptr), - F * 3 + F * 3, 0, sizeof(uint64_t) * 8, stream )); // unique edges @@ -148,15 +151,16 @@ void CuMesh::get_edges() { CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, reinterpret_cast(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges, - F * 3 + F * 3, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, reinterpret_cast(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges, - F * 3 + F * 3, stream )); - CUDA_CHECK(cudaMemcpy(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->edge2face_cnt.size = this->edges.size; CUDA_CHECK(cudaFree(num_edges)); } @@ -164,7 +168,7 @@ void CuMesh::get_edges() { /** * Get edge-face adjacency - * + * * @param faces: the faces of the mesh, shape (F) * @param edges: the buffer for edges, shape (E) * @param edge2face_cnt: the buffer for edge duplication number, shape (E) @@ -217,11 +221,12 @@ static __global__ void get_edge_face_adjacency_kernel( void CuMesh::get_edge_face_adjacency() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { this->get_edges(); } if (this->vert2face.is_empty() || this->vert2face_offset.is_empty()) { - this->get_vertex_face_adjacency(); + this->get_vertex_face_adjacency(); } size_t F = this->faces.size; size_t E = this->edges.size; @@ -232,25 +237,26 @@ void CuMesh::get_edge_face_adjacency() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->edge2face_cnt.ptr, this->edge2face_offset.ptr, - E + 1 + E + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->edge2face_cnt.ptr, this->edge2face_offset.ptr, - E + 1 + E + 1, stream )); // allocate memory for edge2face int total_edge_face_cnt; - CUDA_CHECK(cudaMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->edge2face.resize(total_edge_face_cnt); // allocate memory for face2edge this->face2edge.resize(F); // get edge-face adjacency - get_edge_face_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_edge_face_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->faces.ptr, this->edges.ptr, this->edge2face_cnt.ptr, @@ -267,7 +273,7 @@ void CuMesh::get_edge_face_adjacency() { /** * Get vertex adjacent edge number - * + * * @param edges: the buffer for edges, shape (E) * @param E: the number of edges * @param vert2edge_cnt: the buffer for vertex adjacent edge number, shape (V) @@ -293,7 +299,7 @@ static __global__ void get_vertex_edge_cnt_kernel( /** * Get vertex-edge adjacency - * + * * @param edges: the buffer for edges, shape (E) * @param E: the number of edges * @param vert2edge: the buffer for vertex to edge adjacency, shape (total_vertex_edge_cnt) @@ -322,6 +328,7 @@ static __global__ void get_vertex_edge_adjacency_kernel( void CuMesh::get_vertex_edge_adjacency() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty()) { this->get_edges(); } @@ -331,7 +338,7 @@ void CuMesh::get_vertex_edge_adjacency() { // get vertex adjacent edge number this->vert2edge_cnt.resize(V + 1); this->vert2edge_cnt.zero(); - get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, E, this->vert2edge_cnt.ptr ); CUDA_CHECK(cudaGetLastError()); @@ -342,19 +349,19 @@ void CuMesh::get_vertex_edge_adjacency() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, - V + 1 + V + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, - V + 1 + V + 1, stream )); // get vertex-edge adjacency this->vert2edge.resize(2 * E); this->vert2edge_cnt.zero(); - get_vertex_edge_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_edge_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, E, this->vert2edge.ptr, this->vert2edge_offset.ptr, @@ -366,7 +373,7 @@ void CuMesh::get_vertex_edge_adjacency() { /** * Set vertex boundary indicator - * + * * @param edges: the buffer for edges, shape (E) * @param boundaries: the buffer for boundary edges, shape (B) * @param edge2face_cnt: the buffer for edge duplication number, shape (E) @@ -408,6 +415,7 @@ struct is_boundary_edge { void CuMesh::get_boundary_info() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { this->get_edges(); } @@ -419,21 +427,25 @@ void CuMesh::get_boundary_info() { CUDA_CHECK(cudaMalloc(&cu_num_boundary, sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); this->boundaries.resize(E); - arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); + arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_edge_idx, E); + CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, - is_boundary_edge{this->edge2face_cnt.ptr} + is_boundary_edge{this->edge2face_cnt.ptr}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, - is_boundary_edge{this->edge2face_cnt.ptr} + is_boundary_edge{this->edge2face_cnt.ptr}, + stream )); - CUDA_CHECK(cudaMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_boundary)); CUDA_CHECK(cudaFree(cu_edge_idx)); @@ -441,7 +453,7 @@ void CuMesh::get_boundary_info() { this->vert_is_boundary.resize(this->vertices.size); this->vert_is_boundary.zero(); if (this->boundaries.size > 0) { - set_boundary_vertex_kernel<<<(this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_boundary_vertex_kernel<<<(this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, this->boundaries.ptr, this->edge2face_cnt.ptr, this->boundaries.size, this->vert_is_boundary.ptr ); @@ -474,7 +486,7 @@ static __global__ void get_vertex_boundary_cnt_kernel( /** * Get vertex-boundary adjacency - * + * * @param edges: the buffer for edges, shape (E) * @param boundaries: the buffer for boundary edges, shape (B) * @param B: the number of boundary edges @@ -507,9 +519,10 @@ static __global__ void get_vertex_boundary_adjacency_kernel( void CuMesh::get_vertex_boundary_adjacency() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty()) { this->get_edges(); - } + } if (this->boundaries.is_empty()) { this->get_boundary_info(); } @@ -528,7 +541,7 @@ void CuMesh::get_vertex_boundary_adjacency() { // get vertex adjacent boundary number this->vert2bound_cnt.resize(V + 1); this->vert2bound_cnt.zero(); - get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, this->boundaries.ptr, B, this->vert2bound_cnt.ptr ); CUDA_CHECK(cudaGetLastError()); @@ -539,19 +552,19 @@ void CuMesh::get_vertex_boundary_adjacency() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, - V + 1 + V + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, - V + 1 + V + 1, stream )); // get vertex-boundary adjacency this->vert2bound.resize(2 * B); this->vert2bound_cnt.zero(); - get_vertex_boundary_adjacency_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_boundary_adjacency_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, this->boundaries.ptr, B, this->vert2bound.ptr, this->vert2bound_offset.ptr, @@ -592,10 +605,11 @@ static __global__ void get_vertex_is_manifold_kernel( } vert_is_manifold[tid] = is_manifold ? 1 : 0; -} +} void CuMesh::get_vertex_is_manifold() { + cudaStream_t stream = current_stream(); if (this->vert2edge.is_empty() || this->vert2edge_offset.is_empty()) { this->get_vertex_edge_adjacency(); } @@ -606,7 +620,7 @@ void CuMesh::get_vertex_is_manifold() { // get vertex is manifold this->vert_is_manifold.resize(V); - get_vertex_is_manifold_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_is_manifold_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vert2edge.ptr, this->vert2edge_offset.ptr, this->edge2face_cnt.ptr, @@ -619,7 +633,7 @@ void CuMesh::get_vertex_is_manifold() { /** * Set manifold face adjacency - * + * * @param manifold_edge_idx: the buffer for manifold edge index, shape (M) * @param edge2face: the buffer for edge to face adjacency, shape (total_edge_face_cnt) * @param edge2face_offset: the buffer for edge to face adjacency offset, shape (E+1) @@ -660,6 +674,7 @@ struct is_manifold_edge { void CuMesh::get_manifold_face_adjacency() { + cudaStream_t stream = current_stream(); if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { this->get_edge_face_adjacency(); } @@ -671,29 +686,32 @@ void CuMesh::get_manifold_face_adjacency() { CUDA_CHECK(cudaMalloc(&cu_num_manifold_edges, sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_manifold_edge_idx, E * sizeof(int))); - arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); + arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_edge_idx, E); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, E, - is_manifold_edge{this->edge2face_cnt.ptr} + is_manifold_edge{this->edge2face_cnt.ptr}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, E, - is_manifold_edge{this->edge2face_cnt.ptr} + is_manifold_edge{this->edge2face_cnt.ptr}, + stream )); int manifold_edge_count; - CUDA_CHECK(cudaMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_manifold_edges)); CUDA_CHECK(cudaFree(cu_edge_idx)); // set manifold_face_adj this->manifold_face_adj.resize(manifold_edge_count); - set_manifold_face_adj_kernel<<<(manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_manifold_face_adj_kernel<<<(manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_manifold_edge_idx, this->edge2face.ptr, this->edge2face_offset.ptr, @@ -701,6 +719,7 @@ void CuMesh::get_manifold_face_adjacency() { this->manifold_face_adj.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_manifold_edge_idx)); } @@ -737,6 +756,7 @@ struct is_manifold_boundary_vertex { void CuMesh::get_manifold_boundary_adjacency() { + cudaStream_t stream = current_stream(); if (this->vert2bound.is_empty() || this->vert2bound_offset.is_empty()) { this->get_vertex_boundary_adjacency(); } @@ -751,23 +771,26 @@ void CuMesh::get_manifold_boundary_adjacency() { CUDA_CHECK(cudaMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_vert_idx, V * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_manifold_vert_idx, V * sizeof(int))); - arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vert_idx, V); + arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vert_idx, V); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, - is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, - is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr}, + stream )); int manifold_boundary_vert_count; - CUDA_CHECK(cudaMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_manifold_boundary_verts)); CUDA_CHECK(cudaFree(cu_vert_idx)); @@ -779,7 +802,7 @@ void CuMesh::get_manifold_boundary_adjacency() { // set manifold_bound_adj this->manifold_bound_adj.resize(manifold_boundary_vert_count); - set_manifold_bound_adj_kernel<<<(manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_manifold_bound_adj_kernel<<<(manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_manifold_vert_idx, this->vert2bound.ptr, this->vert2bound_offset.ptr, @@ -787,10 +810,13 @@ void CuMesh::get_manifold_boundary_adjacency() { this->manifold_bound_adj.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_manifold_vert_idx)); } void CuMesh::get_connected_components() { + cudaStream_t stream = current_stream(); if (this->manifold_face_adj.is_empty()) { this->get_manifold_face_adjacency(); } @@ -800,16 +826,17 @@ void CuMesh::get_connected_components() { // Iterative Hook and Compress this->conn_comp_ids.resize(F); - arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->conn_comp_ids.ptr, F); + arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->conn_comp_ids.ptr, F); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, M, this->conn_comp_ids.ptr, @@ -818,12 +845,13 @@ void CuMesh::get_connected_components() { CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->conn_comp_ids.ptr, F ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); @@ -833,6 +861,7 @@ void CuMesh::get_connected_components() { void CuMesh::get_boundary_connected_components() { + cudaStream_t stream = current_stream(); if (this->manifold_bound_adj.is_empty()) { this->get_manifold_boundary_adjacency(); } @@ -847,16 +876,17 @@ void CuMesh::get_boundary_connected_components() { // Iterative Hook and Compress this->bound_conn_comp_ids.resize(B); - arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->bound_conn_comp_ids.ptr, B); + arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->bound_conn_comp_ids.ptr, B); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_bound_adj.ptr, M, this->bound_conn_comp_ids.ptr, @@ -865,12 +895,13 @@ void CuMesh::get_boundary_connected_components() { CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->bound_conn_comp_ids.ptr, B ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); @@ -926,6 +957,7 @@ static __global__ void is_bound_conn_comp_loop_kernel( void CuMesh::get_boundary_loops() { + cudaStream_t stream = current_stream(); if (this->bound_conn_comp_ids.is_empty()) { this->get_boundary_connected_components(); } @@ -941,13 +973,13 @@ void CuMesh::get_boundary_loops() { // Check if boundary components are loops int* cu_is_bound_conn_comp_loop; CUDA_CHECK(cudaMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); - fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_is_bound_conn_comp_loop, this->num_bound_conn_comps, 1 ); CUDA_CHECK(cudaGetLastError()); - is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, this->boundaries.ptr, this->bound_conn_comp_ids.ptr, @@ -964,22 +996,23 @@ void CuMesh::get_boundary_loops() { nullptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, - this->num_bound_conn_comps + this->num_bound_conn_comps, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, - this->num_bound_conn_comps + this->num_bound_conn_comps, stream )); - CUDA_CHECK(cudaMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_bound_loops)); if (this->num_bound_loops == 0) { CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); return; } - + // Sort boundaries by connected component ids int *cu_bound_sorted, *cu_bound_conn_comp_ids_sorted; CUDA_CHECK(cudaMalloc(&cu_bound_sorted, B * sizeof(int))); @@ -989,27 +1022,26 @@ void CuMesh::get_boundary_loops() { nullptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, - B + B, 0, sizeof(int) * 8, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, - B + B, 0, sizeof(int) * 8, stream )); // Select loops int* cu_bound_is_on_loop; CUDA_CHECK(cudaMalloc(&cu_bound_is_on_loop, B * sizeof(int))); - index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_is_bound_conn_comp_loop, cu_bound_conn_comp_ids_sorted, B, cu_bound_is_on_loop ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); this->loop_boundaries.resize(B); int *cu_loop_bound_conn_comp_ids_sorted, *cu_num_bound_on_loop; CUDA_CHECK(cudaMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); @@ -1018,34 +1050,37 @@ void CuMesh::get_boundary_loops() { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, - B + B, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, - B + B, stream )); int num_bound_on_loop; - CUDA_CHECK(cudaMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); CUDA_CHECK(cudaFree(cu_bound_sorted)); this->loop_boundaries.resize(num_bound_on_loop); temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, - B + B, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, - B + B, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_bound_conn_comp_ids_sorted)); CUDA_CHECK(cudaFree(cu_bound_is_on_loop)); CUDA_CHECK(cudaFree(cu_num_bound_on_loop)); - + // RLE this->loop_boundaries_offset.resize(this->num_bound_loops + 1); this->loop_boundaries_offset.zero(); @@ -1057,15 +1092,16 @@ void CuMesh::get_boundary_loops() { nullptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, - num_bound_on_loop + num_bound_on_loop, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, - num_bound_on_loop + num_bound_on_loop, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_loop_bound_conn_comp_ids_sorted)); CUDA_CHECK(cudaFree(cu_rle_unique_out)); CUDA_CHECK(cudaFree(cu_rle_num_runs)); @@ -1075,13 +1111,13 @@ void CuMesh::get_boundary_loops() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, - this->num_bound_loops + 1 + this->num_bound_loops + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, - this->num_bound_loops + 1 + this->num_bound_loops + 1, stream )); } diff --git a/src/geometry.cu b/src/geometry.cu index 0e493ba..45b8492 100644 --- a/src/geometry.cu +++ b/src/geometry.cu @@ -24,9 +24,10 @@ static __global__ void compute_face_areas_kernel( void CuMesh::compute_face_areas() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; this->face_areas.resize(F); - compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->faces.ptr, F, @@ -57,9 +58,10 @@ static __global__ void compute_face_normals_kernel( void CuMesh::compute_face_normals() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; this->face_normals.resize(F); - compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->faces.ptr, F, @@ -113,9 +115,10 @@ void CuMesh::compute_vertex_normals() { this->get_vertex_face_adjacency(); } + cudaStream_t stream = current_stream(); size_t V = this->vertices.size; this->vertex_normals.resize(V); - compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->faces.ptr, this->vert2face.ptr, diff --git a/src/hash/hash.cu b/src/hash/hash.cu index a9b1c23..b691096 100644 --- a/src/hash/hash.cu +++ b/src/hash/hash.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include "api.h" #include "hash.cuh" @@ -32,9 +33,12 @@ static void dispatch_hashmap_insert_cuda( const torch::Tensor& keys, const torch::Tensor& values ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_insert_cuda_kernel<<< (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), keys.size(0), @@ -111,9 +115,12 @@ static void dispatch_hashmap_lookup_cuda( const torch::Tensor& keys, torch::Tensor& values ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_lookup_cuda_kernel<<< (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), keys.size(0), @@ -205,9 +212,12 @@ static void dispatch_hashmap_insert_3d_cuda( const torch::Tensor& values, int W, int H, int D ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_insert_3d_cuda_kernel<<< (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), coords.size(0), @@ -303,9 +313,12 @@ static void dispatch_hashmap_lookup_3d_cuda( torch::Tensor& values, int W, int H, int D ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_lookup_3d_cuda_kernel<<< (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), coords.size(0), @@ -395,9 +408,12 @@ static void dispatch_hashmap_insert_3d_idx_as_val_cuda( const torch::Tensor& coords, int W, int H, int D ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_insert_3d_idx_as_val_cuda_kernel<<< (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), coords.size(0), diff --git a/src/remesh/simple_dual_contour.cu b/src/remesh/simple_dual_contour.cu index e064ee3..1629cd1 100644 --- a/src/remesh/simple_dual_contour.cu +++ b/src/remesh/simple_dual_contour.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "api.h" @@ -181,6 +182,7 @@ std::tuple cumesh::simple_dual_contour( ) { const size_t M = coords.size(0); const size_t N_vert = hashmap_keys.size(0); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); auto vertices = torch::empty({(long)M, 3}, torch::dtype(torch::kFloat32).device(coords.device())); auto intersected = torch::empty({(long)M, 3}, torch::dtype(torch::kInt32).device(coords.device())); @@ -189,7 +191,7 @@ std::tuple cumesh::simple_dual_contour( dim3 blocks((M + BLOCK_SIZE - 1) / BLOCK_SIZE); if (hashmap_keys.dtype() == torch::kUInt32) { - simple_dual_contour_kernel<<>>( + simple_dual_contour_kernel<<>>( N_vert, M, W, H, D, @@ -200,9 +202,9 @@ std::tuple cumesh::simple_dual_contour( vertices.data_ptr(), intersected.data_ptr() ); - } + } else if (hashmap_keys.dtype() == torch::kUInt64) { - simple_dual_contour_kernel<<>>( + simple_dual_contour_kernel<<>>( N_vert, M, W, H, D, @@ -213,7 +215,7 @@ std::tuple cumesh::simple_dual_contour( vertices.data_ptr(), intersected.data_ptr() ); - } + } else { TORCH_CHECK(false, "Unsupported hashmap data type"); } diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu index 6f1d517..d6b6511 100644 --- a/src/remesh/svox2vert.cu +++ b/src/remesh/svox2vert.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include "api.h" #include "../utils.h" @@ -147,10 +148,11 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Get the number of active vertices for each voxel size_t N = hashmap_keys.size(0); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); int* num_vertices; CUDA_CHECK(cudaMalloc(&num_vertices, (M + 1) * sizeof(int))); if (hashmap_keys.dtype() == torch::kUInt32) { - get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -162,7 +164,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( num_vertices ); } else if (hashmap_keys.dtype() == torch::kUInt64) { - get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -180,18 +182,19 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Compute the offset size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1, stream); void* d_temp_storage = nullptr; CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); - CUDA_CHECK(cudaFree(d_temp_storage)); + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1, stream); int total_vertices; - CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(d_temp_storage)); // Set the active vertices for each voxel auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); if (hashmap_keys.dtype() == torch::kUInt32) { - set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -205,7 +208,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( ); } else if (hashmap_keys.dtype() == torch::kUInt64) { - set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -220,7 +223,8 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( } CUDA_CHECK(cudaGetLastError()); - // Free the temporary memory + // Free the temporary memory — sync stream first so set_vertex kernel is done + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(num_vertices)); return vertices; diff --git a/src/shared.h b/src/shared.h index 66ecac7..70eda3e 100644 --- a/src/shared.h +++ b/src/shared.h @@ -3,12 +3,17 @@ #include #include #include +#include #include "utils.h" #include "cumesh.h" namespace cumesh { +inline cudaStream_t current_stream() { + return at::cuda::getCurrentCUDAStream().stream(); +} + template __global__ void arange_kernel(T* array, int N, int stride=1) { @@ -158,33 +163,33 @@ __global__ void get_diff_kernel( template int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nullptr) { + cudaStream_t stream = current_stream(); int *cu_indices, *cu_indices_argsorted; + int *cu_num = nullptr; T *cu_ids_sorted; CUDA_CHECK(cudaMalloc(&cu_indices, N * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_indices_argsorted, N * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_ids_sorted, N * sizeof(T))); - arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_indices, N); + arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_indices, N); CUDA_CHECK(cudaGetLastError()); size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, ids, cu_ids_sorted, cu_indices, cu_indices_argsorted, - N + N, 0, sizeof(T) * 8, stream )); cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( cub_temp_storage.ptr, temp_storage_bytes, ids, cu_ids_sorted, cu_indices, cu_indices_argsorted, - N + N, 0, sizeof(T) * 8, stream )); - CUDA_CHECK(cudaFree(cu_indices)); - // get diff T* cu_new_ids; CUDA_CHECK(cudaMalloc(&cu_new_ids, N * sizeof(T))); - get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_ids_sorted, cu_new_ids, N @@ -193,40 +198,37 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu // inverse if (inverse) { - int* cu_num; CUDA_CHECK(cudaMalloc(&cu_num, sizeof(int))); temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_ids_sorted, cu_new_ids, inverse, cu_num, - N + N, stream )); cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( cub_temp_storage.ptr, temp_storage_bytes, cu_ids_sorted, cu_new_ids, inverse, cu_num, - N + N, stream )); - CUDA_CHECK(cudaFree(cu_num)); } - CUDA_CHECK(cudaFree(cu_ids_sorted)); - + // scan diff temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_new_ids, - N + N, stream )); cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( cub_temp_storage.ptr, temp_storage_bytes, cu_new_ids, - N + N, stream )); - + // scatter - scatter_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + scatter_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_indices_argsorted, cu_new_ids, N, @@ -234,10 +236,16 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu ); CUDA_CHECK(cudaGetLastError()); T num_components; - CUDA_CHECK(cudaMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); num_components += 1; + + // Free all scratch memory — stream is synced, all GPU work is done + CUDA_CHECK(cudaFree(cu_indices)); + CUDA_CHECK(cudaFree(cu_ids_sorted)); CUDA_CHECK(cudaFree(cu_new_ids)); CUDA_CHECK(cudaFree(cu_indices_argsorted)); + if (cu_num) CUDA_CHECK(cudaFree(cu_num)); return static_cast(num_components); } @@ -247,6 +255,7 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu template void print_max_val(T* ptr, size_t size) { + cudaStream_t stream = current_stream(); T* dbg_cu_max_val; CUDA_CHECK(cudaMalloc(&dbg_cu_max_val, sizeof(T))); size_t temp_storage_bytes = 0; @@ -254,7 +263,7 @@ void print_max_val(T* ptr, size_t size) { nullptr, temp_storage_bytes, ptr, dbg_cu_max_val, - size + size, stream )); char* temp_storage; CUDA_CHECK(cudaMalloc(&temp_storage, temp_storage_bytes)); @@ -262,10 +271,11 @@ void print_max_val(T* ptr, size_t size) { temp_storage, temp_storage_bytes, ptr, dbg_cu_max_val, - size + size, stream )); T h_max_val; - CUDA_CHECK(cudaMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << "Max value: " << h_max_val << std::endl; CUDA_CHECK(cudaFree(dbg_cu_max_val)); CUDA_CHECK(cudaFree(temp_storage)); @@ -273,8 +283,10 @@ void print_max_val(T* ptr, size_t size) { template void print_val(T* ptr, size_t size) { + cudaStream_t stream = current_stream(); T h_ptr[size]; - CUDA_CHECK(cudaMemcpy(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); for (size_t i = 0; i < size; i++) { std::cout << h_ptr[i] << " "; } @@ -282,4 +294,4 @@ void print_val(T* ptr, size_t size) { } -} // namespace cumesh \ No newline at end of file +} // namespace cumesh diff --git a/src/simplify.cu b/src/simplify.cu index 9efde9e..b5bbcbb 100644 --- a/src/simplify.cu +++ b/src/simplify.cu @@ -1,6 +1,8 @@ #include "cumesh.h" #include "dtypes.cuh" #include +#include +#include "shared.h" namespace cumesh { @@ -21,7 +23,7 @@ __device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, f /** * Get the QEM for each vertex - * + * * @param vertices: the vertices of the mesh, shape (V) * @param faces: the faces of the mesh, shape (F) * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) @@ -64,12 +66,13 @@ static __global__ void get_qem_kernel( * Get the QEM for each vertex */ void get_qem( - CuMesh& ctx + CuMesh& ctx, + cudaStream_t stream ) { size_t V = ctx.vertices.size; size_t F = ctx.faces.size; ctx.temp_storage.resize(V * sizeof(QEM)); - get_qem_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_qem_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( ctx.vertices.ptr, ctx.faces.ptr, ctx.vert2face.ptr, @@ -139,7 +142,7 @@ inline __device__ bool process_incident_tri( /** * Get the cost for each edge collapse - * + * * @param vertices: the vertices of the mesh, shape (V) * @param faces: the faces of the mesh, shape (F) * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) @@ -228,13 +231,14 @@ static __global__ void get_edge_collapse_cost_kernel( void get_edge_collapse_cost( CuMesh& ctx, float lambda_edge_length, - float lambda_skinny + float lambda_skinny, + cudaStream_t stream ) { size_t V = ctx.vertices.size; size_t F = ctx.faces.size; size_t E = ctx.edges.size; ctx.edge_collapse_costs.resize(E); - get_edge_collapse_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_edge_collapse_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( ctx.vertices.ptr, ctx.faces.ptr, ctx.vert2face.ptr, @@ -252,7 +256,7 @@ void get_edge_collapse_cost( /** * Propagate cost to neighboring faces - * + * * @param edges: the buffer for edges, shape (E) * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt) * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1) @@ -296,14 +300,15 @@ static __global__ void propagate_cost_kernel( * Propagate cost to neighboring faces */ void propagate_cost( - CuMesh& ctx + CuMesh& ctx, + cudaStream_t stream ) { size_t V = ctx.vertices.size; size_t F = ctx.faces.size; size_t E = ctx.edges.size; ctx.propagated_costs.resize(F); ctx.propagated_costs.fill(std::numeric_limits::max()); - propagate_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + propagate_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( ctx.edges.ptr, ctx.vert2face.ptr, ctx.vert2face_offset.ptr, @@ -317,7 +322,7 @@ void propagate_cost( /** * Collapse edges parallelly - * + * * @param vertices: the vertices of the mesh, shape (V) * @param faces: the faces of the mesh, shape (F) * @param edges: the buffer for edges, shape (E) @@ -433,7 +438,7 @@ static __global__ void compress_faces_kernel( if (is_kept) { new_faces[new_id].x = vertices_map[old_faces[tid].x]; new_faces[new_id].y = vertices_map[old_faces[tid].y]; - new_faces[new_id].z = vertices_map[old_faces[tid].z]; + new_faces[new_id].z = vertices_map[old_faces[tid].z]; } } @@ -443,7 +448,8 @@ static __global__ void compress_faces_kernel( */ void collapse_edges( CuMesh& ctx, - float collapse_thresh + float collapse_thresh, + cudaStream_t stream ) { size_t V = ctx.vertices.size; size_t F = ctx.faces.size; @@ -452,7 +458,7 @@ void collapse_edges( ctx.faces_map.resize(F + 1); ctx.vertices_map.fill(1); ctx.faces_map.fill(1); - collapse_edges_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + collapse_edges_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( ctx.vertices.ptr, ctx.faces.ptr, ctx.edges.ptr, @@ -467,24 +473,25 @@ void collapse_edges( ctx.faces_map.ptr ); CUDA_CHECK(cudaGetLastError()); - + // update vertices buffer // get vertices map size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - ctx.vertices_map.ptr, V+1 + ctx.vertices_map.ptr, V+1, stream )); ctx.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, - ctx.vertices_map.ptr, V+1 + ctx.vertices_map.ptr, V+1, stream )); int new_num_vertices; - CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // compress vertices ctx.temp_storage.resize(new_num_vertices * sizeof(float3)); - compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( ctx.vertices_map.ptr, ctx.vertices.ptr, V, @@ -497,18 +504,19 @@ void collapse_edges( // get faces map CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - ctx.faces_map.ptr, F+1 + ctx.faces_map.ptr, F+1, stream )); ctx.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, - ctx.faces_map.ptr, F+1 + ctx.faces_map.ptr, F+1, stream )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // compress faces ctx.temp_storage.resize(new_num_faces * sizeof(int3)); - compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( ctx.faces_map.ptr, ctx.vertices_map.ptr, ctx.faces.ptr, @@ -521,12 +529,13 @@ void collapse_edges( std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambda_skinny, float threshold, bool timing) { + cudaStream_t stream = current_stream(); std::chrono::high_resolution_clock::time_point start, end; if (timing) start = std::chrono::high_resolution_clock::now(); this->get_vertex_face_adjacency(); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaStreamSynchronize(stream)); end = std::chrono::high_resolution_clock::now(); std::cout << "get_vertex_face_adjacency: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -535,39 +544,39 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd this->get_edges(); this->get_boundary_info(); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaStreamSynchronize(stream)); end = std::chrono::high_resolution_clock::now(); std::cout << "get_edges: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } if (timing) start = std::chrono::high_resolution_clock::now(); - get_qem(*this); + get_qem(*this, stream); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaStreamSynchronize(stream)); end = std::chrono::high_resolution_clock::now(); std::cout << "get_qem: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } if (timing) start = std::chrono::high_resolution_clock::now(); - get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny); + get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny, stream); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaStreamSynchronize(stream)); end = std::chrono::high_resolution_clock::now(); std::cout << "get_edge_collapse_cost: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } if (timing) start = std::chrono::high_resolution_clock::now(); - propagate_cost(*this); + propagate_cost(*this, stream); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaStreamSynchronize(stream)); end = std::chrono::high_resolution_clock::now(); std::cout << "propagate_cost: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } if (timing) start = std::chrono::high_resolution_clock::now(); - collapse_edges(*this, threshold); + collapse_edges(*this, threshold, stream); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaStreamSynchronize(stream)); end = std::chrono::high_resolution_clock::now(); std::cout << "collapse_edges: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } diff --git a/src/utils.h b/src/utils.h index f15823b..aeaf2d8 100644 --- a/src/utils.h +++ b/src/utils.h @@ -4,6 +4,7 @@ #include #include #include +#include #define CUDA_CHECK(call) \ do { \ @@ -43,7 +44,11 @@ struct Buffer { } void free() { - if (ptr != nullptr) CUDA_CHECK(cudaFree(ptr)); + if (ptr != nullptr) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(ptr)); + } ptr = nullptr; size = 0; capacity = 0; @@ -62,7 +67,9 @@ struct Buffer { if (new_size > capacity) { T* new_ptr; CUDA_CHECK(cudaMalloc(&new_ptr, new_size * sizeof(T))); - CUDA_CHECK(cudaMemcpy(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + CUDA_CHECK(cudaMemcpyAsync(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(ptr)); ptr = new_ptr; this->capacity = new_size; @@ -71,12 +78,15 @@ struct Buffer { } void zero() { - CUDA_CHECK(cudaMemset(ptr, 0, size * sizeof(T))); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + CUDA_CHECK(cudaMemsetAsync(ptr, 0, size * sizeof(T), stream)); } void fill(T val) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); std::vector tmp(size, val); - CUDA_CHECK(cudaMemcpy(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } };