diff --git a/src/atlas.cu b/src/atlas.cu
index 2d78d8c..d48031f 100644
--- a/src/atlas.cu
+++ b/src/atlas.cu
@@ -2,6 +2,7 @@
 #include "dtypes.cuh"
 #include "shared.h"
 #include <cub/cub.cuh>
+#include <c10/cuda/CUDAStream.h>
 
 
 namespace cumesh {
@@ -39,7 +40,7 @@ __device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, f
 // ) {
 //     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 //     if (tid >= F) return;
-// 
+//
 //     float3 n = face_normals[tid];
 //     chart_normal_cones[tid] = make_float4(n.x, n.y, n.z, 0.0f); // half angle = 0
 // }
@@ -59,7 +60,7 @@ static __global__ void init_chart_adj_kernel(
 
     int f0 = face_adj[tid].x;
     int f1 = face_adj[tid].y;
-    
+
     int c0 = chart_ids[f0];
     int c1 = chart_ids[f1];
 
@@ -77,7 +78,7 @@ static __global__ void init_chart_adj_kernel(
     int3 tri1 = faces[f1];
 
     int t0_indices[3] = {tri0.x, tri0.y, tri0.z};
-    int common_v_indices[2]; 
+    int common_v_indices[2];
     int found_count = 0;
 
     #pragma unroll
@@ -277,7 +278,8 @@ static __global__ void collapse_edges_kernel(
 
 
 static void get_chart_connectivity(
-    CuMesh& mesh
+    CuMesh& mesh,
+    cudaStream_t stream
 ) {
     size_t M = mesh.manifold_face_adj.size;
 
@@ -289,7 +291,7 @@ static void get_chart_connectivity(
     CUDA_CHECK(cudaMalloc(&cu_raw_lengths, M * sizeof(float)));
     CUDA_CHECK(cudaMalloc(&cu_sorted_lengths, M * sizeof(float)));
 
-    init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         mesh.vertices.ptr,
         mesh.faces.ptr,
         mesh.manifold_face_adj.ptr,
@@ -309,7 +311,7 @@ static void get_chart_connectivity(
         reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr),
         cu_raw_lengths,
         cu_sorted_lengths,
-        M
+        M, 0, sizeof(uint64_t) * 8, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
@@ -318,16 +320,15 @@ static void get_chart_connectivity(
         reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr),
         cu_raw_lengths,
         cu_sorted_lengths,
-        M
+        M, 0, sizeof(uint64_t) * 8, stream
     ));
-    CUDA_CHECK(cudaFree(cu_raw_lengths));
-	
+
     #if CUDART_VERSION >= 12090
         auto reduce_op = ::cuda::std::plus();
     #else
         auto reduce_op = cub::Sum();
     #endif
-	
+
 
     // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths)
     int* cu_num_chart_adjs;
@@ -340,8 +341,8 @@ static void get_chart_connectivity(
         cu_sorted_lengths,
         mesh.atlas_chart_adj_length.ptr,
         cu_num_chart_adjs,
-		reduce_op,
-        M
+        reduce_op,
+        M, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceReduce::ReduceByKey(
@@ -351,19 +352,22 @@ static void get_chart_connectivity(
         cu_sorted_lengths,
         mesh.atlas_chart_adj_length.ptr,
         cu_num_chart_adjs,
-		reduce_op,
-        M
+        reduce_op,
+        M, stream
     ));
-    CUDA_CHECK(cudaMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     mesh.atlas_chart_adj_length.size = mesh.atlas_chart_adj.size;
+    CUDA_CHECK(cudaFree(cu_raw_lengths));
     CUDA_CHECK(cudaFree(cu_sorted_lengths));
     CUDA_CHECK(cudaFree(cu_num_chart_adjs));
     // Remove invalid edge (UINT64_MAX) if present
     // Since we sorted, invalid edges are at the end.
     uint64_t last_key;
     if (mesh.atlas_chart_adj.size > 0) {
-        CUDA_CHECK(cudaMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost));
-        if (last_key == UINT64_MAX) { 
+        CUDA_CHECK(cudaMemcpyAsync(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+        if (last_key == UINT64_MAX) {
             mesh.atlas_chart_adj.size -= 1;
             mesh.atlas_chart_adj_length.size -= 1;
         }
@@ -381,7 +385,7 @@ static void get_chart_connectivity(
     mesh.atlas_chart2edge_cnt.zero();
     mesh.atlas_chart_perims.resize(C);
     mesh.atlas_chart_perims.zero();
-    get_chart_edge_cnt_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_chart_edge_cnt_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         mesh.atlas_chart_adj.ptr,
         mesh.atlas_chart_adj_length.ptr,
         E,
@@ -396,19 +400,19 @@ static void get_chart_connectivity(
         nullptr, temp_storage_bytes,
         mesh.atlas_chart2edge_cnt.ptr,
         mesh.atlas_chart2edge_offset.ptr,
-        C + 1
+        C + 1, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         mesh.atlas_chart2edge_cnt.ptr,
         mesh.atlas_chart2edge_offset.ptr,
-        C + 1
+        C + 1, stream
     ));
     // 2.3 Fill CSR format for chart-edge connectivity
     mesh.atlas_chart2edge.resize(2 * E); // each edge connects two charts
     mesh.atlas_chart2edge_cnt.zero();
-    get_chart_edge_adjacency_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_chart_edge_adjacency_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         mesh.atlas_chart_adj.ptr,
         E,
         mesh.atlas_chart2edge.ptr,
@@ -480,7 +484,8 @@ static __global__ void update_normal_cones_kernel(
 
 
 void compute_chart_normal_cones(
-    CuMesh& mesh
+    CuMesh& mesh,
+    cudaStream_t stream
 ) {
     size_t C = mesh.atlas_num_charts;
     size_t F = mesh.faces.size;
@@ -492,7 +497,7 @@ void compute_chart_normal_cones(
     CUDA_CHECK(cudaMalloc(&sorted_chart_ids, F * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&faces_ids, F * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&argsorted_faces_ids, F * sizeof(int)));
-    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         faces_ids,
         F
     );
@@ -502,17 +507,18 @@ void compute_chart_normal_cones(
         nullptr, temp_storage_bytes,
         mesh.atlas_chart_ids.ptr, sorted_chart_ids,
         faces_ids, argsorted_faces_ids,
-        F
+        F, 0, sizeof(int) * 8, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         mesh.atlas_chart_ids.ptr, sorted_chart_ids,
         faces_ids, argsorted_faces_ids,
-        F
+        F, 0, sizeof(int) * 8, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(faces_ids));
-    
+
     // 2. Get CSR format for chart-face assignment
     int* cu_chart_size;
     int* cu_num_charts;
@@ -523,14 +529,15 @@ void compute_chart_normal_cones(
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         nullptr, temp_storage_bytes,
         sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts,
-        F
+        F, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts,
-        F
+        F, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_num_charts));
     CUDA_CHECK(cudaFree(cu_unique_chart_ids));
 
@@ -540,20 +547,21 @@ void compute_chart_normal_cones(
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         cu_chart_size, cu_chart_offsets,
-        C + 1
+        C + 1, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         cu_chart_size, cu_chart_offsets,
-        C + 1
+        C + 1, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_chart_size));
 
     // 3. Compute chart normals and areas
     float* cu_sorted_face_areas;
     CUDA_CHECK(cudaMalloc(&cu_sorted_face_areas, F * sizeof(float)));
-    index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         mesh.face_areas.ptr,
         argsorted_faces_ids,
         F,
@@ -565,26 +573,30 @@ void compute_chart_normal_cones(
         nullptr, temp_storage_bytes,
         cu_sorted_face_areas, mesh.atlas_chart_areas.ptr,
         C,
-        cu_chart_offsets, cu_chart_offsets + 1
+        cu_chart_offsets, cu_chart_offsets + 1,
+        stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSegmentedReduce::Sum(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         cu_sorted_face_areas, mesh.atlas_chart_areas.ptr,
         C,
-        cu_chart_offsets, cu_chart_offsets + 1
+        cu_chart_offsets, cu_chart_offsets + 1,
+        stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_sorted_face_areas));
 
     float3* cu_sorted_face_normals;
     CUDA_CHECK(cudaMalloc(&cu_sorted_face_normals, F * sizeof(float3)));
-    index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         mesh.face_normals.ptr,
         argsorted_faces_ids,
         F,
         cu_sorted_face_normals
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(argsorted_faces_ids));
     float3* cu_chart_normals;
     CUDA_CHECK(cudaMalloc(&cu_chart_normals, C * sizeof(float3)));
@@ -594,7 +606,8 @@ void compute_chart_normal_cones(
         C,
         cu_chart_offsets, cu_chart_offsets + 1,
         Float3Add(),
-        make_float3(0.0f, 0.0f, 0.0f)
+        make_float3(0.0f, 0.0f, 0.0f),
+        stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSegmentedReduce::Reduce(
@@ -603,9 +616,10 @@ void compute_chart_normal_cones(
         C,
         cu_chart_offsets, cu_chart_offsets + 1,
         Float3Add(),
-        make_float3(0.0f, 0.0f, 0.0f)
+        make_float3(0.0f, 0.0f, 0.0f),
+        stream
     ));
-    normalize_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    normalize_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_chart_normals,
         C
     );
@@ -614,7 +628,7 @@ void compute_chart_normal_cones(
     // 4. Compute normal difference
     float* cu_normal_diff;
     CUDA_CHECK(cudaMalloc(&cu_normal_diff, F * sizeof(float)));
-    normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_chart_normals,
         cu_sorted_face_normals,
         sorted_chart_ids,
@@ -622,6 +636,7 @@ void compute_chart_normal_cones(
         cu_normal_diff
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_sorted_face_normals));
     CUDA_CHECK(cudaFree(sorted_chart_ids));
 
@@ -633,27 +648,31 @@ void compute_chart_normal_cones(
         nullptr, temp_storage_bytes,
         cu_normal_diff, cu_new_cone_half_angles,
         C,
-        cu_chart_offsets, cu_chart_offsets + 1
+        cu_chart_offsets, cu_chart_offsets + 1,
+        stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSegmentedReduce::Max(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         cu_normal_diff, cu_new_cone_half_angles,
         C,
-        cu_chart_offsets, cu_chart_offsets + 1
+        cu_chart_offsets, cu_chart_offsets + 1,
+        stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_chart_offsets));
     CUDA_CHECK(cudaFree(cu_normal_diff));
 
     // 6. Update chart normal cones
     mesh.atlas_chart_normal_cones.resize(C);
-    update_normal_cones_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    update_normal_cones_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         mesh.atlas_chart_normal_cones.ptr,
         cu_chart_normals,
         cu_new_cone_half_angles,
         C
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_chart_normals));
     CUDA_CHECK(cudaFree(cu_new_cone_half_angles));
 }
@@ -677,7 +696,7 @@ static __global__ void refine_charts_kernel(
 
     // 1. Load current face data
     int current_c = chart_ids[fid];
-    Vec3f n(face_normals[fid]); 
+    Vec3f n(face_normals[fid]);
 
     // local register cache for candidate list (triangle has at most 3 neighbors, plus self, max 4 candidates)
     int candidates[4];
@@ -691,7 +710,7 @@ static __global__ void refine_charts_kernel(
 
     // 2. Iterate over 3 edges to aggregate smooth scores
     int eids[3] = { face2edge[fid].x, face2edge[fid].y, face2edge[fid].z };
-    
+
     #pragma unroll
     for (int i = 0; i < 3; i++) {
         int eid = eids[i];
@@ -740,7 +759,7 @@ static __global__ void refine_charts_kernel(
 
     for (int i = 0; i < num_candidates; ++i) {
         int c = candidates[i];
-        
+
         // A. Geom score
         float4 cone = chart_normal_cones[c];
         Vec3f axis(cone.x, cone.y, cone.z);
@@ -769,7 +788,7 @@ static __global__ void refine_charts_kernel(
             // new best is significantly better than current best
             best_total_score = total_score;
             best_c = c;
-        } 
+        }
         else if (abs(diff) <= epsilon) {
             // scores are very close, break tie by choosing smaller ID
             if (c < best_c) {
@@ -822,13 +841,14 @@ __global__ void hook_edges_if_same_chart_kernel(
 
 
 static void reassign_chart_ids(
-    CuMesh& mesh
+    CuMesh& mesh,
+    cudaStream_t stream
 ) {
     size_t F = mesh.faces.size;
     size_t M = mesh.manifold_face_adj.size;
 
     mesh.temp_storage.resize(F * sizeof(int));        // Use as parent for DSU
-    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         reinterpret_cast<int*>(mesh.temp_storage.ptr),
         F
     );
@@ -838,10 +858,11 @@ static void reassign_chart_ids(
     CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int)));
     do {
         h_end_flag = 1;
-        CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
 
         // Hook
-        hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             mesh.manifold_face_adj.ptr,
             mesh.atlas_chart_ids.ptr,
             M,
@@ -851,15 +872,16 @@ static void reassign_chart_ids(
         CUDA_CHECK(cudaGetLastError());
 
         // Compress
-        compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             reinterpret_cast<int*>(mesh.temp_storage.ptr),
             F
         );
         CUDA_CHECK(cudaGetLastError());
-        CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } while (h_end_flag == 0);
     CUDA_CHECK(cudaFree(cu_end_flag));
-    
+
     swap_buffers(mesh.atlas_chart_ids, mesh.temp_storage);
     mesh.atlas_num_charts = compress_ids(mesh.atlas_chart_ids.ptr, F, mesh.cub_temp_storage);
 }
@@ -930,7 +952,8 @@ static __global__ void unpack_vertex_ids_kernel(
 
 
 void construct_chart_mesh(
-    CuMesh& mesh
+    CuMesh& mesh,
+    cudaStream_t stream
 ) {
     size_t F = mesh.faces.size;
 
@@ -943,7 +966,7 @@ void construct_chart_mesh(
     CUDA_CHECK(cudaMalloc(&cu_sorted_chart_ids, F * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_face_idx, F * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_sorted_face_idx, F * sizeof(int)));
-    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_face_idx,
         F
     );
@@ -953,15 +976,16 @@ void construct_chart_mesh(
         nullptr, temp_storage_bytes,
         mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids,
         cu_face_idx, cu_sorted_face_idx,
-        F
+        F, 0, sizeof(int) * 8, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids,
         cu_face_idx, cu_sorted_face_idx,
-        F
+        F, 0, sizeof(int) * 8, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_face_idx));
     // 2. RLE for chart size
     int* cu_chart_size;
@@ -974,14 +998,15 @@ void construct_chart_mesh(
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         nullptr, temp_storage_bytes,
         cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart,
-        F
+        F, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart,
-        F
+        F, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_unique_chart_ids));
     CUDA_CHECK(cudaFree(cu_num_chart));
     // 3. Exclusive scan for chart face offset
@@ -989,19 +1014,20 @@ void construct_chart_mesh(
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         cu_chart_size, mesh.atlas_chart_faces_offset.ptr,
-        mesh.atlas_num_charts + 1
+        mesh.atlas_num_charts + 1, stream
     ));
     mesh.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         mesh.cub_temp_storage.ptr, temp_storage_bytes,
         cu_chart_size, mesh.atlas_chart_faces_offset.ptr,
-        mesh.atlas_num_charts + 1
+        mesh.atlas_num_charts + 1, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_chart_size));
     // 4. Expand chart ids and vertex ids
     uint64_t* cu_pack;
     CUDA_CHECK(cudaMalloc(&cu_pack, 3 * F * sizeof(uint64_t)));
-    expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_sorted_chart_ids,
         cu_sorted_face_idx,
         mesh.faces.ptr,
@@ -1009,6 +1035,7 @@ void construct_chart_mesh(
         cu_pack
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_sorted_chart_ids));
     CUDA_CHECK(cudaFree(cu_sorted_face_idx));
     // 5. Compress pair to construct all maps
@@ -1022,32 +1049,35 @@ void construct_chart_mesh(
     );
     mesh.atlas_chart_vertex_map.resize(new_num_vertices);
     mesh.atlas_chart_vertex_offset.resize(mesh.atlas_num_charts + 1);
-    unpack_vertex_ids_kernel<<<(new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    unpack_vertex_ids_kernel<<<(new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_inverse_pack,
         new_num_vertices,
         mesh.atlas_chart_vertex_map.ptr,
         mesh.atlas_chart_vertex_offset.ptr
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_inverse_pack));
-    unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_pack,
         F,
         mesh.atlas_chart_faces.ptr
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_inverse_pack));
     CUDA_CHECK(cudaFree(cu_pack));
 }
 
 
 void CuMesh::compute_charts(
-    float threshold_cone_half_angle_rad, 
-    int refine_iterations, 
-    int global_iterations, 
+    float threshold_cone_half_angle_rad,
+    int refine_iterations,
+    int global_iterations,
     float smooth_strength,
     float area_penalty_weight,
     float perimeter_area_ratio_weight
 ) {
+    cudaStream_t stream = current_stream();
+
     if (this->manifold_face_adj.is_empty()) {
         this->get_manifold_face_adjacency();
     }
@@ -1062,7 +1092,7 @@ void CuMesh::compute_charts(
     size_t F = this->faces.size;
     this->atlas_chart_ids.resize(F);
     this->atlas_num_charts = F;
-    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->atlas_chart_ids.ptr,
         F
     );
@@ -1074,19 +1104,20 @@ void CuMesh::compute_charts(
     for (int i = 0; i < global_iterations; i++) {
         while (true) {
             h_end_flag = 1;
-            CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice));
+            CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream));
+            CUDA_CHECK(cudaStreamSynchronize(stream));
 
             // 1. Compute chart connectivity
-            get_chart_connectivity(*this);
+            get_chart_connectivity(*this, stream);
             if (this->atlas_chart_adj.size == 0) break;
 
             // 2. Compute normal cones
-            compute_chart_normal_cones(*this);
+            compute_chart_normal_cones(*this, stream);
 
             // 3. Compute chart adjacency cost
             size_t E = this->atlas_chart_adj.size;
             this->edge_collapse_costs.resize(E);
-            compute_chart_adjacency_cost_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+            compute_chart_adjacency_cost_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                 this->atlas_chart_adj.ptr,
                 this->atlas_chart_normal_cones.ptr,
                 this->atlas_chart_adj_length.ptr,
@@ -1097,27 +1128,27 @@ void CuMesh::compute_charts(
                 E,
                 this->edge_collapse_costs.ptr
             );
-            CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize());
+            CUDA_CHECK(cudaGetLastError());
 
             // 4. Propagate costs
             size_t C = this->atlas_num_charts;
             this->propagated_costs.resize(C);
-            propagate_cost_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+            propagate_cost_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                 this->atlas_chart2edge.ptr,
                 this->atlas_chart2edge_offset.ptr,
                 this->edge_collapse_costs.ptr,
                 C,
                 this->propagated_costs.ptr
             );
-            CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize());
+            CUDA_CHECK(cudaGetLastError());
 
             // 5. Collapse edges
             this->vertices_map.resize(C);      // store collapse map
-            arange_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+            arange_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                 this->vertices_map.ptr,
                 C
             );
-            collapse_edges_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+            collapse_edges_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                 this->atlas_chart_adj.ptr,
                 this->edge_collapse_costs.ptr,
                 this->propagated_costs.ptr,
@@ -1127,30 +1158,31 @@ void CuMesh::compute_charts(
                 this->atlas_chart_normal_cones.ptr,
                 cu_end_flag
             );
-            CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize());
+            CUDA_CHECK(cudaGetLastError());
 
             // End of iteration
-            CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost));
+            CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream));
+            CUDA_CHECK(cudaStreamSynchronize(stream));
             if (h_end_flag == 1) break;
 
             // 6. Compress chart ids
             this->atlas_num_charts = compress_ids(this->vertices_map.ptr, C, this->cub_temp_storage);
             this->temp_storage.resize(F * sizeof(int));
-            index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+            index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                 this->vertices_map.ptr,
                 this->atlas_chart_ids.ptr,
                 F,
                 reinterpret_cast<int*>(this->temp_storage.ptr)
             );
-            CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize());
+            CUDA_CHECK(cudaGetLastError());
             swap_buffers(this->atlas_chart_ids, this->temp_storage);
         }
 
         // Refine charts
         for (int j = 0; j < refine_iterations; j++) {
-            compute_chart_normal_cones(*this);
+            compute_chart_normal_cones(*this, stream);
             this->temp_storage.resize(F * sizeof(int));
-            refine_charts_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+            refine_charts_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                 this->atlas_chart_normal_cones.ptr,
                 this->face_normals.ptr,
                 this->vertices.ptr,
@@ -1169,53 +1201,55 @@ void CuMesh::compute_charts(
         }
 
         // After refinement, the chart may become disconnected, so we need to re-assign chart ids
-        reassign_chart_ids(*this);
+        reassign_chart_ids(*this, stream);
     }
     CUDA_CHECK(cudaFree(cu_end_flag));
 
     // Finalizing: calculate vmap, chart face and chart face offset
-    construct_chart_mesh(*this);
+    construct_chart_mesh(*this, stream);
 }
 
 
 std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> CuMesh::read_atlas_charts() {
+    cudaStream_t stream = current_stream();
+
     auto chart_ids = torch::empty({ static_cast<int64_t>(this->faces.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA));
-    CUDA_CHECK(cudaMemcpy(
+    CUDA_CHECK(cudaMemcpyAsync(
         chart_ids.data_ptr<int>(),
         this->atlas_chart_ids.ptr,
         this->faces.size * sizeof(int),
-        cudaMemcpyDeviceToDevice
+        cudaMemcpyDeviceToDevice, stream
     ));
     auto vertex_map = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_map.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA));
-    CUDA_CHECK(cudaMemcpy(
+    CUDA_CHECK(cudaMemcpyAsync(
         vertex_map.data_ptr<int>(),
         this->atlas_chart_vertex_map.ptr,
         this->atlas_chart_vertex_map.size * sizeof(int),
-        cudaMemcpyDeviceToDevice
+        cudaMemcpyDeviceToDevice, stream
     ));
     auto chart_faces = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces.size), 3 }, torch::dtype(torch::kInt32).device(torch::kCUDA));
-    CUDA_CHECK(cudaMemcpy(
+    CUDA_CHECK(cudaMemcpyAsync(
         chart_faces.data_ptr<int>(),
         this->atlas_chart_faces.ptr,
         this->atlas_chart_faces.size * 3 * sizeof(int),
-        cudaMemcpyDeviceToDevice
+        cudaMemcpyDeviceToDevice, stream
     ));
     auto chart_vertex_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA));
-    CUDA_CHECK(cudaMemcpy(
+    CUDA_CHECK(cudaMemcpyAsync(
         chart_vertex_offset.data_ptr<int>(),
         this->atlas_chart_vertex_offset.ptr,
         this->atlas_chart_vertex_offset.size * sizeof(int),
-        cudaMemcpyDeviceToDevice
+        cudaMemcpyDeviceToDevice, stream
     ));
     auto chart_face_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA));
-    CUDA_CHECK(cudaMemcpy(
+    CUDA_CHECK(cudaMemcpyAsync(
         chart_face_offset.data_ptr<int>(),
         this->atlas_chart_faces_offset.ptr,
         this->atlas_chart_faces_offset.size * sizeof(int),
-        cudaMemcpyDeviceToDevice
+        cudaMemcpyDeviceToDevice, stream
     ));
     return std::make_tuple(this->atlas_num_charts, chart_ids, vertex_map, chart_faces, chart_vertex_offset, chart_face_offset);
 }
 
 
-} // namespace cumesh
\ No newline at end of file
+} // namespace cumesh
diff --git a/src/clean_up.cu b/src/clean_up.cu
index 3c12cd7..a72b1d0 100644
--- a/src/clean_up.cu
+++ b/src/clean_up.cu
@@ -2,6 +2,7 @@
 #include "dtypes.cuh"
 #include "shared.h"
 #include <cub/cub.cuh>
+#include <c10/cuda/CUDAStream.h>
 
 
 namespace cumesh {
@@ -31,6 +32,7 @@ static __global__ void copy_T_to_T3_kernel(
 
 
 void CuMesh::remove_faces(torch::Tensor& face_mask) {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
 
     size_t temp_storage_bytes = 0;
@@ -41,18 +43,20 @@ void CuMesh::remove_faces(torch::Tensor& face_mask) {
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         nullptr, temp_storage_bytes,
         this->faces.ptr, face_mask.data_ptr<bool>(), cu_new_faces, cu_new_num_faces,
-        F
+        F, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->faces.ptr, face_mask.data_ptr<bool>(), cu_new_faces, cu_new_num_faces,
-        F
+        F, stream
     ));
     int new_num_faces;
-    CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     this->faces.resize(new_num_faces);
-    CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpyAsync(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_new_num_faces));
     CUDA_CHECK(cudaFree(cu_new_faces));
 
@@ -61,6 +65,7 @@ void CuMesh::remove_faces(torch::Tensor& face_mask) {
 
 
 void CuMesh::_remove_faces(uint8_t* face_mask) {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
 
     size_t temp_storage_bytes = 0;
@@ -71,18 +76,20 @@ void CuMesh::_remove_faces(uint8_t* face_mask) {
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         nullptr, temp_storage_bytes,
         this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces,
-        F
+        F, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces,
-        F
+        F, stream
     ));
     int new_num_faces;
-    CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     this->faces.resize(new_num_faces);
-    CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpyAsync(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_new_num_faces));
     CUDA_CHECK(cudaFree(cu_new_faces));
 
@@ -134,14 +141,15 @@ static __global__ void remap_faces_kernel(
 
 
 void CuMesh::remove_unreferenced_vertices() {
+    cudaStream_t stream = current_stream();
     size_t V = this->vertices.size;
     size_t F = this->faces.size;
 
     // Mark referenced vertices
     int* cu_vertex_is_referenced;
     CUDA_CHECK(cudaMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int)));
-    CUDA_CHECK(cudaMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int)));
-    set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    CUDA_CHECK(cudaMemsetAsync(cu_vertex_is_referenced, 0, (V+1) * sizeof(int), stream));
+    set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->faces.ptr,
         F,
         cu_vertex_is_referenced
@@ -152,19 +160,20 @@ void CuMesh::remove_unreferenced_vertices() {
     size_t temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        cu_vertex_is_referenced, V+1
+        cu_vertex_is_referenced, V+1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
-        cu_vertex_is_referenced, V+1
+        cu_vertex_is_referenced, V+1, stream
     ));
     int new_num_vertices;
-    CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
 
     // Compress vertices
     this->temp_storage.resize(new_num_vertices * sizeof(float3));
-    compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_vertex_is_referenced,
         this->vertices.ptr,
         V,
@@ -174,12 +183,13 @@ void CuMesh::remove_unreferenced_vertices() {
     swap_buffers(this->temp_storage, this->vertices);
 
     // Update faces
-    remap_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    remap_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_vertex_is_referenced,
         F,
         this->faces.ptr
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_vertex_is_referenced));
 
     // Delete all cached info since mesh has changed
@@ -237,16 +247,17 @@ struct int3_decomposer
 
 
 void CuMesh::remove_duplicate_faces() {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
 
     // Create a temporary sorted copy of faces for duplicate detection
     // Do NOT modify the original faces to preserve vertex order and normals
     int3 *cu_sorted_faces;
     CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3)));
-    CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpyAsync(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice, stream));
 
     // Sort vertices within each face (in the temporary copy)
-    sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_sorted_faces,
         F
     );
@@ -256,7 +267,7 @@ void CuMesh::remove_duplicate_faces() {
     size_t temp_storage_bytes = 0;
     int *cu_sorted_face_indices;
     CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int)));
-    arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F);
+    arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_sorted_face_indices, F);
     CUDA_CHECK(cudaGetLastError());
 
     int *cu_sorted_indices_output;
@@ -269,7 +280,8 @@ void CuMesh::remove_duplicate_faces() {
         cu_sorted_faces, cu_sorted_faces_output,
         cu_sorted_face_indices, cu_sorted_indices_output,
         F,
-        int3_decomposer{}
+        int3_decomposer{},
+        stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
@@ -277,34 +289,36 @@ void CuMesh::remove_duplicate_faces() {
         cu_sorted_faces, cu_sorted_faces_output,
         cu_sorted_face_indices, cu_sorted_indices_output,
         F,
-        int3_decomposer{}
+        int3_decomposer{},
+        stream
     ));
-    CUDA_CHECK(cudaFree(cu_sorted_faces));
-    CUDA_CHECK(cudaFree(cu_sorted_face_indices));
 
     // Select first in each group of duplicate faces (based on sorted faces)
     uint8_t* cu_face_mask_sorted;
     CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t)));
-    select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_sorted_faces_output,
         F,
         cu_face_mask_sorted
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_sorted_faces_output));
 
     // Map the mask back to original face order using scatter
     // scatter: output[indices[i]] = values[i]
     // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position]
     uint8_t* cu_face_mask_original;
     CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t)));
-    scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_sorted_indices_output,  // indices: sorted_position -> original_idx
         cu_face_mask_sorted,       // values: mask at sorted_position
         F,
         cu_face_mask_original      // output: mask at original position
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_sorted_faces));
+    CUDA_CHECK(cudaFree(cu_sorted_face_indices));
+    CUDA_CHECK(cudaFree(cu_sorted_faces_output));
     CUDA_CHECK(cudaFree(cu_face_mask_sorted));
     CUDA_CHECK(cudaFree(cu_sorted_indices_output));
 
@@ -434,6 +448,7 @@ struct LessThanOp {
 
 
 void CuMesh::fill_holes(float max_hole_perimeter) {
+    cudaStream_t stream = current_stream();
     if (this->loop_boundaries.is_empty() || this->loop_boundaries_offset.is_empty()) {
         this->get_boundary_loops();
     }
@@ -451,7 +466,7 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     // Compute loop boundary lengths
     float* cu_loop_boundary_lengths;
     CUDA_CHECK(cudaMalloc(&cu_loop_boundary_lengths, E * sizeof(float)));
-    compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->edges.ptr,
         this->loop_boundaries.ptr,
@@ -469,7 +484,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
         cu_loop_boundary_lengths, cu_bound_loop_perimeters,
         L,
         this->loop_boundaries_offset.ptr,
-        this->loop_boundaries_offset.ptr + 1
+        this->loop_boundaries_offset.ptr + 1,
+        stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSegmentedReduce::Sum(
@@ -477,14 +493,14 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
         cu_loop_boundary_lengths, cu_bound_loop_perimeters,
         L,
         this->loop_boundaries_offset.ptr,
-        this->loop_boundaries_offset.ptr + 1
+        this->loop_boundaries_offset.ptr + 1,
+        stream
     ));
-    CUDA_CHECK(cudaFree(cu_loop_boundary_lengths));
 
     // Mask small loops
     uint8_t* cu_bound_loop_mask;
     CUDA_CHECK(cudaMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t)));
-    compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_bound_loop_perimeters,
         max_hole_perimeter,
         L,
@@ -492,12 +508,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
         cu_bound_loop_mask
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_bound_loop_perimeters));
 
     // Compress bound loops size
     int* cu_bound_loops_cnt;
     CUDA_CHECK(cudaMalloc(&cu_bound_loops_cnt, L * sizeof(int)));
-    diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->loop_boundaries_offset.ptr,
         L,
         cu_bound_loops_cnt
@@ -510,16 +525,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         nullptr, temp_storage_bytes,
         cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops,
-        L
+        L, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops,
-        L
+        L, stream
     ));
     int new_num_bound_loops;
-    CUDA_CHECK(cudaMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_loop_boundary_lengths));
+    CUDA_CHECK(cudaFree(cu_bound_loop_perimeters));
     CUDA_CHECK(cudaFree(cu_bound_loops_cnt));
     CUDA_CHECK(cudaFree(cu_new_num_bound_loops));
     if (new_num_bound_loops == 0) {
@@ -531,9 +549,9 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     // Get loop ids of loop boundaries
     int* cu_loop_bound_loop_ids;
     CUDA_CHECK(cudaMalloc(&cu_loop_bound_loop_ids, E * sizeof(int)));
-    CUDA_CHECK(cudaMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int)));
+    CUDA_CHECK(cudaMemsetAsync(cu_loop_bound_loop_ids, 0, E * sizeof(int), stream));
     if (L > 1) {
-        set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             this->loop_boundaries_offset.ptr + 1, L - 1,
             cu_loop_bound_loop_ids
         );
@@ -543,27 +561,25 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         nullptr, temp_storage_bytes,
         cu_loop_bound_loop_ids,
-        E
+        E, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_loop_bound_loop_ids,
-        E
+        E, stream
     ));
 
     // Mask loop boundaries
     uint8_t* cu_loop_boundary_mask;
     CUDA_CHECK(cudaMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t)));
-    index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_bound_loop_mask,
         cu_loop_bound_loop_ids,
         E,
         cu_loop_boundary_mask
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_bound_loop_mask));
-    CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids));
 
     // Compress loop boundaries
     int *cu_new_loop_boundaries, *cu_new_num_loop_boundaries;
@@ -573,16 +589,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         nullptr, temp_storage_bytes,
         this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries,
-        E
+        E, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries,
-        E
+        E, stream
     ));
     int new_num_loop_boundaries;
-    CUDA_CHECK(cudaMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_bound_loop_mask));
+    CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids));
     CUDA_CHECK(cudaFree(cu_new_num_loop_boundaries));
     CUDA_CHECK(cudaFree(cu_loop_boundary_mask));
 
@@ -593,19 +612,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset,
-        new_num_bound_loops + 1
+        new_num_bound_loops + 1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset,
-        new_num_bound_loops + 1
+        new_num_bound_loops + 1, stream
     ));
     int* cu_new_loop_bound_loop_ids;
     CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int)));
-    CUDA_CHECK(cudaMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int)));
+    CUDA_CHECK(cudaMemsetAsync(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int), stream));
     if (new_num_bound_loops > 1) {
-        set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             cu_new_loop_boundaries_offset+1, new_num_bound_loops-1,
             cu_new_loop_bound_loop_ids
         );
@@ -615,19 +634,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         nullptr, temp_storage_bytes,
         cu_new_loop_bound_loop_ids,
-        new_num_loop_boundaries
+        new_num_loop_boundaries, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_new_loop_bound_loop_ids,
-        new_num_loop_boundaries
+        new_num_loop_boundaries, stream
     ));
 
     // Calculate new vertex positions as average of loop vertices
     Vec3f* cu_new_loop_bound_centers;
     CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f)));
-    compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->edges.ptr,
         cu_new_loop_boundaries,
@@ -643,7 +662,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
         cu_new_loop_bound_centers, cu_new_vertices,
         new_num_bound_loops,
         cu_new_loop_boundaries_offset,
-        cu_new_loop_boundaries_offset + 1
+        cu_new_loop_boundaries_offset + 1,
+        stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSegmentedReduce::Sum(
@@ -651,29 +671,26 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
         cu_new_loop_bound_centers, cu_new_vertices,
         new_num_bound_loops,
         cu_new_loop_boundaries_offset,
-        cu_new_loop_boundaries_offset + 1
+        cu_new_loop_boundaries_offset + 1,
+        stream
     ));
-    CUDA_CHECK(cudaFree(cu_new_loop_bound_centers));
-    CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset));
-    inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_new_vertices,
         cu_new_loop_boundaries_cnt,
         new_num_bound_loops
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt));
 
     // Update mesh
     this->vertices.extend(new_num_bound_loops);
     this->faces.extend(new_num_loop_boundaries);
-    copy_vec3f_to_float3_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    copy_vec3f_to_float3_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_new_vertices,
         new_num_bound_loops,
         this->vertices.ptr + V
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_new_vertices));
-    connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->edges.ptr,
         cu_new_loop_boundaries,
         cu_new_loop_bound_loop_ids,
@@ -682,6 +699,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
         this->faces.ptr + F
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_new_loop_bound_centers));
+    CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset));
+    CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt));
+    CUDA_CHECK(cudaFree(cu_new_vertices));
     CUDA_CHECK(cudaFree(cu_new_loop_boundaries));
     CUDA_CHECK(cudaFree(cu_new_loop_bound_loop_ids));
 
@@ -763,6 +785,7 @@ static __global__ void index_vertice_kernel(
 
 
 void CuMesh::repair_non_manifold_edges(){
+    cudaStream_t stream = current_stream();
     // Always recompute manifold_face_adj to ensure it's up to date
     // especially after operations like simplify() that modify the mesh
     this->get_manifold_face_adjacency();
@@ -773,7 +796,7 @@ void CuMesh::repair_non_manifold_edges(){
     // Construct vertex adjacency pairs with manifold edges
     int2* cu_vertex_adj_pairs;
     CUDA_CHECK(cudaMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2)));
-    construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->manifold_face_adj.ptr,
         this->faces.ptr,
         cu_vertex_adj_pairs,
@@ -784,16 +807,17 @@ void CuMesh::repair_non_manifold_edges(){
     // Iterative Hook and Compress
     int* cu_vertex_ids;
     CUDA_CHECK(cudaMalloc(&cu_vertex_ids, 3 * F * sizeof(int)));
-    arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, 3 * F);
+    arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vertex_ids, 3 * F);
     CUDA_CHECK(cudaGetLastError());
     int* cu_end_flag; int h_end_flag;
     CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int)));
     do {
         h_end_flag = 1;
-        CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
 
         // Hook
-        hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             cu_vertex_adj_pairs,
             2 * M,
             cu_vertex_ids,
@@ -802,12 +826,13 @@ void CuMesh::repair_non_manifold_edges(){
         CUDA_CHECK(cudaGetLastError());
 
         // Compress
-        compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             cu_vertex_ids,
             3 * F
         );
         CUDA_CHECK(cudaGetLastError());
-        CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } while (h_end_flag == 0);
     CUDA_CHECK(cudaFree(cu_end_flag));
     CUDA_CHECK(cudaFree(cu_vertex_adj_pairs));
@@ -818,7 +843,7 @@ void CuMesh::repair_non_manifold_edges(){
     int new_V = compress_ids(cu_vertex_ids, 3 * F, this->cub_temp_storage, cu_new_vertices_ids);
     float3* cu_new_vertices;
     CUDA_CHECK(cudaMalloc(&cu_new_vertices, new_V * sizeof(float3)));
-    index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_new_vertices_ids,
         this->faces.ptr,
         this->vertices.ptr,
@@ -826,13 +851,14 @@ void CuMesh::repair_non_manifold_edges(){
         cu_new_vertices
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_new_vertices_ids));
     this->vertices.resize(new_V);
-    CUDA_CHECK(cudaMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaFree(cu_new_vertices));
+    CUDA_CHECK(cudaMemcpyAsync(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice, stream));
     this->faces.resize(F);
-    copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, F, this->faces.ptr);
+    copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vertex_ids, F, this->faces.ptr);
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_new_vertices_ids));
+    CUDA_CHECK(cudaFree(cu_new_vertices));
     CUDA_CHECK(cudaFree(cu_vertex_ids));
 
     // Delete all cached info since mesh has changed
@@ -874,6 +900,7 @@ static __global__ void mark_non_manifold_faces_kernel(
 
 
 void CuMesh::remove_non_manifold_faces() {
+    cudaStream_t stream = current_stream();
     // Get edge-face adjacency information
     if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) {
         this->get_edge_face_adjacency();
@@ -887,10 +914,10 @@ void CuMesh::remove_non_manifold_faces() {
     // Initialize face mask (1 = keep all faces initially)
     uint8_t* cu_face_keep_mask;
     CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t)));
-    CUDA_CHECK(cudaMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t)));
+    CUDA_CHECK(cudaMemsetAsync(cu_face_keep_mask, 1, F * sizeof(uint8_t), stream));
 
     // Mark faces on non-manifold edges for removal
-    mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->edge2face.ptr,
         this->edge2face_offset.ptr,
         this->edge2face_cnt.ptr,
@@ -916,6 +943,7 @@ struct GreaterThanOrEqualToOp {
 
 
 void CuMesh::remove_small_connected_components(float min_area) {
+    cudaStream_t stream = current_stream();
     if (this->conn_comp_ids.is_empty()) {
         this->get_connected_components();
     }
@@ -936,14 +964,14 @@ void CuMesh::remove_small_connected_components(float min_area) {
         nullptr, temp_storage_bytes,
         this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids,
         this->face_areas.ptr, cu_sorted_face_areas,
-        F
+        F, 0, sizeof(int) * 8, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids,
         this->face_areas.ptr, cu_sorted_face_areas,
-        F
+        F, 0, sizeof(int) * 8, stream
     ));
 
     // 2. Find unique components and get the number of faces in each.
@@ -957,17 +985,18 @@ void CuMesh::remove_small_connected_components(float min_area) {
         nullptr, temp_storage_bytes,
         cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids,
         cu_conn_comp_num_faces, cu_num_conn_comps,
-        F
+        F, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids,
         cu_conn_comp_num_faces, cu_num_conn_comps,
-        F
+        F, stream
     ));
     int num_conn_comps;
-    CUDA_CHECK(cudaMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_num_conn_comps));
     CUDA_CHECK(cudaFree(cu_sorted_conn_comp_ids));
     CUDA_CHECK(cudaFree(cu_unique_conn_comp_ids));
@@ -979,15 +1008,14 @@ void CuMesh::remove_small_connected_components(float min_area) {
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         cu_conn_comp_num_faces, cu_conn_comp_offsets,
-        num_conn_comps + 1
+        num_conn_comps + 1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_conn_comp_num_faces, cu_conn_comp_offsets,
-        num_conn_comps + 1
+        num_conn_comps + 1, stream
     ));
-    CUDA_CHECK(cudaFree(cu_conn_comp_num_faces));
 
     float *cu_conn_comp_areas;
     CUDA_CHECK(cudaMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float)));
@@ -996,7 +1024,8 @@ void CuMesh::remove_small_connected_components(float min_area) {
         cu_sorted_face_areas, cu_conn_comp_areas,
         num_conn_comps,
         cu_conn_comp_offsets,
-        cu_conn_comp_offsets + 1
+        cu_conn_comp_offsets + 1,
+        stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSegmentedReduce::Sum(
@@ -1004,15 +1033,14 @@ void CuMesh::remove_small_connected_components(float min_area) {
         cu_sorted_face_areas, cu_conn_comp_areas,
         num_conn_comps,
         cu_conn_comp_offsets,
-        cu_conn_comp_offsets + 1
+        cu_conn_comp_offsets + 1,
+        stream
     ));
-    CUDA_CHECK(cudaFree(cu_sorted_face_areas));
-    CUDA_CHECK(cudaFree(cu_conn_comp_offsets));
 
     // 4. Create a "keep" mask for components with area >= min_area.
     uint8_t* cu_comp_keep_mask;
     CUDA_CHECK(cudaMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t)));
-    compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_conn_comp_areas,
         min_area,
         num_conn_comps,
@@ -1020,23 +1048,26 @@ void CuMesh::remove_small_connected_components(float min_area) {
         cu_comp_keep_mask
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_conn_comp_areas));
 
     // 5. Propagate the component "keep" mask to every face.
     uint8_t* cu_face_keep_mask;
     CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t)));
     // Use an index_kernel (gather operation)
-    index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_comp_keep_mask,      // Source array
         this->conn_comp_ids.ptr, // Indices to gather from
         F,
         cu_face_keep_mask       // Destination array
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_comp_keep_mask));
 
     // 6. Select the faces to keep and update the mesh.
     this->_remove_faces(cu_face_keep_mask);
+    CUDA_CHECK(cudaFree(cu_conn_comp_num_faces));
+    CUDA_CHECK(cudaFree(cu_sorted_face_areas));
+    CUDA_CHECK(cudaFree(cu_conn_comp_offsets));
+    CUDA_CHECK(cudaFree(cu_conn_comp_areas));
+    CUDA_CHECK(cudaFree(cu_comp_keep_mask));
     CUDA_CHECK(cudaFree(cu_face_keep_mask));
 }
 
@@ -1158,6 +1189,7 @@ static __global__ void inplace_flip_faces_with_flags_kernel(
 
 
 void CuMesh::unify_face_orientations() {
+    cudaStream_t stream = current_stream();
     if (this->manifold_face_adj.is_empty()) {
         this->get_manifold_face_adjacency();
     }
@@ -1165,7 +1197,7 @@ void CuMesh::unify_face_orientations() {
     // 1. Compute the flipped flag for each edge.
     uint8_t* cu_flipped;
     CUDA_CHECK(cudaMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t)));
-    get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->manifold_face_adj.ptr,
         this->faces.ptr,
         this->manifold_face_adj.size,
@@ -1176,16 +1208,17 @@ void CuMesh::unify_face_orientations() {
     // 2. Hook edges with flipped flag.
     int* conn_comp_with_flip;
     CUDA_CHECK(cudaMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int)));
-    arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(conn_comp_with_flip, this->faces.size, 2);
+    arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(conn_comp_with_flip, this->faces.size, 2);
     CUDA_CHECK(cudaGetLastError());
     int* cu_end_flag; int h_end_flag;
     CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int)));
     do {
         h_end_flag = 1;
-        CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
 
         // Hook
-        hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             this->manifold_face_adj.ptr,
             cu_flipped,
             this->manifold_face_adj.size,
@@ -1195,25 +1228,27 @@ void CuMesh::unify_face_orientations() {
         CUDA_CHECK(cudaGetLastError());
 
         // Compress
-        compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             conn_comp_with_flip,
             this->faces.size
         );
         CUDA_CHECK(cudaGetLastError());
-        CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } while (h_end_flag == 0);
     CUDA_CHECK(cudaFree(cu_end_flag));
 
     // 3. Flip the orientation of the faces.
-    inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->faces.ptr,
         conn_comp_with_flip,
         this->faces.size
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_flipped));
     CUDA_CHECK(cudaFree(conn_comp_with_flip));
 }
 
 
-} // namespace cumesh
\ No newline at end of file
+} // namespace cumesh
diff --git a/src/connectivity.cu b/src/connectivity.cu
index 6e2f5fe..de0471e 100644
--- a/src/connectivity.cu
+++ b/src/connectivity.cu
@@ -1,6 +1,7 @@
 #include "cumesh.h"
 #include "shared.h"
 
+#include <c10/cuda/CUDAStream.h>
 #include <cub/cub.cuh>
 
 
@@ -8,7 +9,7 @@ namespace cumesh {
 
 /**
  * Get count of neighboring faces for each vertex
- * 
+ *
  * @param faces: the faces of the mesh, shape (F)
  * @param F: the number of faces
  * @param neighbor_face_cnt: the buffer for neighbor face count, shape (V+1)
@@ -31,7 +32,7 @@ static __global__ void get_neighbor_face_cnt_kernel(
 
 /**
  * Fill the neighboring face ids for each vertex
- * 
+ *
  * @param faces: the faces of the mesh, shape (F)
  * @param F: the number of faces
  * @param neighbor_face_ids: the buffer for neighbor face ids, shape (total_neighbor_face_cnt)
@@ -57,13 +58,14 @@ static __global__ void fill_neighbor_face_ids_kernel(
 
 
 void CuMesh::get_vertex_face_adjacency() {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
     size_t V = this->vertices.size;
 
     // get neighboring face count for each vertex
     this->vert2face_cnt.resize(V + 1);
     this->vert2face_cnt.zero();
-    get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->vert2face_cnt.ptr);
+    get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->vert2face_cnt.ptr);
     CUDA_CHECK(cudaGetLastError());
 
     // allocate memory for neighboring face ids
@@ -72,19 +74,19 @@ void CuMesh::get_vertex_face_adjacency() {
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
-        V + 1
+        V + 1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
-        V + 1
+        V + 1, stream
     ));
     this->vert2face.resize(F*3);
 
     // fill neighboring face ids for each vertex
     this->vert2face_cnt.zero();
-    fill_neighbor_face_ids_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, 
+    fill_neighbor_face_ids_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F,
         this->vert2face.ptr,
         this->vert2face_offset.ptr,
         this->vert2face_cnt.ptr
@@ -95,7 +97,7 @@ void CuMesh::get_vertex_face_adjacency() {
 
 /**
  * Expand edges for each triangle face
- * 
+ *
  * @param faces: the faces of the mesh, shape (F)
  * @param F: the number of faces
  * @param edges: the buffer for edges, shape (F*3)
@@ -110,7 +112,7 @@ static __global__ void expand_edges_kernel(
 
     int base = tid * 3;
     int3 f = faces[tid];
-    
+
     // expand edges
     edges[base + 0] = ((uint64_t)min(f.x, f.y) << 32) | max(f.x, f.y);
     edges[base + 1] = ((uint64_t)min(f.y, f.z) << 32) | max(f.y, f.z);
@@ -119,9 +121,10 @@ static __global__ void expand_edges_kernel(
 
 
 void CuMesh::get_edges() {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
     this->edges.resize(F * 3);
-    expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->edges.ptr);
+    expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->edges.ptr);
     CUDA_CHECK(cudaGetLastError());
 
     // sort edges
@@ -131,14 +134,14 @@ void CuMesh::get_edges() {
         nullptr, temp_storage_bytes,
         this->edges.ptr,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
-        F * 3
+        F * 3, 0, sizeof(uint64_t) * 8, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->edges.ptr,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
-        F * 3
+        F * 3, 0, sizeof(uint64_t) * 8, stream
     ));
 
     // unique edges
@@ -148,15 +151,16 @@ void CuMesh::get_edges() {
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         nullptr, temp_storage_bytes,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
-        F * 3
+        F * 3, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
-        F * 3
+        F * 3, stream
     ));
-    CUDA_CHECK(cudaMemcpy(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     this->edge2face_cnt.size = this->edges.size;
     CUDA_CHECK(cudaFree(num_edges));
 }
@@ -164,7 +168,7 @@ void CuMesh::get_edges() {
 
 /**
  * Get edge-face adjacency
- * 
+ *
  * @param faces: the faces of the mesh, shape (F)
  * @param edges: the buffer for edges, shape (E)
  * @param edge2face_cnt: the buffer for edge duplication number, shape (E)
@@ -217,11 +221,12 @@ static __global__ void get_edge_face_adjacency_kernel(
 
 
 void CuMesh::get_edge_face_adjacency() {
+    cudaStream_t stream = current_stream();
     if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) {
         this->get_edges();
     }
     if (this->vert2face.is_empty() || this->vert2face_offset.is_empty()) {
-        this->get_vertex_face_adjacency();    
+        this->get_vertex_face_adjacency();
     }
     size_t F = this->faces.size;
     size_t E = this->edges.size;
@@ -232,25 +237,26 @@ void CuMesh::get_edge_face_adjacency() {
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->edge2face_cnt.ptr, this->edge2face_offset.ptr,
-        E + 1
+        E + 1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->edge2face_cnt.ptr, this->edge2face_offset.ptr,
-        E + 1
+        E + 1, stream
     ));
 
     // allocate memory for edge2face
     int total_edge_face_cnt;
-    CUDA_CHECK(cudaMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     this->edge2face.resize(total_edge_face_cnt);
 
     // allocate memory for face2edge
     this->face2edge.resize(F);
 
     // get edge-face adjacency
-    get_edge_face_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_edge_face_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->faces.ptr,
         this->edges.ptr,
         this->edge2face_cnt.ptr,
@@ -267,7 +273,7 @@ void CuMesh::get_edge_face_adjacency() {
 
 /**
  * Get vertex adjacent edge number
- * 
+ *
  * @param edges: the buffer for edges, shape (E)
  * @param E: the number of edges
  * @param vert2edge_cnt: the buffer for vertex adjacent edge number, shape (V)
@@ -293,7 +299,7 @@ static __global__ void get_vertex_edge_cnt_kernel(
 
 /**
  * Get vertex-edge adjacency
- * 
+ *
  * @param edges: the buffer for edges, shape (E)
  * @param E: the number of edges
  * @param vert2edge: the buffer for vertex to edge adjacency, shape (total_vertex_edge_cnt)
@@ -322,6 +328,7 @@ static __global__ void get_vertex_edge_adjacency_kernel(
 
 
 void CuMesh::get_vertex_edge_adjacency() {
+    cudaStream_t stream = current_stream();
     if (this->edges.is_empty()) {
         this->get_edges();
     }
@@ -331,7 +338,7 @@ void CuMesh::get_vertex_edge_adjacency() {
     // get vertex adjacent edge number
     this->vert2edge_cnt.resize(V + 1);
     this->vert2edge_cnt.zero();
-    get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->edges.ptr, E, this->vert2edge_cnt.ptr
     );
     CUDA_CHECK(cudaGetLastError());
@@ -342,19 +349,19 @@ void CuMesh::get_vertex_edge_adjacency() {
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr,
-        V + 1
+        V + 1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr,
-        V + 1
+        V + 1, stream
     ));
 
     // get vertex-edge adjacency
     this->vert2edge.resize(2 * E);
     this->vert2edge_cnt.zero();
-    get_vertex_edge_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_vertex_edge_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->edges.ptr, E,
         this->vert2edge.ptr,
         this->vert2edge_offset.ptr,
@@ -366,7 +373,7 @@ void CuMesh::get_vertex_edge_adjacency() {
 
 /**
  * Set vertex boundary indicator
- * 
+ *
  * @param edges: the buffer for edges, shape (E)
  * @param boundaries: the buffer for boundary edges, shape (B)
  * @param edge2face_cnt: the buffer for edge duplication number, shape (E)
@@ -408,6 +415,7 @@ struct is_boundary_edge {
 
 
 void CuMesh::get_boundary_info() {
+    cudaStream_t stream = current_stream();
     if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) {
         this->get_edges();
     }
@@ -419,21 +427,25 @@ void CuMesh::get_boundary_info() {
     CUDA_CHECK(cudaMalloc(&cu_num_boundary, sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int)));
     this->boundaries.resize(E);
-    arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E);
+    arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_edge_idx, E);
+    CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cub::DeviceSelect::If(
         nullptr, temp_storage_bytes,
         cu_edge_idx, this->boundaries.ptr, cu_num_boundary,
         E,
-        is_boundary_edge{this->edge2face_cnt.ptr}
+        is_boundary_edge{this->edge2face_cnt.ptr},
+        stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::If(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_edge_idx, this->boundaries.ptr, cu_num_boundary,
         E,
-        is_boundary_edge{this->edge2face_cnt.ptr}
+        is_boundary_edge{this->edge2face_cnt.ptr},
+        stream
     ));
-    CUDA_CHECK(cudaMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_num_boundary));
     CUDA_CHECK(cudaFree(cu_edge_idx));
 
@@ -441,7 +453,7 @@ void CuMesh::get_boundary_info() {
     this->vert_is_boundary.resize(this->vertices.size);
     this->vert_is_boundary.zero();
     if (this->boundaries.size > 0) {
-        set_boundary_vertex_kernel<<<(this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        set_boundary_vertex_kernel<<<(this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             this->edges.ptr, this->boundaries.ptr, this->edge2face_cnt.ptr,
             this->boundaries.size, this->vert_is_boundary.ptr
         );
@@ -474,7 +486,7 @@ static __global__ void get_vertex_boundary_cnt_kernel(
 
 /**
  * Get vertex-boundary adjacency
- * 
+ *
  * @param edges: the buffer for edges, shape (E)
  * @param boundaries: the buffer for boundary edges, shape (B)
  * @param B: the number of boundary edges
@@ -507,9 +519,10 @@ static __global__ void get_vertex_boundary_adjacency_kernel(
 
 
 void CuMesh::get_vertex_boundary_adjacency() {
+    cudaStream_t stream = current_stream();
     if (this->edges.is_empty()) {
         this->get_edges();
-    } 
+    }
     if (this->boundaries.is_empty()) {
         this->get_boundary_info();
     }
@@ -528,7 +541,7 @@ void CuMesh::get_vertex_boundary_adjacency() {
     // get vertex adjacent boundary number
     this->vert2bound_cnt.resize(V + 1);
     this->vert2bound_cnt.zero();
-    get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->edges.ptr, this->boundaries.ptr, B, this->vert2bound_cnt.ptr
     );
     CUDA_CHECK(cudaGetLastError());
@@ -539,19 +552,19 @@ void CuMesh::get_vertex_boundary_adjacency() {
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr,
-        V + 1
+        V + 1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr,
-        V + 1
+        V + 1, stream
     ));
 
     // get vertex-boundary adjacency
     this->vert2bound.resize(2 * B);
     this->vert2bound_cnt.zero();
-    get_vertex_boundary_adjacency_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_vertex_boundary_adjacency_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->edges.ptr, this->boundaries.ptr, B,
         this->vert2bound.ptr,
         this->vert2bound_offset.ptr,
@@ -592,10 +605,11 @@ static __global__ void get_vertex_is_manifold_kernel(
     }
 
     vert_is_manifold[tid] = is_manifold ? 1 : 0;
-}       
+}
 
 
 void CuMesh::get_vertex_is_manifold() {
+    cudaStream_t stream = current_stream();
     if (this->vert2edge.is_empty() || this->vert2edge_offset.is_empty()) {
         this->get_vertex_edge_adjacency();
     }
@@ -606,7 +620,7 @@ void CuMesh::get_vertex_is_manifold() {
 
     // get vertex is manifold
     this->vert_is_manifold.resize(V);
-    get_vertex_is_manifold_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_vertex_is_manifold_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vert2edge.ptr,
         this->vert2edge_offset.ptr,
         this->edge2face_cnt.ptr,
@@ -619,7 +633,7 @@ void CuMesh::get_vertex_is_manifold() {
 
 /**
  * Set manifold face adjacency
- * 
+ *
  * @param manifold_edge_idx: the buffer for manifold edge index, shape (M)
  * @param edge2face: the buffer for edge to face adjacency, shape (total_edge_face_cnt)
  * @param edge2face_offset: the buffer for edge to face adjacency offset, shape (E+1)
@@ -660,6 +674,7 @@ struct is_manifold_edge {
 
 
 void CuMesh::get_manifold_face_adjacency() {
+    cudaStream_t stream = current_stream();
     if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) {
         this->get_edge_face_adjacency();
     }
@@ -671,29 +686,32 @@ void CuMesh::get_manifold_face_adjacency() {
     CUDA_CHECK(cudaMalloc(&cu_num_manifold_edges, sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_manifold_edge_idx, E * sizeof(int)));
-    arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E);
+    arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_edge_idx, E);
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cub::DeviceSelect::If(
         nullptr, temp_storage_bytes,
         cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges,
         E,
-        is_manifold_edge{this->edge2face_cnt.ptr}
+        is_manifold_edge{this->edge2face_cnt.ptr},
+        stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::If(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges,
         E,
-        is_manifold_edge{this->edge2face_cnt.ptr}
+        is_manifold_edge{this->edge2face_cnt.ptr},
+        stream
     ));
     int manifold_edge_count;
-    CUDA_CHECK(cudaMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_num_manifold_edges));
     CUDA_CHECK(cudaFree(cu_edge_idx));
 
     // set manifold_face_adj
     this->manifold_face_adj.resize(manifold_edge_count);
-    set_manifold_face_adj_kernel<<<(manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    set_manifold_face_adj_kernel<<<(manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_manifold_edge_idx,
         this->edge2face.ptr,
         this->edge2face_offset.ptr,
@@ -701,6 +719,7 @@ void CuMesh::get_manifold_face_adjacency() {
         this->manifold_face_adj.ptr
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_manifold_edge_idx));
 }
 
@@ -737,6 +756,7 @@ struct is_manifold_boundary_vertex {
 
 
 void CuMesh::get_manifold_boundary_adjacency() {
+    cudaStream_t stream = current_stream();
     if (this->vert2bound.is_empty() || this->vert2bound_offset.is_empty()) {
         this->get_vertex_boundary_adjacency();
     }
@@ -751,23 +771,26 @@ void CuMesh::get_manifold_boundary_adjacency() {
     CUDA_CHECK(cudaMalloc(&cu_num_manifold_boundary_verts, sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_vert_idx, V * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_manifold_vert_idx, V * sizeof(int)));
-    arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vert_idx, V);
+    arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vert_idx, V);
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cub::DeviceSelect::If(
         nullptr, temp_storage_bytes,
         cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts,
         V,
-        is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr}
+        is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr},
+        stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::If(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts,
         V,
-        is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr}
+        is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr},
+        stream
     ));
     int manifold_boundary_vert_count;
-    CUDA_CHECK(cudaMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_num_manifold_boundary_verts));
     CUDA_CHECK(cudaFree(cu_vert_idx));
 
@@ -779,7 +802,7 @@ void CuMesh::get_manifold_boundary_adjacency() {
 
     // set manifold_bound_adj
     this->manifold_bound_adj.resize(manifold_boundary_vert_count);
-    set_manifold_bound_adj_kernel<<<(manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    set_manifold_bound_adj_kernel<<<(manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_manifold_vert_idx,
         this->vert2bound.ptr,
         this->vert2bound_offset.ptr,
@@ -787,10 +810,13 @@ void CuMesh::get_manifold_boundary_adjacency() {
         this->manifold_bound_adj.ptr
     );
     CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_manifold_vert_idx));
 }
 
 
 void CuMesh::get_connected_components() {
+    cudaStream_t stream = current_stream();
     if (this->manifold_face_adj.is_empty()) {
         this->get_manifold_face_adjacency();
     }
@@ -800,16 +826,17 @@ void CuMesh::get_connected_components() {
 
     // Iterative Hook and Compress
     this->conn_comp_ids.resize(F);
-    arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->conn_comp_ids.ptr, F);
+    arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->conn_comp_ids.ptr, F);
     CUDA_CHECK(cudaGetLastError());
     int* cu_end_flag; int h_end_flag;
     CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int)));
     do {
         h_end_flag = 1;
-        CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
 
         // Hook
-        hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             this->manifold_face_adj.ptr,
             M,
             this->conn_comp_ids.ptr,
@@ -818,12 +845,13 @@ void CuMesh::get_connected_components() {
         CUDA_CHECK(cudaGetLastError());
 
         // Compress
-        compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             this->conn_comp_ids.ptr,
             F
         );
         CUDA_CHECK(cudaGetLastError());
-        CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } while (h_end_flag == 0);
     CUDA_CHECK(cudaFree(cu_end_flag));
 
@@ -833,6 +861,7 @@ void CuMesh::get_connected_components() {
 
 
 void CuMesh::get_boundary_connected_components() {
+    cudaStream_t stream = current_stream();
     if (this->manifold_bound_adj.is_empty()) {
         this->get_manifold_boundary_adjacency();
     }
@@ -847,16 +876,17 @@ void CuMesh::get_boundary_connected_components() {
 
     // Iterative Hook and Compress
     this->bound_conn_comp_ids.resize(B);
-    arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->bound_conn_comp_ids.ptr, B);
+    arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->bound_conn_comp_ids.ptr, B);
     CUDA_CHECK(cudaGetLastError());
     int* cu_end_flag; int h_end_flag;
     CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int)));
     do {
         h_end_flag = 1;
-        CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
 
         // Hook
-        hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             this->manifold_bound_adj.ptr,
             M,
             this->bound_conn_comp_ids.ptr,
@@ -865,12 +895,13 @@ void CuMesh::get_boundary_connected_components() {
         CUDA_CHECK(cudaGetLastError());
 
         // Compress
-        compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             this->bound_conn_comp_ids.ptr,
             B
         );
         CUDA_CHECK(cudaGetLastError());
-        CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } while (h_end_flag == 0);
     CUDA_CHECK(cudaFree(cu_end_flag));
 
@@ -926,6 +957,7 @@ static __global__ void is_bound_conn_comp_loop_kernel(
 
 
 void CuMesh::get_boundary_loops() {
+    cudaStream_t stream = current_stream();
     if (this->bound_conn_comp_ids.is_empty()) {
         this->get_boundary_connected_components();
     }
@@ -941,13 +973,13 @@ void CuMesh::get_boundary_loops() {
     // Check if boundary components are loops
     int* cu_is_bound_conn_comp_loop;
     CUDA_CHECK(cudaMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int)));
-    fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_is_bound_conn_comp_loop,
         this->num_bound_conn_comps,
         1
     );
     CUDA_CHECK(cudaGetLastError());
-    is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->edges.ptr,
         this->boundaries.ptr,
         this->bound_conn_comp_ids.ptr,
@@ -964,22 +996,23 @@ void CuMesh::get_boundary_loops() {
         nullptr, temp_storage_bytes,
         cu_is_bound_conn_comp_loop,
         cu_num_bound_loops,
-        this->num_bound_conn_comps
+        this->num_bound_conn_comps, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceReduce::Sum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_is_bound_conn_comp_loop,
         cu_num_bound_loops,
-        this->num_bound_conn_comps
+        this->num_bound_conn_comps, stream
     ));
-    CUDA_CHECK(cudaMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_num_bound_loops));
     if (this->num_bound_loops == 0) {
         CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop));
         return;
     }
-    
+
     // Sort boundaries by connected component ids
     int *cu_bound_sorted, *cu_bound_conn_comp_ids_sorted;
     CUDA_CHECK(cudaMalloc(&cu_bound_sorted, B * sizeof(int)));
@@ -989,27 +1022,26 @@ void CuMesh::get_boundary_loops() {
         nullptr, temp_storage_bytes,
         this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted,
         this->boundaries.ptr, cu_bound_sorted,
-        B
+        B, 0, sizeof(int) * 8, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted,
         this->boundaries.ptr, cu_bound_sorted,
-        B
+        B, 0, sizeof(int) * 8, stream
     ));
 
     // Select loops
     int* cu_bound_is_on_loop;
     CUDA_CHECK(cudaMalloc(&cu_bound_is_on_loop, B * sizeof(int)));
-    index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_is_bound_conn_comp_loop,
         cu_bound_conn_comp_ids_sorted,
         B,
         cu_bound_is_on_loop
     );
     CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop));
     this->loop_boundaries.resize(B);
     int *cu_loop_bound_conn_comp_ids_sorted, *cu_num_bound_on_loop;
     CUDA_CHECK(cudaMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int)));
@@ -1018,34 +1050,37 @@ void CuMesh::get_boundary_loops() {
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         nullptr, temp_storage_bytes,
         cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop,
-        B
+        B, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop,
-        B
+        B, stream
     ));
     int num_bound_on_loop;
-    CUDA_CHECK(cudaMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop));
     CUDA_CHECK(cudaFree(cu_bound_sorted));
     this->loop_boundaries.resize(num_bound_on_loop);
     temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         nullptr, temp_storage_bytes,
         cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop,
-        B
+        B, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceSelect::Flagged(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop,
-        B
+        B, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_bound_conn_comp_ids_sorted));
     CUDA_CHECK(cudaFree(cu_bound_is_on_loop));
     CUDA_CHECK(cudaFree(cu_num_bound_on_loop));
-    
+
     // RLE
     this->loop_boundaries_offset.resize(this->num_bound_loops + 1);
     this->loop_boundaries_offset.zero();
@@ -1057,15 +1092,16 @@ void CuMesh::get_boundary_loops() {
         nullptr, temp_storage_bytes,
         cu_loop_bound_conn_comp_ids_sorted,
         cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs,
-        num_bound_on_loop
+        num_bound_on_loop, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         cu_loop_bound_conn_comp_ids_sorted,
         cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs,
-        num_bound_on_loop
+        num_bound_on_loop, stream
     ));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(cu_loop_bound_conn_comp_ids_sorted));
     CUDA_CHECK(cudaFree(cu_rle_unique_out));
     CUDA_CHECK(cudaFree(cu_rle_num_runs));
@@ -1075,13 +1111,13 @@ void CuMesh::get_boundary_loops() {
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->loop_boundaries_offset.ptr,
-        this->num_bound_loops + 1
+        this->num_bound_loops + 1, stream
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->loop_boundaries_offset.ptr,
-        this->num_bound_loops + 1
+        this->num_bound_loops + 1, stream
     ));
 }
 
diff --git a/src/geometry.cu b/src/geometry.cu
index 0e493ba..45b8492 100644
--- a/src/geometry.cu
+++ b/src/geometry.cu
@@ -24,9 +24,10 @@ static __global__ void compute_face_areas_kernel(
 
 
 void CuMesh::compute_face_areas() {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
     this->face_areas.resize(F);
-    compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->faces.ptr,
         F,
@@ -57,9 +58,10 @@ static __global__ void compute_face_normals_kernel(
 
 
 void CuMesh::compute_face_normals() {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
     this->face_normals.resize(F);
-    compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->faces.ptr,
         F,
@@ -113,9 +115,10 @@ void CuMesh::compute_vertex_normals() {
         this->get_vertex_face_adjacency();
     }
 
+    cudaStream_t stream = current_stream();
     size_t V = this->vertices.size;
     this->vertex_normals.resize(V);
-    compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->faces.ptr,
         this->vert2face.ptr,
diff --git a/src/hash/hash.cu b/src/hash/hash.cu
index a9b1c23..b691096 100644
--- a/src/hash/hash.cu
+++ b/src/hash/hash.cu
@@ -1,6 +1,7 @@
 #include <torch/extension.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <c10/cuda/CUDAStream.h>
 
 #include "api.h"
 #include "hash.cuh"
@@ -32,9 +33,12 @@ static void dispatch_hashmap_insert_cuda(
     const torch::Tensor& keys,
     const torch::Tensor& values
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_insert_cuda_kernel<<<
         (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         keys.size(0),
@@ -111,9 +115,12 @@ static void dispatch_hashmap_lookup_cuda(
     const torch::Tensor& keys,
     torch::Tensor& values
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_lookup_cuda_kernel<<<
         (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         keys.size(0),
@@ -205,9 +212,12 @@ static void dispatch_hashmap_insert_3d_cuda(
     const torch::Tensor& values,
     int W, int H, int D
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_insert_3d_cuda_kernel<<<
         (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         coords.size(0),
@@ -303,9 +313,12 @@ static void dispatch_hashmap_lookup_3d_cuda(
     torch::Tensor& values,
     int W, int H, int D
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_lookup_3d_cuda_kernel<<<
         (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         coords.size(0),
@@ -395,9 +408,12 @@ static void dispatch_hashmap_insert_3d_idx_as_val_cuda(
     const torch::Tensor& coords,
     int W, int H, int D
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_insert_3d_idx_as_val_cuda_kernel<<<
         (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         coords.size(0),
diff --git a/src/remesh/simple_dual_contour.cu b/src/remesh/simple_dual_contour.cu
index e064ee3..1629cd1 100644
--- a/src/remesh/simple_dual_contour.cu
+++ b/src/remesh/simple_dual_contour.cu
@@ -1,6 +1,7 @@
 #include <torch/extension.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <c10/cuda/CUDAStream.h>
 #include <vector>
 
 #include "api.h"
@@ -181,6 +182,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
 ) {
     const size_t M = coords.size(0);
     const size_t N_vert = hashmap_keys.size(0);
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
 
     auto vertices = torch::empty({(long)M, 3}, torch::dtype(torch::kFloat32).device(coords.device()));
     auto intersected = torch::empty({(long)M, 3}, torch::dtype(torch::kInt32).device(coords.device()));
@@ -189,7 +191,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
     dim3 blocks((M + BLOCK_SIZE - 1) / BLOCK_SIZE);
 
     if (hashmap_keys.dtype() == torch::kUInt32) {
-        simple_dual_contour_kernel<<<blocks, threads>>>(
+        simple_dual_contour_kernel<<<blocks, threads, 0, stream>>>(
             N_vert,
             M,
             W, H, D,
@@ -200,9 +202,9 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
             vertices.data_ptr<float>(),
             intersected.data_ptr<int32_t>()
         );
-    } 
+    }
     else if (hashmap_keys.dtype() == torch::kUInt64) {
-        simple_dual_contour_kernel<<<blocks, threads>>>(
+        simple_dual_contour_kernel<<<blocks, threads, 0, stream>>>(
             N_vert,
             M,
             W, H, D,
@@ -213,7 +215,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
             vertices.data_ptr<float>(),
             intersected.data_ptr<int32_t>()
         );
-    } 
+    }
     else {
         TORCH_CHECK(false, "Unsupported hashmap data type");
     }
diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu
index 6f1d517..d6b6511 100644
--- a/src/remesh/svox2vert.cu
+++ b/src/remesh/svox2vert.cu
@@ -2,6 +2,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cub/cub.cuh>
+#include <c10/cuda/CUDAStream.h>
 
 #include "api.h"
 #include "../utils.h"
@@ -147,10 +148,11 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
 
     // Get the number of active vertices for each voxel
     size_t N = hashmap_keys.size(0);
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     int* num_vertices;
     CUDA_CHECK(cudaMalloc(&num_vertices, (M + 1) * sizeof(int)));
     if (hashmap_keys.dtype() == torch::kUInt32) {
-        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -162,7 +164,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
             num_vertices
         );
     } else if (hashmap_keys.dtype() == torch::kUInt64) {
-        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -180,18 +182,19 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
 
     // Compute the offset
     size_t temp_storage_bytes = 0;
-    cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1);
+    cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1, stream);
     void* d_temp_storage = nullptr;
     CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1);
-    CUDA_CHECK(cudaFree(d_temp_storage));
+    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1, stream);
     int total_vertices;
-    CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(d_temp_storage));
 
     // Set the active vertices for each voxel
     auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device()));
     if (hashmap_keys.dtype() == torch::kUInt32) {
-        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -205,7 +208,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
         );
     }
     else if (hashmap_keys.dtype() == torch::kUInt64) {
-        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -220,7 +223,8 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
     }
     CUDA_CHECK(cudaGetLastError());
 
-    // Free the temporary memory
+    // Free the temporary memory — sync stream first so set_vertex kernel is done
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(num_vertices));
 
     return vertices;
diff --git a/src/shared.h b/src/shared.h
index 66ecac7..70eda3e 100644
--- a/src/shared.h
+++ b/src/shared.h
@@ -3,12 +3,17 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cub/cub.cuh>
+#include <c10/cuda/CUDAStream.h>
 #include "utils.h"
 #include "cumesh.h"
 
 
 namespace cumesh {
 
+inline cudaStream_t current_stream() {
+    return at::cuda::getCurrentCUDAStream().stream();
+}
+
 
 template<typename T>
 __global__ void arange_kernel(T* array, int N, int stride=1) {
@@ -158,33 +163,33 @@ __global__ void get_diff_kernel(
 
 template<typename T>
 int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nullptr) {
+    cudaStream_t stream = current_stream();
     int *cu_indices, *cu_indices_argsorted;
+    int *cu_num = nullptr;
     T *cu_ids_sorted;
     CUDA_CHECK(cudaMalloc(&cu_indices, N * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_indices_argsorted, N * sizeof(int)));
     CUDA_CHECK(cudaMalloc(&cu_ids_sorted, N * sizeof(T)));
-    arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_indices, N);
+    arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_indices, N);
     CUDA_CHECK(cudaGetLastError());
     size_t temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         nullptr, temp_storage_bytes,
         ids, cu_ids_sorted,
         cu_indices, cu_indices_argsorted,
-        N
+        N, 0, sizeof(T) * 8, stream
     ));
     cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         cub_temp_storage.ptr, temp_storage_bytes,
         ids, cu_ids_sorted,
         cu_indices, cu_indices_argsorted,
-        N
+        N, 0, sizeof(T) * 8, stream
     ));
-    CUDA_CHECK(cudaFree(cu_indices));
-
     // get diff
     T* cu_new_ids;
     CUDA_CHECK(cudaMalloc(&cu_new_ids, N * sizeof(T)));
-    get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_ids_sorted,
         cu_new_ids,
         N
@@ -193,40 +198,37 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
 
     // inverse
     if (inverse) {
-        int* cu_num;
         CUDA_CHECK(cudaMalloc(&cu_num, sizeof(int)));
         temp_storage_bytes = 0;
         CUDA_CHECK(cub::DeviceSelect::Flagged(
             nullptr, temp_storage_bytes,
             cu_ids_sorted, cu_new_ids, inverse, cu_num,
-            N
+            N, stream
         ));
         cub_temp_storage.resize(temp_storage_bytes);
         CUDA_CHECK(cub::DeviceSelect::Flagged(
             cub_temp_storage.ptr, temp_storage_bytes,
             cu_ids_sorted, cu_new_ids, inverse, cu_num,
-            N
+            N, stream
         ));
-        CUDA_CHECK(cudaFree(cu_num));
     }
-    CUDA_CHECK(cudaFree(cu_ids_sorted));
-    
+
     // scan diff
     temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         cu_new_ids,
-        N
+        N, stream
     ));
     cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         cub_temp_storage.ptr, temp_storage_bytes,
         cu_new_ids,
-        N
+        N, stream
     ));
-    
+
     // scatter
-    scatter_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    scatter_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_indices_argsorted,
         cu_new_ids,
         N,
@@ -234,10 +236,16 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
     );
     CUDA_CHECK(cudaGetLastError());
     T num_components;
-    CUDA_CHECK(cudaMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     num_components += 1;
+
+    // Free all scratch memory — stream is synced, all GPU work is done
+    CUDA_CHECK(cudaFree(cu_indices));
+    CUDA_CHECK(cudaFree(cu_ids_sorted));
     CUDA_CHECK(cudaFree(cu_new_ids));
     CUDA_CHECK(cudaFree(cu_indices_argsorted));
+    if (cu_num) CUDA_CHECK(cudaFree(cu_num));
 
     return static_cast<int>(num_components);
 }
@@ -247,6 +255,7 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
 
 template <typename T>
 void print_max_val(T* ptr, size_t size) {
+    cudaStream_t stream = current_stream();
     T* dbg_cu_max_val;
     CUDA_CHECK(cudaMalloc(&dbg_cu_max_val, sizeof(T)));
     size_t temp_storage_bytes = 0;
@@ -254,7 +263,7 @@ void print_max_val(T* ptr, size_t size) {
         nullptr, temp_storage_bytes,
         ptr,
         dbg_cu_max_val,
-        size
+        size, stream
     ));
     char* temp_storage;
     CUDA_CHECK(cudaMalloc(&temp_storage, temp_storage_bytes));
@@ -262,10 +271,11 @@ void print_max_val(T* ptr, size_t size) {
         temp_storage, temp_storage_bytes,
         ptr,
         dbg_cu_max_val,
-        size
+        size, stream
     ));
     T h_max_val;
-    CUDA_CHECK(cudaMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     std::cout << "Max value: " << h_max_val << std::endl;
     CUDA_CHECK(cudaFree(dbg_cu_max_val));
     CUDA_CHECK(cudaFree(temp_storage));
@@ -273,8 +283,10 @@ void print_max_val(T* ptr, size_t size) {
 
 template <typename T>
 void print_val(T* ptr, size_t size) {
+    cudaStream_t stream = current_stream();
     T h_ptr[size];
-    CUDA_CHECK(cudaMemcpy(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     for (size_t i = 0; i < size; i++) {
         std::cout << h_ptr[i] << " ";
     }
@@ -282,4 +294,4 @@ void print_val(T* ptr, size_t size) {
 }
 
 
-} // namespace cumesh
\ No newline at end of file
+} // namespace cumesh
diff --git a/src/simplify.cu b/src/simplify.cu
index 9efde9e..b5bbcbb 100644
--- a/src/simplify.cu
+++ b/src/simplify.cu
@@ -1,6 +1,8 @@
 #include "cumesh.h"
 #include "dtypes.cuh"
 #include <cub/cub.cuh>
+#include <c10/cuda/CUDAStream.h>
+#include "shared.h"
 
 
 namespace cumesh {
@@ -21,7 +23,7 @@ __device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, f
 
 /**
  * Get the QEM for each vertex
- * 
+ *
  * @param vertices: the vertices of the mesh, shape (V)
  * @param faces: the faces of the mesh, shape (F)
  * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt)
@@ -64,12 +66,13 @@ static __global__ void get_qem_kernel(
  * Get the QEM for each vertex
  */
 void get_qem(
-    CuMesh& ctx
+    CuMesh& ctx,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
     ctx.temp_storage.resize(V * sizeof(QEM));
-    get_qem_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_qem_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices.ptr,
         ctx.faces.ptr,
         ctx.vert2face.ptr,
@@ -139,7 +142,7 @@ inline __device__ bool process_incident_tri(
 
 /**
  * Get the cost for each edge collapse
- * 
+ *
  * @param vertices: the vertices of the mesh, shape (V)
  * @param faces: the faces of the mesh, shape (F)
  * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt)
@@ -228,13 +231,14 @@ static __global__ void get_edge_collapse_cost_kernel(
 void get_edge_collapse_cost(
     CuMesh& ctx,
     float lambda_edge_length,
-    float lambda_skinny
+    float lambda_skinny,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
     size_t E = ctx.edges.size;
     ctx.edge_collapse_costs.resize(E);
-    get_edge_collapse_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_edge_collapse_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices.ptr,
         ctx.faces.ptr,
         ctx.vert2face.ptr,
@@ -252,7 +256,7 @@ void get_edge_collapse_cost(
 
 /**
  * Propagate cost to neighboring faces
- * 
+ *
  * @param edges: the buffer for edges, shape (E)
  * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
  * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
@@ -296,14 +300,15 @@ static __global__ void propagate_cost_kernel(
  * Propagate cost to neighboring faces
  */
 void propagate_cost(
-    CuMesh& ctx
+    CuMesh& ctx,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
     size_t E = ctx.edges.size;
     ctx.propagated_costs.resize(F);
     ctx.propagated_costs.fill(std::numeric_limits<uint64_t>::max());
-    propagate_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    propagate_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.edges.ptr,
         ctx.vert2face.ptr,
         ctx.vert2face_offset.ptr,
@@ -317,7 +322,7 @@ void propagate_cost(
 
 /**
  * Collapse edges parallelly
- * 
+ *
  * @param vertices: the vertices of the mesh, shape (V)
  * @param faces: the faces of the mesh, shape (F)
  * @param edges: the buffer for edges, shape (E)
@@ -433,7 +438,7 @@ static __global__ void compress_faces_kernel(
     if (is_kept) {
         new_faces[new_id].x = vertices_map[old_faces[tid].x];
         new_faces[new_id].y = vertices_map[old_faces[tid].y];
-        new_faces[new_id].z = vertices_map[old_faces[tid].z];        
+        new_faces[new_id].z = vertices_map[old_faces[tid].z];
     }
 }
 
@@ -443,7 +448,8 @@ static __global__ void compress_faces_kernel(
  */
 void collapse_edges(
     CuMesh& ctx,
-    float collapse_thresh
+    float collapse_thresh,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
@@ -452,7 +458,7 @@ void collapse_edges(
     ctx.faces_map.resize(F + 1);
     ctx.vertices_map.fill(1);
     ctx.faces_map.fill(1);
-    collapse_edges_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    collapse_edges_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices.ptr,
         ctx.faces.ptr,
         ctx.edges.ptr,
@@ -467,24 +473,25 @@ void collapse_edges(
         ctx.faces_map.ptr
     );
     CUDA_CHECK(cudaGetLastError());
-    
+
     // update vertices buffer
     // get vertices map
     size_t temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        ctx.vertices_map.ptr, V+1
+        ctx.vertices_map.ptr, V+1, stream
     ));
     ctx.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         ctx.cub_temp_storage.ptr, temp_storage_bytes,
-        ctx.vertices_map.ptr, V+1
+        ctx.vertices_map.ptr, V+1, stream
     ));
     int new_num_vertices;
-    CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     // compress vertices
     ctx.temp_storage.resize(new_num_vertices * sizeof(float3));
-    compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices_map.ptr,
         ctx.vertices.ptr,
         V,
@@ -497,18 +504,19 @@ void collapse_edges(
     // get faces map
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        ctx.faces_map.ptr, F+1
+        ctx.faces_map.ptr, F+1, stream
     ));
     ctx.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         ctx.cub_temp_storage.ptr, temp_storage_bytes,
-        ctx.faces_map.ptr, F+1
+        ctx.faces_map.ptr, F+1, stream
     ));
     int new_num_faces;
-    CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     // compress faces
     ctx.temp_storage.resize(new_num_faces * sizeof(int3));
-    compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.faces_map.ptr,
         ctx.vertices_map.ptr,
         ctx.faces.ptr,
@@ -521,12 +529,13 @@ void collapse_edges(
 
 
 std::tuple<int, int> CuMesh::simplify_step(float lambda_edge_length, float lambda_skinny, float threshold, bool timing) {
+    cudaStream_t stream = current_stream();
     std::chrono::high_resolution_clock::time_point start, end;
 
     if (timing) start = std::chrono::high_resolution_clock::now();
     this->get_vertex_face_adjacency();
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_vertex_face_adjacency: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
@@ -535,39 +544,39 @@ std::tuple<int, int> CuMesh::simplify_step(float lambda_edge_length, float lambd
     this->get_edges();
     this->get_boundary_info();
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
 
     if (timing) start = std::chrono::high_resolution_clock::now();
-    get_qem(*this);
+    get_qem(*this, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_qem: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
 
     if (timing) start = std::chrono::high_resolution_clock::now();
-    get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny);
+    get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_edge_collapse_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
 
     if (timing) start = std::chrono::high_resolution_clock::now();
-    propagate_cost(*this);
+    propagate_cost(*this, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "propagate_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
 
     if (timing) start = std::chrono::high_resolution_clock::now();
-    collapse_edges(*this, threshold);
+    collapse_edges(*this, threshold, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "collapse_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
diff --git a/src/utils.h b/src/utils.h
index f15823b..aeaf2d8 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -4,6 +4,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <torch/extension.h>
+#include <c10/cuda/CUDAStream.h>
 
 #define CUDA_CHECK(call)                                \
 do {                                                    \
@@ -43,7 +44,11 @@ struct Buffer {
     }
 
     void free() {
-        if (ptr != nullptr) CUDA_CHECK(cudaFree(ptr));
+        if (ptr != nullptr) {
+            cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+            CUDA_CHECK(cudaStreamSynchronize(stream));
+            CUDA_CHECK(cudaFree(ptr));
+        }
         ptr = nullptr;
         size = 0;
         capacity = 0;
@@ -62,7 +67,9 @@ struct Buffer {
         if (new_size > capacity) {
             T* new_ptr;
             CUDA_CHECK(cudaMalloc(&new_ptr, new_size * sizeof(T)));
-            CUDA_CHECK(cudaMemcpy(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice));
+            cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+            CUDA_CHECK(cudaMemcpyAsync(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaStreamSynchronize(stream));
             CUDA_CHECK(cudaFree(ptr));
             ptr = new_ptr;
             this->capacity = new_size;
@@ -71,12 +78,15 @@ struct Buffer {
     }
 
     void zero() {
-        CUDA_CHECK(cudaMemset(ptr, 0, size * sizeof(T)));
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+        CUDA_CHECK(cudaMemsetAsync(ptr, 0, size * sizeof(T), stream));
     }
 
     void fill(T val) {
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
         std::vector<T> tmp(size, val);
-        CUDA_CHECK(cudaMemcpy(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     }
 };