From 4843631790426a0c4f3ea1b36ec678783c63d84a Mon Sep 17 00:00:00 2001 From: Chris Uzelac Date: Sun, 8 Mar 2026 00:56:13 -0800 Subject: [PATCH] Fix Blackwell (sm_120) illegal memory access via proper CUDA stream handling PyTorch uses cudaStreamNonBlocking streams, but CuMesh launched all kernels on the default stream (stream 0) via bare <<<...>>> syntax. On Blackwell GPUs (RTX 5090, CUDA 13.0), this stream mismatch causes "illegal memory access" errors. Changes: - Add current_stream() helper wrapping at::cuda::getCurrentCUDAStream() - Pass PyTorch's current CUDA stream to all kernel launches (<<<..., 0, stream>>>), CUB calls, cudaMemcpyAsync, and cudaMemsetAsync across all 10 source files - Fix cudaFree race conditions: on non-blocking streams, cudaFree may return memory to the pool before async kernels finish using it. Added cudaStreamSynchronize(stream) before cudaFree where the freed memory was recently used by async work on the stream - Convert cudaMemcpy to cudaMemcpyAsync with stream, adding cudaStreamSynchronize where the CPU needs to read the result - Replace timing-related cudaDeviceSynchronize calls in simplify.cu with cudaStreamSynchronize(stream) Tested on RTX 5090 32GB, PyTorch 2.10.0+cu130, CUDA 13.0, Windows. Full Trellis2 reconstruction pipeline completes without errors. 
Co-Authored-By: Claude Opus 4.6 --- src/atlas.cu | 226 ++++++++++++++++------------- src/clean_up.cu | 231 +++++++++++++++++------------- src/connectivity.cu | 198 ++++++++++++++----------- src/geometry.cu | 9 +- src/hash/hash.cu | 26 +++- src/remesh/simple_dual_contour.cu | 10 +- src/remesh/svox2vert.cu | 22 +-- src/shared.h | 56 +++++--- src/simplify.cu | 73 +++++----- src/utils.h | 18 ++- 10 files changed, 515 insertions(+), 354 deletions(-) diff --git a/src/atlas.cu b/src/atlas.cu index 2d78d8c..d48031f 100644 --- a/src/atlas.cu +++ b/src/atlas.cu @@ -2,6 +2,7 @@ #include "dtypes.cuh" #include "shared.h" #include +#include namespace cumesh { @@ -39,7 +40,7 @@ __device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, f // ) { // const int tid = blockIdx.x * blockDim.x + threadIdx.x; // if (tid >= F) return; -// +// // float3 n = face_normals[tid]; // chart_normal_cones[tid] = make_float4(n.x, n.y, n.z, 0.0f); // half angle = 0 // } @@ -59,7 +60,7 @@ static __global__ void init_chart_adj_kernel( int f0 = face_adj[tid].x; int f1 = face_adj[tid].y; - + int c0 = chart_ids[f0]; int c1 = chart_ids[f1]; @@ -77,7 +78,7 @@ static __global__ void init_chart_adj_kernel( int3 tri1 = faces[f1]; int t0_indices[3] = {tri0.x, tri0.y, tri0.z}; - int common_v_indices[2]; + int common_v_indices[2]; int found_count = 0; #pragma unroll @@ -277,7 +278,8 @@ static __global__ void collapse_edges_kernel( static void get_chart_connectivity( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t M = mesh.manifold_face_adj.size; @@ -289,7 +291,7 @@ static void get_chart_connectivity( CUDA_CHECK(cudaMalloc(&cu_raw_lengths, M * sizeof(float))); CUDA_CHECK(cudaMalloc(&cu_sorted_lengths, M * sizeof(float))); - init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.vertices.ptr, mesh.faces.ptr, mesh.manifold_face_adj.ptr, @@ -309,7 +311,7 @@ static void 
get_chart_connectivity( reinterpret_cast(mesh.temp_storage.ptr), cu_raw_lengths, cu_sorted_lengths, - M + M, 0, sizeof(uint64_t) * 8, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( @@ -318,16 +320,15 @@ static void get_chart_connectivity( reinterpret_cast(mesh.temp_storage.ptr), cu_raw_lengths, cu_sorted_lengths, - M + M, 0, sizeof(uint64_t) * 8, stream )); - CUDA_CHECK(cudaFree(cu_raw_lengths)); - + #if CUDART_VERSION >= 12090 auto reduce_op = ::cuda::std::plus(); #else auto reduce_op = cub::Sum(); #endif - + // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths) int* cu_num_chart_adjs; @@ -340,8 +341,8 @@ static void get_chart_connectivity( cu_sorted_lengths, mesh.atlas_chart_adj_length.ptr, cu_num_chart_adjs, - reduce_op, - M + reduce_op, + M, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceReduce::ReduceByKey( @@ -351,19 +352,22 @@ static void get_chart_connectivity( cu_sorted_lengths, mesh.atlas_chart_adj_length.ptr, cu_num_chart_adjs, - reduce_op, - M + reduce_op, + M, stream )); - CUDA_CHECK(cudaMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); mesh.atlas_chart_adj_length.size = mesh.atlas_chart_adj.size; + CUDA_CHECK(cudaFree(cu_raw_lengths)); CUDA_CHECK(cudaFree(cu_sorted_lengths)); CUDA_CHECK(cudaFree(cu_num_chart_adjs)); // Remove invalid edge (UINT64_MAX) if present // Since we sorted, invalid edges are at the end. 
uint64_t last_key; if (mesh.atlas_chart_adj.size > 0) { - CUDA_CHECK(cudaMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - if (last_key == UINT64_MAX) { + CUDA_CHECK(cudaMemcpyAsync(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + if (last_key == UINT64_MAX) { mesh.atlas_chart_adj.size -= 1; mesh.atlas_chart_adj_length.size -= 1; } @@ -381,7 +385,7 @@ static void get_chart_connectivity( mesh.atlas_chart2edge_cnt.zero(); mesh.atlas_chart_perims.resize(C); mesh.atlas_chart_perims.zero(); - get_chart_edge_cnt_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_chart_edge_cnt_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.atlas_chart_adj.ptr, mesh.atlas_chart_adj_length.ptr, E, @@ -396,19 +400,19 @@ static void get_chart_connectivity( nullptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, - C + 1 + C + 1, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, - C + 1 + C + 1, stream )); // 2.3 Fill CSR format for chart-edge connectivity mesh.atlas_chart2edge.resize(2 * E); // each edge connects two charts mesh.atlas_chart2edge_cnt.zero(); - get_chart_edge_adjacency_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_chart_edge_adjacency_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.atlas_chart_adj.ptr, E, mesh.atlas_chart2edge.ptr, @@ -480,7 +484,8 @@ static __global__ void update_normal_cones_kernel( void compute_chart_normal_cones( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t C = mesh.atlas_num_charts; size_t F = mesh.faces.size; @@ -492,7 +497,7 @@ void compute_chart_normal_cones( 
CUDA_CHECK(cudaMalloc(&sorted_chart_ids, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&faces_ids, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&argsorted_faces_ids, F * sizeof(int))); - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( faces_ids, F ); @@ -502,17 +507,18 @@ void compute_chart_normal_cones( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, - F + F, 0, sizeof(int) * 8, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, - F + F, 0, sizeof(int) * 8, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(faces_ids)); - + // 2. Get CSR format for chart-face assignment int* cu_chart_size; int* cu_num_charts; @@ -523,14 +529,15 @@ void compute_chart_normal_cones( CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, - F + F, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, - F + F, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_charts)); CUDA_CHECK(cudaFree(cu_unique_chart_ids)); @@ -540,20 +547,21 @@ void compute_chart_normal_cones( CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, - C + 1 + C + 1, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, - C + 1 + C + 1, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); 
CUDA_CHECK(cudaFree(cu_chart_size)); // 3. Compute chart normals and areas float* cu_sorted_face_areas; CUDA_CHECK(cudaMalloc(&cu_sorted_face_areas, F * sizeof(float))); - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.face_areas.ptr, argsorted_faces_ids, F, @@ -565,26 +573,30 @@ void compute_chart_normal_cones( nullptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_sorted_face_areas)); float3* cu_sorted_face_normals; CUDA_CHECK(cudaMalloc(&cu_sorted_face_normals, F * sizeof(float3))); - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.face_normals.ptr, argsorted_faces_ids, F, cu_sorted_face_normals ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(argsorted_faces_ids)); float3* cu_chart_normals; CUDA_CHECK(cudaMalloc(&cu_chart_normals, C * sizeof(float3))); @@ -594,7 +606,8 @@ void compute_chart_normal_cones( C, cu_chart_offsets, cu_chart_offsets + 1, Float3Add(), - make_float3(0.0f, 0.0f, 0.0f) + make_float3(0.0f, 0.0f, 0.0f), + stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Reduce( @@ -603,9 +616,10 @@ void compute_chart_normal_cones( C, cu_chart_offsets, cu_chart_offsets + 1, Float3Add(), - make_float3(0.0f, 0.0f, 0.0f) + make_float3(0.0f, 0.0f, 0.0f), + stream )); - normalize_kernel<<<(C + 
BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + normalize_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_chart_normals, C ); @@ -614,7 +628,7 @@ void compute_chart_normal_cones( // 4. Compute normal difference float* cu_normal_diff; CUDA_CHECK(cudaMalloc(&cu_normal_diff, F * sizeof(float))); - normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_chart_normals, cu_sorted_face_normals, sorted_chart_ids, @@ -622,6 +636,7 @@ void compute_chart_normal_cones( cu_normal_diff ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_sorted_face_normals)); CUDA_CHECK(cudaFree(sorted_chart_ids)); @@ -633,27 +648,31 @@ void compute_chart_normal_cones( nullptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Max( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, - cu_chart_offsets, cu_chart_offsets + 1 + cu_chart_offsets, cu_chart_offsets + 1, + stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_chart_offsets)); CUDA_CHECK(cudaFree(cu_normal_diff)); // 6. Update chart normal cones mesh.atlas_chart_normal_cones.resize(C); - update_normal_cones_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + update_normal_cones_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.atlas_chart_normal_cones.ptr, cu_chart_normals, cu_new_cone_half_angles, C ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_chart_normals)); CUDA_CHECK(cudaFree(cu_new_cone_half_angles)); } @@ -677,7 +696,7 @@ static __global__ void refine_charts_kernel( // 1. 
Load current face data int current_c = chart_ids[fid]; - Vec3f n(face_normals[fid]); + Vec3f n(face_normals[fid]); // local register cache for candidate list (triangle has at most 3 neighbors, plus self, max 4 candidates) int candidates[4]; @@ -691,7 +710,7 @@ static __global__ void refine_charts_kernel( // 2. Iterate over 3 edges to aggregate smooth scores int eids[3] = { face2edge[fid].x, face2edge[fid].y, face2edge[fid].z }; - + #pragma unroll for (int i = 0; i < 3; i++) { int eid = eids[i]; @@ -740,7 +759,7 @@ static __global__ void refine_charts_kernel( for (int i = 0; i < num_candidates; ++i) { int c = candidates[i]; - + // A. Geom score float4 cone = chart_normal_cones[c]; Vec3f axis(cone.x, cone.y, cone.z); @@ -769,7 +788,7 @@ static __global__ void refine_charts_kernel( // new best is significantly better than current best best_total_score = total_score; best_c = c; - } + } else if (abs(diff) <= epsilon) { // scores are very close, break tie by choosing smaller ID if (c < best_c) { @@ -822,13 +841,14 @@ __global__ void hook_edges_if_same_chart_kernel( static void reassign_chart_ids( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t F = mesh.faces.size; size_t M = mesh.manifold_face_adj.size; mesh.temp_storage.resize(F * sizeof(int)); // Use as parent for DSU - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( reinterpret_cast(mesh.temp_storage.ptr), F ); @@ -838,10 +858,11 @@ static void reassign_chart_ids( CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + 
hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( mesh.manifold_face_adj.ptr, mesh.atlas_chart_ids.ptr, M, @@ -851,15 +872,16 @@ static void reassign_chart_ids( CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( reinterpret_cast(mesh.temp_storage.ptr), F ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); - + swap_buffers(mesh.atlas_chart_ids, mesh.temp_storage); mesh.atlas_num_charts = compress_ids(mesh.atlas_chart_ids.ptr, F, mesh.cub_temp_storage); } @@ -930,7 +952,8 @@ static __global__ void unpack_vertex_ids_kernel( void construct_chart_mesh( - CuMesh& mesh + CuMesh& mesh, + cudaStream_t stream ) { size_t F = mesh.faces.size; @@ -943,7 +966,7 @@ void construct_chart_mesh( CUDA_CHECK(cudaMalloc(&cu_sorted_chart_ids, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_face_idx, F * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_sorted_face_idx, F * sizeof(int))); - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_face_idx, F ); @@ -953,15 +976,16 @@ void construct_chart_mesh( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, - F + F, 0, sizeof(int) * 8, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, - F + F, 0, sizeof(int) * 8, stream )); + 
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_face_idx)); // 2. RLE for chart size int* cu_chart_size; @@ -974,14 +998,15 @@ void construct_chart_mesh( CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, - F + F, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, - F + F, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_unique_chart_ids)); CUDA_CHECK(cudaFree(cu_num_chart)); // 3. Exclusive scan for chart face offset @@ -989,19 +1014,20 @@ void construct_chart_mesh( CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, - mesh.atlas_num_charts + 1 + mesh.atlas_num_charts + 1, stream )); mesh.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, - mesh.atlas_num_charts + 1 + mesh.atlas_num_charts + 1, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_chart_size)); // 4. Expand chart ids and vertex ids uint64_t* cu_pack; CUDA_CHECK(cudaMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); - expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_chart_ids, cu_sorted_face_idx, mesh.faces.ptr, @@ -1009,6 +1035,7 @@ void construct_chart_mesh( cu_pack ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_sorted_chart_ids)); CUDA_CHECK(cudaFree(cu_sorted_face_idx)); // 5. 
Compress pair to construct all maps @@ -1022,32 +1049,35 @@ void construct_chart_mesh( ); mesh.atlas_chart_vertex_map.resize(new_num_vertices); mesh.atlas_chart_vertex_offset.resize(mesh.atlas_num_charts + 1); - unpack_vertex_ids_kernel<<<(new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + unpack_vertex_ids_kernel<<<(new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_inverse_pack, new_num_vertices, mesh.atlas_chart_vertex_map.ptr, mesh.atlas_chart_vertex_offset.ptr ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_inverse_pack)); - unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_pack, F, mesh.atlas_chart_faces.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_inverse_pack)); CUDA_CHECK(cudaFree(cu_pack)); } void CuMesh::compute_charts( - float threshold_cone_half_angle_rad, - int refine_iterations, - int global_iterations, + float threshold_cone_half_angle_rad, + int refine_iterations, + int global_iterations, float smooth_strength, float area_penalty_weight, float perimeter_area_ratio_weight ) { + cudaStream_t stream = current_stream(); + if (this->manifold_face_adj.is_empty()) { this->get_manifold_face_adjacency(); } @@ -1062,7 +1092,7 @@ void CuMesh::compute_charts( size_t F = this->faces.size; this->atlas_chart_ids.resize(F); this->atlas_num_charts = F; - arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_ids.ptr, F ); @@ -1074,19 +1104,20 @@ void CuMesh::compute_charts( for (int i = 0; i < global_iterations; i++) { while (true) { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, 
stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // 1. Compute chart connectivity - get_chart_connectivity(*this); + get_chart_connectivity(*this, stream); if (this->atlas_chart_adj.size == 0) break; // 2. Compute normal cones - compute_chart_normal_cones(*this); + compute_chart_normal_cones(*this, stream); // 3. Compute chart adjacency cost size_t E = this->atlas_chart_adj.size; this->edge_collapse_costs.resize(E); - compute_chart_adjacency_cost_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_chart_adjacency_cost_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_adj.ptr, this->atlas_chart_normal_cones.ptr, this->atlas_chart_adj_length.ptr, @@ -1097,27 +1128,27 @@ void CuMesh::compute_charts( E, this->edge_collapse_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); // 4. Propagate costs size_t C = this->atlas_num_charts; this->propagated_costs.resize(C); - propagate_cost_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + propagate_cost_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart2edge.ptr, this->atlas_chart2edge_offset.ptr, this->edge_collapse_costs.ptr, C, this->propagated_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); // 5. 
Collapse edges this->vertices_map.resize(C); // store collapse map - arange_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + arange_kernel<<<(C + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices_map.ptr, C ); - collapse_edges_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + collapse_edges_kernel<<<(E + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_adj.ptr, this->edge_collapse_costs.ptr, this->propagated_costs.ptr, @@ -1127,30 +1158,31 @@ void CuMesh::compute_charts( this->atlas_chart_normal_cones.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); // End of iteration - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); if (h_end_flag == 1) break; // 6. Compress chart ids this->atlas_num_charts = compress_ids(this->vertices_map.ptr, C, this->cub_temp_storage); this->temp_storage.resize(F * sizeof(int)); - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices_map.ptr, this->atlas_chart_ids.ptr, F, reinterpret_cast(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); swap_buffers(this->atlas_chart_ids, this->temp_storage); } // Refine charts for (int j = 0; j < refine_iterations; j++) { - compute_chart_normal_cones(*this); + compute_chart_normal_cones(*this, stream); this->temp_storage.resize(F * sizeof(int)); - refine_charts_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + refine_charts_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->atlas_chart_normal_cones.ptr, this->face_normals.ptr, this->vertices.ptr, @@ -1169,53 +1201,55 
@@ void CuMesh::compute_charts( } // After refinement, the chart may become disconnected, so we need to re-assign chart ids - reassign_chart_ids(*this); + reassign_chart_ids(*this, stream); } CUDA_CHECK(cudaFree(cu_end_flag)); // Finalizing: calculate vmap, chart face and chart face offset - construct_chart_mesh(*this); + construct_chart_mesh(*this, stream); } std::tuple CuMesh::read_atlas_charts() { + cudaStream_t stream = current_stream(); + auto chart_ids = torch::empty({ static_cast(this->faces.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_ids.data_ptr(), this->atlas_chart_ids.ptr, this->faces.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto vertex_map = torch::empty({ static_cast(this->atlas_chart_vertex_map.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( vertex_map.data_ptr(), this->atlas_chart_vertex_map.ptr, this->atlas_chart_vertex_map.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto chart_faces = torch::empty({ static_cast(this->atlas_chart_faces.size), 3 }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_faces.data_ptr(), this->atlas_chart_faces.ptr, this->atlas_chart_faces.size * 3 * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto chart_vertex_offset = torch::empty({ static_cast(this->atlas_chart_vertex_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_vertex_offset.data_ptr(), this->atlas_chart_vertex_offset.ptr, this->atlas_chart_vertex_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); auto chart_face_offset = torch::empty({ static_cast(this->atlas_chart_faces_offset.size) }, 
torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(cudaMemcpyAsync( chart_face_offset.data_ptr(), this->atlas_chart_faces_offset.ptr, this->atlas_chart_faces_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + cudaMemcpyDeviceToDevice, stream )); return std::make_tuple(this->atlas_num_charts, chart_ids, vertex_map, chart_faces, chart_vertex_offset, chart_face_offset); } -} // namespace cumesh \ No newline at end of file +} // namespace cumesh diff --git a/src/clean_up.cu b/src/clean_up.cu index 3c12cd7..a72b1d0 100644 --- a/src/clean_up.cu +++ b/src/clean_up.cu @@ -2,6 +2,7 @@ #include "dtypes.cuh" #include "shared.h" #include +#include namespace cumesh { @@ -31,6 +32,7 @@ static __global__ void copy_T_to_T3_kernel( void CuMesh::remove_faces(torch::Tensor& face_mask) { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; size_t temp_storage_bytes = 0; @@ -41,18 +43,20 @@ void CuMesh::remove_faces(torch::Tensor& face_mask) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->faces.ptr, face_mask.data_ptr(), cu_new_faces, cu_new_num_faces, - F + F, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->faces.ptr, face_mask.data_ptr(), cu_new_faces, cu_new_num_faces, - F + F, stream )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->faces.resize(new_num_faces); - CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); 
CUDA_CHECK(cudaFree(cu_new_num_faces)); CUDA_CHECK(cudaFree(cu_new_faces)); @@ -61,6 +65,7 @@ void CuMesh::remove_faces(torch::Tensor& face_mask) { void CuMesh::_remove_faces(uint8_t* face_mask) { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; size_t temp_storage_bytes = 0; @@ -71,18 +76,20 @@ void CuMesh::_remove_faces(uint8_t* face_mask) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, - F + F, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, - F + F, stream )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->faces.resize(new_num_faces); - CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_new_num_faces)); CUDA_CHECK(cudaFree(cu_new_faces)); @@ -134,14 +141,15 @@ static __global__ void remap_faces_kernel( void CuMesh::remove_unreferenced_vertices() { + cudaStream_t stream = current_stream(); size_t V = this->vertices.size; size_t F = this->faces.size; // Mark referenced vertices int* cu_vertex_is_referenced; CUDA_CHECK(cudaMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); - set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + CUDA_CHECK(cudaMemsetAsync(cu_vertex_is_referenced, 0, (V+1) * sizeof(int), stream)); + 
set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->faces.ptr, F, cu_vertex_is_referenced @@ -152,19 +160,20 @@ void CuMesh::remove_unreferenced_vertices() { size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced, V+1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced, V+1, stream )); int new_num_vertices; - CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Compress vertices this->temp_storage.resize(new_num_vertices * sizeof(float3)); - compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_is_referenced, this->vertices.ptr, V, @@ -174,12 +183,13 @@ void CuMesh::remove_unreferenced_vertices() { swap_buffers(this->temp_storage, this->vertices); // Update faces - remap_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + remap_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_is_referenced, F, this->faces.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_vertex_is_referenced)); // Delete all cached info since mesh has changed @@ -237,16 +247,17 @@ struct int3_decomposer void CuMesh::remove_duplicate_faces() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; // Create a temporary sorted copy of faces for duplicate detection // Do NOT modify the original faces to preserve vertex order and normals int3 *cu_sorted_faces; 
CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3))); - CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice, stream)); // Sort vertices within each face (in the temporary copy) - sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_faces, F ); @@ -256,7 +267,7 @@ void CuMesh::remove_duplicate_faces() { size_t temp_storage_bytes = 0; int *cu_sorted_face_indices; CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); - arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F); + arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_sorted_face_indices, F); CUDA_CHECK(cudaGetLastError()); int *cu_sorted_indices_output; @@ -269,7 +280,8 @@ void CuMesh::remove_duplicate_faces() { cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, F, - int3_decomposer{} + int3_decomposer{}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( @@ -277,34 +289,36 @@ void CuMesh::remove_duplicate_faces() { cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, F, - int3_decomposer{} + int3_decomposer{}, + stream )); - CUDA_CHECK(cudaFree(cu_sorted_faces)); - CUDA_CHECK(cudaFree(cu_sorted_face_indices)); // Select first in each group of duplicate faces (based on sorted faces) uint8_t* cu_face_mask_sorted; CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); - select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_faces_output, F, cu_face_mask_sorted ); CUDA_CHECK(cudaGetLastError()); - 
CUDA_CHECK(cudaFree(cu_sorted_faces_output)); // Map the mask back to original face order using scatter // scatter: output[indices[i]] = values[i] // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] uint8_t* cu_face_mask_original; CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); - scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_sorted_indices_output, // indices: sorted_position -> original_idx cu_face_mask_sorted, // values: mask at sorted_position F, cu_face_mask_original // output: mask at original position ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_sorted_faces)); + CUDA_CHECK(cudaFree(cu_sorted_face_indices)); + CUDA_CHECK(cudaFree(cu_sorted_faces_output)); CUDA_CHECK(cudaFree(cu_face_mask_sorted)); CUDA_CHECK(cudaFree(cu_sorted_indices_output)); @@ -434,6 +448,7 @@ struct LessThanOp { void CuMesh::fill_holes(float max_hole_perimeter) { + cudaStream_t stream = current_stream(); if (this->loop_boundaries.is_empty() || this->loop_boundaries_offset.is_empty()) { this->get_boundary_loops(); } @@ -451,7 +466,7 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Compute loop boundary lengths float* cu_loop_boundary_lengths; CUDA_CHECK(cudaMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); - compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->edges.ptr, this->loop_boundaries.ptr, @@ -469,7 +484,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, this->loop_boundaries_offset.ptr, - this->loop_boundaries_offset.ptr + 1 + this->loop_boundaries_offset.ptr + 1, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); 
CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( @@ -477,14 +493,14 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, this->loop_boundaries_offset.ptr, - this->loop_boundaries_offset.ptr + 1 + this->loop_boundaries_offset.ptr + 1, + stream )); - CUDA_CHECK(cudaFree(cu_loop_boundary_lengths)); // Mask small loops uint8_t* cu_bound_loop_mask; CUDA_CHECK(cudaMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); - compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_bound_loop_perimeters, max_hole_perimeter, L, @@ -492,12 +508,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_bound_loop_mask ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_perimeters)); // Compress bound loops size int* cu_bound_loops_cnt; CUDA_CHECK(cudaMalloc(&cu_bound_loops_cnt, L * sizeof(int))); - diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->loop_boundaries_offset.ptr, L, cu_bound_loops_cnt @@ -510,16 +525,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, - L + L, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, - L + L, stream )); int new_num_bound_loops; - CUDA_CHECK(cudaMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_loop_boundary_lengths)); + 
CUDA_CHECK(cudaFree(cu_bound_loop_perimeters)); CUDA_CHECK(cudaFree(cu_bound_loops_cnt)); CUDA_CHECK(cudaFree(cu_new_num_bound_loops)); if (new_num_bound_loops == 0) { @@ -531,9 +549,9 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Get loop ids of loop boundaries int* cu_loop_bound_loop_ids; CUDA_CHECK(cudaMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); + CUDA_CHECK(cudaMemsetAsync(cu_loop_bound_loop_ids, 0, E * sizeof(int), stream)); if (L > 1) { - set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->loop_boundaries_offset.ptr + 1, L - 1, cu_loop_bound_loop_ids ); @@ -543,27 +561,25 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, cu_loop_bound_loop_ids, - E + E, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_loop_ids, - E + E, stream )); // Mask loop boundaries uint8_t* cu_loop_boundary_mask; CUDA_CHECK(cudaMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); - index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_bound_loop_mask, cu_loop_bound_loop_ids, E, cu_loop_boundary_mask ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_mask)); - CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids)); // Compress loop boundaries int *cu_new_loop_boundaries, *cu_new_num_loop_boundaries; @@ -573,16 +589,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, - E + E, stream )); this->cub_temp_storage.resize(temp_storage_bytes); 
CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, - E + E, stream )); int new_num_loop_boundaries; - CUDA_CHECK(cudaMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_bound_loop_mask)); + CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids)); CUDA_CHECK(cudaFree(cu_new_num_loop_boundaries)); CUDA_CHECK(cudaFree(cu_loop_boundary_mask)); @@ -593,19 +612,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, - new_num_bound_loops + 1 + new_num_bound_loops + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, - new_num_bound_loops + 1 + new_num_bound_loops + 1, stream )); int* cu_new_loop_bound_loop_ids; CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(cudaMemsetAsync(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int), stream)); if (new_num_bound_loops > 1) { - set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_loop_boundaries_offset+1, new_num_bound_loops-1, cu_new_loop_bound_loop_ids ); @@ -615,19 +634,19 @@ void CuMesh::fill_holes(float max_hole_perimeter) { CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, 
temp_storage_bytes, cu_new_loop_bound_loop_ids, - new_num_loop_boundaries + new_num_loop_boundaries, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_bound_loop_ids, - new_num_loop_boundaries + new_num_loop_boundaries, stream )); // Calculate new vertex positions as average of loop vertices Vec3f* cu_new_loop_bound_centers; CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); - compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->edges.ptr, cu_new_loop_boundaries, @@ -643,7 +662,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, cu_new_loop_boundaries_offset, - cu_new_loop_boundaries_offset + 1 + cu_new_loop_boundaries_offset + 1, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( @@ -651,29 +671,26 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, cu_new_loop_boundaries_offset, - cu_new_loop_boundaries_offset + 1 + cu_new_loop_boundaries_offset + 1, + stream )); - CUDA_CHECK(cudaFree(cu_new_loop_bound_centers)); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset)); - inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_vertices, cu_new_loop_boundaries_cnt, new_num_bound_loops ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); // Update mesh this->vertices.extend(new_num_bound_loops); this->faces.extend(new_num_loop_boundaries); - 
copy_vec3f_to_float3_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + copy_vec3f_to_float3_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_vertices, new_num_bound_loops, this->vertices.ptr + V ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices)); - connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, cu_new_loop_boundaries, cu_new_loop_bound_loop_ids, @@ -682,6 +699,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) { this->faces.ptr + F ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_new_loop_bound_centers)); + CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset)); + CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(cudaFree(cu_new_vertices)); CUDA_CHECK(cudaFree(cu_new_loop_boundaries)); CUDA_CHECK(cudaFree(cu_new_loop_bound_loop_ids)); @@ -763,6 +785,7 @@ static __global__ void index_vertice_kernel( void CuMesh::repair_non_manifold_edges(){ + cudaStream_t stream = current_stream(); // Always recompute manifold_face_adj to ensure it's up to date // especially after operations like simplify() that modify the mesh this->get_manifold_face_adjacency(); @@ -773,7 +796,7 @@ void CuMesh::repair_non_manifold_edges(){ // Construct vertex adjacency pairs with manifold edges int2* cu_vertex_adj_pairs; CUDA_CHECK(cudaMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); - construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, this->faces.ptr, cu_vertex_adj_pairs, @@ -784,16 +807,17 @@ void CuMesh::repair_non_manifold_edges(){ // Iterative Hook and Compress int* cu_vertex_ids; CUDA_CHECK(cudaMalloc(&cu_vertex_ids, 
3 * F * sizeof(int))); - arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, 3 * F); + arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vertex_ids, 3 * F); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_adj_pairs, 2 * M, cu_vertex_ids, @@ -802,12 +826,13 @@ void CuMesh::repair_non_manifold_edges(){ CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_vertex_ids, 3 * F ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); CUDA_CHECK(cudaFree(cu_vertex_adj_pairs)); @@ -818,7 +843,7 @@ void CuMesh::repair_non_manifold_edges(){ int new_V = compress_ids(cu_vertex_ids, 3 * F, this->cub_temp_storage, cu_new_vertices_ids); float3* cu_new_vertices; CUDA_CHECK(cudaMalloc(&cu_new_vertices, new_V * sizeof(float3))); - index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_new_vertices_ids, this->faces.ptr, this->vertices.ptr, @@ -826,13 +851,14 @@ void CuMesh::repair_non_manifold_edges(){ cu_new_vertices ); 
CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices_ids)); this->vertices.resize(new_V); - CUDA_CHECK(cudaMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaFree(cu_new_vertices)); + CUDA_CHECK(cudaMemcpyAsync(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice, stream)); this->faces.resize(F); - copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, F, this->faces.ptr); + copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vertex_ids, F, this->faces.ptr); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_new_vertices_ids)); + CUDA_CHECK(cudaFree(cu_new_vertices)); CUDA_CHECK(cudaFree(cu_vertex_ids)); // Delete all cached info since mesh has changed @@ -874,6 +900,7 @@ static __global__ void mark_non_manifold_faces_kernel( void CuMesh::remove_non_manifold_faces() { + cudaStream_t stream = current_stream(); // Get edge-face adjacency information if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { this->get_edge_face_adjacency(); @@ -887,10 +914,10 @@ void CuMesh::remove_non_manifold_faces() { // Initialize face mask (1 = keep all faces initially) uint8_t* cu_face_keep_mask; CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); - CUDA_CHECK(cudaMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); + CUDA_CHECK(cudaMemsetAsync(cu_face_keep_mask, 1, F * sizeof(uint8_t), stream)); // Mark faces on non-manifold edges for removal - mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edge2face.ptr, this->edge2face_offset.ptr, this->edge2face_cnt.ptr, @@ -916,6 +943,7 @@ struct GreaterThanOrEqualToOp { void CuMesh::remove_small_connected_components(float min_area) { + cudaStream_t stream = current_stream(); if 
(this->conn_comp_ids.is_empty()) { this->get_connected_components(); } @@ -936,14 +964,14 @@ void CuMesh::remove_small_connected_components(float min_area) { nullptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, - F + F, 0, sizeof(int) * 8, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, - F + F, 0, sizeof(int) * 8, stream )); // 2. Find unique components and get the number of faces in each. @@ -957,17 +985,18 @@ void CuMesh::remove_small_connected_components(float min_area) { nullptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, - F + F, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, - F + F, stream )); int num_conn_comps; - CUDA_CHECK(cudaMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_conn_comps)); CUDA_CHECK(cudaFree(cu_sorted_conn_comp_ids)); CUDA_CHECK(cudaFree(cu_unique_conn_comp_ids)); @@ -979,15 +1008,14 @@ void CuMesh::remove_small_connected_components(float min_area) { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_conn_comp_num_faces, cu_conn_comp_offsets, - num_conn_comps + 1 + num_conn_comps + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_conn_comp_num_faces, 
cu_conn_comp_offsets, - num_conn_comps + 1 + num_conn_comps + 1, stream )); - CUDA_CHECK(cudaFree(cu_conn_comp_num_faces)); float *cu_conn_comp_areas; CUDA_CHECK(cudaMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); @@ -996,7 +1024,8 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, cu_conn_comp_offsets, - cu_conn_comp_offsets + 1 + cu_conn_comp_offsets + 1, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( @@ -1004,15 +1033,14 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, cu_conn_comp_offsets, - cu_conn_comp_offsets + 1 + cu_conn_comp_offsets + 1, + stream )); - CUDA_CHECK(cudaFree(cu_sorted_face_areas)); - CUDA_CHECK(cudaFree(cu_conn_comp_offsets)); // 4. Create a "keep" mask for components with area >= min_area. uint8_t* cu_comp_keep_mask; CUDA_CHECK(cudaMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); - compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_conn_comp_areas, min_area, num_conn_comps, @@ -1020,23 +1048,26 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_comp_keep_mask ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_conn_comp_areas)); // 5. Propagate the component "keep" mask to every face. 
uint8_t* cu_face_keep_mask; CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); // Use an index_kernel (gather operation) - index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_comp_keep_mask, // Source array this->conn_comp_ids.ptr, // Indices to gather from F, cu_face_keep_mask // Destination array ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_comp_keep_mask)); // 6. Select the faces to keep and update the mesh. this->_remove_faces(cu_face_keep_mask); + CUDA_CHECK(cudaFree(cu_conn_comp_num_faces)); + CUDA_CHECK(cudaFree(cu_sorted_face_areas)); + CUDA_CHECK(cudaFree(cu_conn_comp_offsets)); + CUDA_CHECK(cudaFree(cu_conn_comp_areas)); + CUDA_CHECK(cudaFree(cu_comp_keep_mask)); CUDA_CHECK(cudaFree(cu_face_keep_mask)); } @@ -1158,6 +1189,7 @@ static __global__ void inplace_flip_faces_with_flags_kernel( void CuMesh::unify_face_orientations() { + cudaStream_t stream = current_stream(); if (this->manifold_face_adj.is_empty()) { this->get_manifold_face_adjacency(); } @@ -1165,7 +1197,7 @@ void CuMesh::unify_face_orientations() { // 1. Compute the flipped flag for each edge. uint8_t* cu_flipped; CUDA_CHECK(cudaMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t))); - get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, this->faces.ptr, this->manifold_face_adj.size, @@ -1176,16 +1208,17 @@ void CuMesh::unify_face_orientations() { // 2. Hook edges with flipped flag. 
int* conn_comp_with_flip; CUDA_CHECK(cudaMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int))); - arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(conn_comp_with_flip, this->faces.size, 2); + arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(conn_comp_with_flip, this->faces.size, 2); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, cu_flipped, this->manifold_face_adj.size, @@ -1195,25 +1228,27 @@ void CuMesh::unify_face_orientations() { CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( conn_comp_with_flip, this->faces.size ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); // 3. Flip the orientation of the faces. 
- inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->faces.ptr, conn_comp_with_flip, this->faces.size ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_flipped)); CUDA_CHECK(cudaFree(conn_comp_with_flip)); } -} // namespace cumesh \ No newline at end of file +} // namespace cumesh diff --git a/src/connectivity.cu b/src/connectivity.cu index 6e2f5fe..de0471e 100644 --- a/src/connectivity.cu +++ b/src/connectivity.cu @@ -1,6 +1,7 @@ #include "cumesh.h" #include "shared.h" +#include #include @@ -8,7 +9,7 @@ namespace cumesh { /** * Get count of neighboring faces for each vertex - * + * * @param faces: the faces of the mesh, shape (F) * @param F: the number of faces * @param neighbor_face_cnt: the buffer for neighbor face count, shape (V+1) @@ -31,7 +32,7 @@ static __global__ void get_neighbor_face_cnt_kernel( /** * Fill the neighboring face ids for each vertex - * + * * @param faces: the faces of the mesh, shape (F) * @param F: the number of faces * @param neighbor_face_ids: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) @@ -57,13 +58,14 @@ static __global__ void fill_neighbor_face_ids_kernel( void CuMesh::get_vertex_face_adjacency() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; size_t V = this->vertices.size; // get neighboring face count for each vertex this->vert2face_cnt.resize(V + 1); this->vert2face_cnt.zero(); - get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->vert2face_cnt.ptr); + get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->vert2face_cnt.ptr); CUDA_CHECK(cudaGetLastError()); // allocate memory for neighboring face ids @@ -72,19 +74,19 @@ void CuMesh::get_vertex_face_adjacency() { 
CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2face_cnt.ptr, this->vert2face_offset.ptr, - V + 1 + V + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2face_cnt.ptr, this->vert2face_offset.ptr, - V + 1 + V + 1, stream )); this->vert2face.resize(F*3); // fill neighboring face ids for each vertex this->vert2face_cnt.zero(); - fill_neighbor_face_ids_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, + fill_neighbor_face_ids_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->vert2face.ptr, this->vert2face_offset.ptr, this->vert2face_cnt.ptr @@ -95,7 +97,7 @@ void CuMesh::get_vertex_face_adjacency() { /** * Expand edges for each triangle face - * + * * @param faces: the faces of the mesh, shape (F) * @param F: the number of faces * @param edges: the buffer for edges, shape (F*3) @@ -110,7 +112,7 @@ static __global__ void expand_edges_kernel( int base = tid * 3; int3 f = faces[tid]; - + // expand edges edges[base + 0] = ((uint64_t)min(f.x, f.y) << 32) | max(f.x, f.y); edges[base + 1] = ((uint64_t)min(f.y, f.z) << 32) | max(f.y, f.z); @@ -119,9 +121,10 @@ static __global__ void expand_edges_kernel( void CuMesh::get_edges() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; this->edges.resize(F * 3); - expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->edges.ptr); + expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->faces.ptr, F, this->edges.ptr); CUDA_CHECK(cudaGetLastError()); // sort edges @@ -131,14 +134,14 @@ void CuMesh::get_edges() { nullptr, temp_storage_bytes, this->edges.ptr, reinterpret_cast(this->temp_storage.ptr), - F * 3 + F * 3, 0, sizeof(uint64_t) * 8, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortKeys( 
this->cub_temp_storage.ptr, temp_storage_bytes, this->edges.ptr, reinterpret_cast(this->temp_storage.ptr), - F * 3 + F * 3, 0, sizeof(uint64_t) * 8, stream )); // unique edges @@ -148,15 +151,16 @@ void CuMesh::get_edges() { CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, reinterpret_cast(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges, - F * 3 + F * 3, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, reinterpret_cast(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges, - F * 3 + F * 3, stream )); - CUDA_CHECK(cudaMemcpy(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->edge2face_cnt.size = this->edges.size; CUDA_CHECK(cudaFree(num_edges)); } @@ -164,7 +168,7 @@ void CuMesh::get_edges() { /** * Get edge-face adjacency - * + * * @param faces: the faces of the mesh, shape (F) * @param edges: the buffer for edges, shape (E) * @param edge2face_cnt: the buffer for edge duplication number, shape (E) @@ -217,11 +221,12 @@ static __global__ void get_edge_face_adjacency_kernel( void CuMesh::get_edge_face_adjacency() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { this->get_edges(); } if (this->vert2face.is_empty() || this->vert2face_offset.is_empty()) { - this->get_vertex_face_adjacency(); + this->get_vertex_face_adjacency(); } size_t F = this->faces.size; size_t E = this->edges.size; @@ -232,25 +237,26 @@ void CuMesh::get_edge_face_adjacency() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->edge2face_cnt.ptr, this->edge2face_offset.ptr, - E + 1 + E + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); 
CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->edge2face_cnt.ptr, this->edge2face_offset.ptr, - E + 1 + E + 1, stream )); // allocate memory for edge2face int total_edge_face_cnt; - CUDA_CHECK(cudaMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); this->edge2face.resize(total_edge_face_cnt); // allocate memory for face2edge this->face2edge.resize(F); // get edge-face adjacency - get_edge_face_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_edge_face_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->faces.ptr, this->edges.ptr, this->edge2face_cnt.ptr, @@ -267,7 +273,7 @@ void CuMesh::get_edge_face_adjacency() { /** * Get vertex adjacent edge number - * + * * @param edges: the buffer for edges, shape (E) * @param E: the number of edges * @param vert2edge_cnt: the buffer for vertex adjacent edge number, shape (V) @@ -293,7 +299,7 @@ static __global__ void get_vertex_edge_cnt_kernel( /** * Get vertex-edge adjacency - * + * * @param edges: the buffer for edges, shape (E) * @param E: the number of edges * @param vert2edge: the buffer for vertex to edge adjacency, shape (total_vertex_edge_cnt) @@ -322,6 +328,7 @@ static __global__ void get_vertex_edge_adjacency_kernel( void CuMesh::get_vertex_edge_adjacency() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty()) { this->get_edges(); } @@ -331,7 +338,7 @@ void CuMesh::get_vertex_edge_adjacency() { // get vertex adjacent edge number this->vert2edge_cnt.resize(V + 1); this->vert2edge_cnt.zero(); - get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, E, 
this->vert2edge_cnt.ptr ); CUDA_CHECK(cudaGetLastError()); @@ -342,19 +349,19 @@ void CuMesh::get_vertex_edge_adjacency() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, - V + 1 + V + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, - V + 1 + V + 1, stream )); // get vertex-edge adjacency this->vert2edge.resize(2 * E); this->vert2edge_cnt.zero(); - get_vertex_edge_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_edge_adjacency_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, E, this->vert2edge.ptr, this->vert2edge_offset.ptr, @@ -366,7 +373,7 @@ void CuMesh::get_vertex_edge_adjacency() { /** * Set vertex boundary indicator - * + * * @param edges: the buffer for edges, shape (E) * @param boundaries: the buffer for boundary edges, shape (B) * @param edge2face_cnt: the buffer for edge duplication number, shape (E) @@ -408,6 +415,7 @@ struct is_boundary_edge { void CuMesh::get_boundary_info() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { this->get_edges(); } @@ -419,21 +427,25 @@ void CuMesh::get_boundary_info() { CUDA_CHECK(cudaMalloc(&cu_num_boundary, sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); this->boundaries.resize(E); - arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); + arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_edge_idx, E); + CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, - is_boundary_edge{this->edge2face_cnt.ptr} + is_boundary_edge{this->edge2face_cnt.ptr}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); 
CUDA_CHECK(cub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, - is_boundary_edge{this->edge2face_cnt.ptr} + is_boundary_edge{this->edge2face_cnt.ptr}, + stream )); - CUDA_CHECK(cudaMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_boundary)); CUDA_CHECK(cudaFree(cu_edge_idx)); @@ -441,7 +453,7 @@ void CuMesh::get_boundary_info() { this->vert_is_boundary.resize(this->vertices.size); this->vert_is_boundary.zero(); if (this->boundaries.size > 0) { - set_boundary_vertex_kernel<<<(this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_boundary_vertex_kernel<<<(this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, this->boundaries.ptr, this->edge2face_cnt.ptr, this->boundaries.size, this->vert_is_boundary.ptr ); @@ -474,7 +486,7 @@ static __global__ void get_vertex_boundary_cnt_kernel( /** * Get vertex-boundary adjacency - * + * * @param edges: the buffer for edges, shape (E) * @param boundaries: the buffer for boundary edges, shape (B) * @param B: the number of boundary edges @@ -507,9 +519,10 @@ static __global__ void get_vertex_boundary_adjacency_kernel( void CuMesh::get_vertex_boundary_adjacency() { + cudaStream_t stream = current_stream(); if (this->edges.is_empty()) { this->get_edges(); - } + } if (this->boundaries.is_empty()) { this->get_boundary_info(); } @@ -528,7 +541,7 @@ void CuMesh::get_vertex_boundary_adjacency() { // get vertex adjacent boundary number this->vert2bound_cnt.resize(V + 1); this->vert2bound_cnt.zero(); - get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, 
this->boundaries.ptr, B, this->vert2bound_cnt.ptr ); CUDA_CHECK(cudaGetLastError()); @@ -539,19 +552,19 @@ void CuMesh::get_vertex_boundary_adjacency() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, - V + 1 + V + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, - V + 1 + V + 1, stream )); // get vertex-boundary adjacency this->vert2bound.resize(2 * B); this->vert2bound_cnt.zero(); - get_vertex_boundary_adjacency_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_boundary_adjacency_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, this->boundaries.ptr, B, this->vert2bound.ptr, this->vert2bound_offset.ptr, @@ -592,10 +605,11 @@ static __global__ void get_vertex_is_manifold_kernel( } vert_is_manifold[tid] = is_manifold ? 
1 : 0; -} +} void CuMesh::get_vertex_is_manifold() { + cudaStream_t stream = current_stream(); if (this->vert2edge.is_empty() || this->vert2edge_offset.is_empty()) { this->get_vertex_edge_adjacency(); } @@ -606,7 +620,7 @@ void CuMesh::get_vertex_is_manifold() { // get vertex is manifold this->vert_is_manifold.resize(V); - get_vertex_is_manifold_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_is_manifold_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vert2edge.ptr, this->vert2edge_offset.ptr, this->edge2face_cnt.ptr, @@ -619,7 +633,7 @@ void CuMesh::get_vertex_is_manifold() { /** * Set manifold face adjacency - * + * * @param manifold_edge_idx: the buffer for manifold edge index, shape (M) * @param edge2face: the buffer for edge to face adjacency, shape (total_edge_face_cnt) * @param edge2face_offset: the buffer for edge to face adjacency offset, shape (E+1) @@ -660,6 +674,7 @@ struct is_manifold_edge { void CuMesh::get_manifold_face_adjacency() { + cudaStream_t stream = current_stream(); if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { this->get_edge_face_adjacency(); } @@ -671,29 +686,32 @@ void CuMesh::get_manifold_face_adjacency() { CUDA_CHECK(cudaMalloc(&cu_num_manifold_edges, sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_manifold_edge_idx, E * sizeof(int))); - arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); + arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_edge_idx, E); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, E, - is_manifold_edge{this->edge2face_cnt.ptr} + is_manifold_edge{this->edge2face_cnt.ptr}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, 
cu_num_manifold_edges, E, - is_manifold_edge{this->edge2face_cnt.ptr} + is_manifold_edge{this->edge2face_cnt.ptr}, + stream )); int manifold_edge_count; - CUDA_CHECK(cudaMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_manifold_edges)); CUDA_CHECK(cudaFree(cu_edge_idx)); // set manifold_face_adj this->manifold_face_adj.resize(manifold_edge_count); - set_manifold_face_adj_kernel<<<(manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_manifold_face_adj_kernel<<<(manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_manifold_edge_idx, this->edge2face.ptr, this->edge2face_offset.ptr, @@ -701,6 +719,7 @@ void CuMesh::get_manifold_face_adjacency() { this->manifold_face_adj.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_manifold_edge_idx)); } @@ -737,6 +756,7 @@ struct is_manifold_boundary_vertex { void CuMesh::get_manifold_boundary_adjacency() { + cudaStream_t stream = current_stream(); if (this->vert2bound.is_empty() || this->vert2bound_offset.is_empty()) { this->get_vertex_boundary_adjacency(); } @@ -751,23 +771,26 @@ void CuMesh::get_manifold_boundary_adjacency() { CUDA_CHECK(cudaMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_vert_idx, V * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_manifold_vert_idx, V * sizeof(int))); - arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vert_idx, V); + arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_vert_idx, V); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, - is_manifold_boundary_vertex{this->vert_is_manifold.ptr, 
this->vert_is_boundary.ptr} + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr}, + stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, - is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr}, + stream )); int manifold_boundary_vert_count; - CUDA_CHECK(cudaMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_manifold_boundary_verts)); CUDA_CHECK(cudaFree(cu_vert_idx)); @@ -779,7 +802,7 @@ void CuMesh::get_manifold_boundary_adjacency() { // set manifold_bound_adj this->manifold_bound_adj.resize(manifold_boundary_vert_count); - set_manifold_bound_adj_kernel<<<(manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + set_manifold_bound_adj_kernel<<<(manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_manifold_vert_idx, this->vert2bound.ptr, this->vert2bound_offset.ptr, @@ -787,10 +810,13 @@ void CuMesh::get_manifold_boundary_adjacency() { this->manifold_bound_adj.ptr ); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_manifold_vert_idx)); } void CuMesh::get_connected_components() { + cudaStream_t stream = current_stream(); if (this->manifold_face_adj.is_empty()) { this->get_manifold_face_adjacency(); } @@ -800,16 +826,17 @@ void CuMesh::get_connected_components() { // Iterative Hook and Compress this->conn_comp_ids.resize(F); - arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, 
BLOCK_SIZE>>>(this->conn_comp_ids.ptr, F); + arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->conn_comp_ids.ptr, F); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_face_adj.ptr, M, this->conn_comp_ids.ptr, @@ -818,12 +845,13 @@ void CuMesh::get_connected_components() { CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->conn_comp_ids.ptr, F ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); @@ -833,6 +861,7 @@ void CuMesh::get_connected_components() { void CuMesh::get_boundary_connected_components() { + cudaStream_t stream = current_stream(); if (this->manifold_bound_adj.is_empty()) { this->get_manifold_boundary_adjacency(); } @@ -847,16 +876,17 @@ void CuMesh::get_boundary_connected_components() { // Iterative Hook and Compress this->bound_conn_comp_ids.resize(B); - arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->bound_conn_comp_ids.ptr, B); + arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(this->bound_conn_comp_ids.ptr, B); CUDA_CHECK(cudaGetLastError()); int* cu_end_flag; int 
h_end_flag; CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); // Hook - hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->manifold_bound_adj.ptr, M, this->bound_conn_comp_ids.ptr, @@ -865,12 +895,13 @@ void CuMesh::get_boundary_connected_components() { CUDA_CHECK(cudaGetLastError()); // Compress - compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->bound_conn_comp_ids.ptr, B ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } while (h_end_flag == 0); CUDA_CHECK(cudaFree(cu_end_flag)); @@ -926,6 +957,7 @@ static __global__ void is_bound_conn_comp_loop_kernel( void CuMesh::get_boundary_loops() { + cudaStream_t stream = current_stream(); if (this->bound_conn_comp_ids.is_empty()) { this->get_boundary_connected_components(); } @@ -941,13 +973,13 @@ void CuMesh::get_boundary_loops() { // Check if boundary components are loops int* cu_is_bound_conn_comp_loop; CUDA_CHECK(cudaMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); - fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_is_bound_conn_comp_loop, this->num_bound_conn_comps, 1 ); CUDA_CHECK(cudaGetLastError()); - is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, 
BLOCK_SIZE>>>( + is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->edges.ptr, this->boundaries.ptr, this->bound_conn_comp_ids.ptr, @@ -964,22 +996,23 @@ void CuMesh::get_boundary_loops() { nullptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, - this->num_bound_conn_comps + this->num_bound_conn_comps, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, - this->num_bound_conn_comps + this->num_bound_conn_comps, stream )); - CUDA_CHECK(cudaMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_num_bound_loops)); if (this->num_bound_loops == 0) { CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); return; } - + // Sort boundaries by connected component ids int *cu_bound_sorted, *cu_bound_conn_comp_ids_sorted; CUDA_CHECK(cudaMalloc(&cu_bound_sorted, B * sizeof(int))); @@ -989,27 +1022,26 @@ void CuMesh::get_boundary_loops() { nullptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, - B + B, 0, sizeof(int) * 8, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, - B + B, 0, sizeof(int) * 8, stream )); // Select loops int* cu_bound_is_on_loop; CUDA_CHECK(cudaMalloc(&cu_bound_is_on_loop, B * sizeof(int))); - index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( cu_is_bound_conn_comp_loop, 
cu_bound_conn_comp_ids_sorted, B, cu_bound_is_on_loop ); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); this->loop_boundaries.resize(B); int *cu_loop_bound_conn_comp_ids_sorted, *cu_num_bound_on_loop; CUDA_CHECK(cudaMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); @@ -1018,34 +1050,37 @@ void CuMesh::get_boundary_loops() { CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, - B + B, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, - B + B, stream )); int num_bound_on_loop; - CUDA_CHECK(cudaMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); CUDA_CHECK(cudaFree(cu_bound_sorted)); this->loop_boundaries.resize(num_bound_on_loop); temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, - B + B, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, - B + B, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_bound_conn_comp_ids_sorted)); CUDA_CHECK(cudaFree(cu_bound_is_on_loop)); CUDA_CHECK(cudaFree(cu_num_bound_on_loop)); - + // RLE this->loop_boundaries_offset.resize(this->num_bound_loops + 1); 
this->loop_boundaries_offset.zero(); @@ -1057,15 +1092,16 @@ void CuMesh::get_boundary_loops() { nullptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, - num_bound_on_loop + num_bound_on_loop, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, - num_bound_on_loop + num_bound_on_loop, stream )); + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(cu_loop_bound_conn_comp_ids_sorted)); CUDA_CHECK(cudaFree(cu_rle_unique_out)); CUDA_CHECK(cudaFree(cu_rle_num_runs)); @@ -1075,13 +1111,13 @@ void CuMesh::get_boundary_loops() { CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, - this->num_bound_loops + 1 + this->num_bound_loops + 1, stream )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, - this->num_bound_loops + 1 + this->num_bound_loops + 1, stream )); } diff --git a/src/geometry.cu b/src/geometry.cu index 0e493ba..45b8492 100644 --- a/src/geometry.cu +++ b/src/geometry.cu @@ -24,9 +24,10 @@ static __global__ void compute_face_areas_kernel( void CuMesh::compute_face_areas() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; this->face_areas.resize(F); - compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->faces.ptr, F, @@ -57,9 +58,10 @@ static __global__ void compute_face_normals_kernel( void CuMesh::compute_face_normals() { + cudaStream_t stream = current_stream(); size_t F = this->faces.size; this->face_normals.resize(F); - 
compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->faces.ptr, F, @@ -113,9 +115,10 @@ void CuMesh::compute_vertex_normals() { this->get_vertex_face_adjacency(); } + cudaStream_t stream = current_stream(); size_t V = this->vertices.size; this->vertex_normals.resize(V); - compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( this->vertices.ptr, this->faces.ptr, this->vert2face.ptr, diff --git a/src/hash/hash.cu b/src/hash/hash.cu index a9b1c23..b691096 100644 --- a/src/hash/hash.cu +++ b/src/hash/hash.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include "api.h" #include "hash.cuh" @@ -32,9 +33,12 @@ static void dispatch_hashmap_insert_cuda( const torch::Tensor& keys, const torch::Tensor& values ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_insert_cuda_kernel<<< (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), keys.size(0), @@ -111,9 +115,12 @@ static void dispatch_hashmap_lookup_cuda( const torch::Tensor& keys, torch::Tensor& values ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_lookup_cuda_kernel<<< (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), keys.size(0), @@ -205,9 +212,12 @@ static void dispatch_hashmap_insert_3d_cuda( const torch::Tensor& values, int W, int H, int D ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_insert_3d_cuda_kernel<<< (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), coords.size(0), @@ -303,9 +313,12 @@ static void dispatch_hashmap_lookup_3d_cuda( torch::Tensor& values, int 
W, int H, int D ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_lookup_3d_cuda_kernel<<< (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), coords.size(0), @@ -395,9 +408,12 @@ static void dispatch_hashmap_insert_3d_idx_as_val_cuda( const torch::Tensor& coords, int W, int H, int D ) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); hashmap_insert_3d_idx_as_val_cuda_kernel<<< (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE, - BLOCK_SIZE + BLOCK_SIZE, + 0, + stream >>>( hashmap_keys.size(0), coords.size(0), diff --git a/src/remesh/simple_dual_contour.cu b/src/remesh/simple_dual_contour.cu index e064ee3..1629cd1 100644 --- a/src/remesh/simple_dual_contour.cu +++ b/src/remesh/simple_dual_contour.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "api.h" @@ -181,6 +182,7 @@ std::tuple cumesh::simple_dual_contour( ) { const size_t M = coords.size(0); const size_t N_vert = hashmap_keys.size(0); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); auto vertices = torch::empty({(long)M, 3}, torch::dtype(torch::kFloat32).device(coords.device())); auto intersected = torch::empty({(long)M, 3}, torch::dtype(torch::kInt32).device(coords.device())); @@ -189,7 +191,7 @@ std::tuple cumesh::simple_dual_contour( dim3 blocks((M + BLOCK_SIZE - 1) / BLOCK_SIZE); if (hashmap_keys.dtype() == torch::kUInt32) { - simple_dual_contour_kernel<<>>( + simple_dual_contour_kernel<<>>( N_vert, M, W, H, D, @@ -200,9 +202,9 @@ std::tuple cumesh::simple_dual_contour( vertices.data_ptr(), intersected.data_ptr() ); - } + } else if (hashmap_keys.dtype() == torch::kUInt64) { - simple_dual_contour_kernel<<>>( + simple_dual_contour_kernel<<>>( N_vert, M, W, H, D, @@ -213,7 +215,7 @@ std::tuple cumesh::simple_dual_contour( vertices.data_ptr(), intersected.data_ptr() ); - } + } else { TORCH_CHECK(false, "Unsupported hashmap data type"); } diff --git 
a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu index 6f1d517..d6b6511 100644 --- a/src/remesh/svox2vert.cu +++ b/src/remesh/svox2vert.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include "api.h" #include "../utils.h" @@ -147,10 +148,11 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Get the number of active vertices for each voxel size_t N = hashmap_keys.size(0); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); int* num_vertices; CUDA_CHECK(cudaMalloc(&num_vertices, (M + 1) * sizeof(int))); if (hashmap_keys.dtype() == torch::kUInt32) { - get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -162,7 +164,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( num_vertices ); } else if (hashmap_keys.dtype() == torch::kUInt64) { - get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -180,18 +182,19 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Compute the offset size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1, stream); void* d_temp_storage = nullptr; CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); - CUDA_CHECK(cudaFree(d_temp_storage)); + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1, stream); int total_vertices; - CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + 
CUDA_CHECK(cudaFree(d_temp_storage)); // Set the active vertices for each voxel auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); if (hashmap_keys.dtype() == torch::kUInt32) { - set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -205,7 +208,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( ); } else if (hashmap_keys.dtype() == torch::kUInt64) { - set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( N, M, W, @@ -220,7 +223,8 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( } CUDA_CHECK(cudaGetLastError()); - // Free the temporary memory + // Free the temporary memory — sync stream first so set_vertex kernel is done + CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(num_vertices)); return vertices; diff --git a/src/shared.h b/src/shared.h index 66ecac7..70eda3e 100644 --- a/src/shared.h +++ b/src/shared.h @@ -3,12 +3,17 @@ #include #include #include +#include #include "utils.h" #include "cumesh.h" namespace cumesh { +inline cudaStream_t current_stream() { + return at::cuda::getCurrentCUDAStream().stream(); +} + template __global__ void arange_kernel(T* array, int N, int stride=1) { @@ -158,33 +163,33 @@ __global__ void get_diff_kernel( template int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nullptr) { + cudaStream_t stream = current_stream(); int *cu_indices, *cu_indices_argsorted; + int *cu_num = nullptr; T *cu_ids_sorted; CUDA_CHECK(cudaMalloc(&cu_indices, N * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_indices_argsorted, N * sizeof(int))); CUDA_CHECK(cudaMalloc(&cu_ids_sorted, N * sizeof(T))); - arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_indices, N); + arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(cu_indices, N); 
     CUDA_CHECK(cudaGetLastError());

     size_t temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         nullptr, temp_storage_bytes,
         ids, cu_ids_sorted,
         cu_indices, cu_indices_argsorted,
-        N
+        N, 0, sizeof(T) * 8, stream
     ));
     cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
         cub_temp_storage.ptr, temp_storage_bytes,
         ids, cu_ids_sorted,
         cu_indices, cu_indices_argsorted,
-        N
+        N, 0, sizeof(T) * 8, stream
     ));
-    CUDA_CHECK(cudaFree(cu_indices));
-
     // get diff
     T* cu_new_ids;
     CUDA_CHECK(cudaMalloc(&cu_new_ids, N * sizeof(T)));
-    get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_ids_sorted,
         cu_new_ids,
         N
@@ -193,40 +198,37 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
     // inverse
     if (inverse) {
-        int* cu_num;
         CUDA_CHECK(cudaMalloc(&cu_num, sizeof(int)));
         temp_storage_bytes = 0;
         CUDA_CHECK(cub::DeviceSelect::Flagged(
             nullptr, temp_storage_bytes,
             cu_ids_sorted, cu_new_ids,
             inverse, cu_num,
-            N
+            N, stream
         ));
         cub_temp_storage.resize(temp_storage_bytes);
         CUDA_CHECK(cub::DeviceSelect::Flagged(
             cub_temp_storage.ptr, temp_storage_bytes,
             cu_ids_sorted, cu_new_ids,
             inverse, cu_num,
-            N
+            N, stream
         ));
-        CUDA_CHECK(cudaFree(cu_num));
     }
-    CUDA_CHECK(cudaFree(cu_ids_sorted));
-
+
     // scan diff
     temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         cu_new_ids,
-        N
+        N, stream
     ));
     cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         cub_temp_storage.ptr, temp_storage_bytes,
         cu_new_ids,
-        N
+        N, stream
     ));
-
+
     // scatter
-    scatter_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    scatter_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         cu_indices_argsorted,
         cu_new_ids,
         N,
@@ -234,10 +236,16 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
     );
     CUDA_CHECK(cudaGetLastError());

     T num_components;
-    CUDA_CHECK(cudaMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     num_components += 1;
+
+    // Free all scratch memory — stream is synced, all GPU work is done
+    CUDA_CHECK(cudaFree(cu_indices));
+    CUDA_CHECK(cudaFree(cu_ids_sorted));
     CUDA_CHECK(cudaFree(cu_new_ids));
     CUDA_CHECK(cudaFree(cu_indices_argsorted));
+    if (cu_num) CUDA_CHECK(cudaFree(cu_num));
     return static_cast<int>(num_components);
 }
@@ -247,6 +255,7 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu

 template <typename T>
 void print_max_val(T* ptr, size_t size) {
+    cudaStream_t stream = current_stream();
     T* dbg_cu_max_val;
     CUDA_CHECK(cudaMalloc(&dbg_cu_max_val, sizeof(T)));
     size_t temp_storage_bytes = 0;
@@ -254,7 +263,7 @@ void print_max_val(T* ptr, size_t size) {
         nullptr, temp_storage_bytes,
         ptr,
         dbg_cu_max_val,
-        size
+        size, stream
     ));
     char* temp_storage;
     CUDA_CHECK(cudaMalloc(&temp_storage, temp_storage_bytes));
@@ -262,10 +271,11 @@ void print_max_val(T* ptr, size_t size) {
         temp_storage, temp_storage_bytes,
         ptr,
         dbg_cu_max_val,
-        size
+        size, stream
     ));
     T h_max_val;
-    CUDA_CHECK(cudaMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     std::cout << "Max value: " << h_max_val << std::endl;
     CUDA_CHECK(cudaFree(dbg_cu_max_val));
     CUDA_CHECK(cudaFree(temp_storage));
@@ -273,8 +283,10 @@ void print_max_val(T* ptr, size_t size) {

 template <typename T>
 void print_val(T* ptr, size_t size) {
+    cudaStream_t stream = current_stream();
     T h_ptr[size];
-    CUDA_CHECK(cudaMemcpy(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     for (size_t i = 0; i < size; i++) {
         std::cout << h_ptr[i] << " ";
     }
@@ -282,4 +294,4 @@ void print_val(T* ptr, size_t size) {
 }

-} // namespace cumesh
\ No newline at end of file
+} // namespace cumesh
diff --git a/src/simplify.cu b/src/simplify.cu
index 9efde9e..b5bbcbb 100644
--- a/src/simplify.cu
+++ b/src/simplify.cu
@@ -1,6 +1,8 @@
 #include "cumesh.h"
 #include "dtypes.cuh"
 #include <cub/cub.cuh>
+#include <ATen/cuda/CUDAContext.h>
+#include "shared.h"

 namespace cumesh {
@@ -21,7 +23,7 @@ __device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, f

 /**
  * Get the QEM for each vertex
- * 
+ *
  * @param vertices: the vertices of the mesh, shape (V)
  * @param faces: the faces of the mesh, shape (F)
  * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt)
@@ -64,12 +66,13 @@ static __global__ void get_qem_kernel(
 /**
  * Get the QEM for each vertex
  */
 void get_qem(
-    CuMesh& ctx
+    CuMesh& ctx,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
     ctx.temp_storage.resize(V * sizeof(QEM));
-    get_qem_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_qem_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices.ptr,
         ctx.faces.ptr,
         ctx.vert2face.ptr,
@@ -139,7 +142,7 @@ inline __device__ bool process_incident_tri(

 /**
  * Get the cost for each edge collapse
- * 
+ *
  * @param vertices: the vertices of the mesh, shape (V)
  * @param faces: the faces of the mesh, shape (F)
  * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt)
@@ -228,13 +231,14 @@ static __global__ void get_edge_collapse_cost_kernel(
 void get_edge_collapse_cost(
     CuMesh& ctx,
     float lambda_edge_length,
-    float lambda_skinny
+    float lambda_skinny,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
     size_t E = ctx.edges.size;
     ctx.edge_collapse_costs.resize(E);
-    get_edge_collapse_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    get_edge_collapse_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices.ptr,
         ctx.faces.ptr,
         ctx.vert2face.ptr,
@@ -252,7 +256,7 @@ void get_edge_collapse_cost(

 /**
  * Propagate cost to neighboring faces
- * 
+ *
  * @param edges: the buffer for edges, shape (E)
  * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
  * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
@@ -296,14 +300,15 @@ static __global__ void propagate_cost_kernel(
  * Propagate cost to neighboring faces
  */
 void propagate_cost(
-    CuMesh& ctx
+    CuMesh& ctx,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
     size_t E = ctx.edges.size;
     ctx.propagated_costs.resize(F);
     ctx.propagated_costs.fill(std::numeric_limits<float>::max());
-    propagate_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    propagate_cost_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.edges.ptr,
         ctx.vert2face.ptr,
         ctx.vert2face_offset.ptr,
@@ -317,7 +322,7 @@ void propagate_cost(

 /**
  * Collapse edges parallelly
- * 
+ *
  * @param vertices: the vertices of the mesh, shape (V)
  * @param faces: the faces of the mesh, shape (F)
  * @param edges: the buffer for edges, shape (E)
@@ -433,7 +438,7 @@ static __global__ void compress_faces_kernel(
     if (is_kept) {
         new_faces[new_id].x = vertices_map[old_faces[tid].x];
         new_faces[new_id].y = vertices_map[old_faces[tid].y];
-        new_faces[new_id].z = vertices_map[old_faces[tid].z]; 
+        new_faces[new_id].z = vertices_map[old_faces[tid].z];
     }
 }
@@ -443,7 +448,8 @@ static __global__ void compress_faces_kernel(
  */
 void collapse_edges(
     CuMesh& ctx,
-    float collapse_thresh
+    float collapse_thresh,
+    cudaStream_t stream
 ) {
     size_t V = ctx.vertices.size;
     size_t F = ctx.faces.size;
@@ -452,7 +458,7 @@ void collapse_edges(
     size_t E = ctx.edges.size;
     ctx.vertices_map.resize(V + 1);
     ctx.faces_map.resize(F + 1);
     ctx.vertices_map.fill(1);
     ctx.faces_map.fill(1);
-    collapse_edges_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    collapse_edges_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices.ptr,
         ctx.faces.ptr,
         ctx.edges.ptr,
@@ -467,24 +473,25 @@ void collapse_edges(
         ctx.faces_map.ptr
     );
     CUDA_CHECK(cudaGetLastError());
-    
+
     // update vertices buffer
     // get vertices map
     size_t temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        ctx.vertices_map.ptr, V+1
+        ctx.vertices_map.ptr, V+1, stream
     ));
     ctx.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         ctx.cub_temp_storage.ptr, temp_storage_bytes,
-        ctx.vertices_map.ptr, V+1
+        ctx.vertices_map.ptr, V+1, stream
     ));
     int new_num_vertices;
-    CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     // compress vertices
     ctx.temp_storage.resize(new_num_vertices * sizeof(float3));
-    compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.vertices_map.ptr,
         ctx.vertices.ptr,
         V,
@@ -497,18 +504,19 @@ void collapse_edges(
     // get faces map
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        ctx.faces_map.ptr, F+1
+        ctx.faces_map.ptr, F+1, stream
     ));
     ctx.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         ctx.cub_temp_storage.ptr, temp_storage_bytes,
-        ctx.faces_map.ptr, F+1
+        ctx.faces_map.ptr, F+1, stream
     ));
     int new_num_faces;
-    CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     // compress faces
     ctx.temp_storage.resize(new_num_faces * sizeof(int3));
-    compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+    compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         ctx.faces_map.ptr,
         ctx.vertices_map.ptr,
         ctx.faces.ptr,
@@ -521,12 +529,13 @@ void collapse_edges(

 std::tuple<int, int> CuMesh::simplify_step(float lambda_edge_length, float lambda_skinny, float threshold, bool timing) {
+    cudaStream_t stream = current_stream();
     std::chrono::high_resolution_clock::time_point start, end;

     if (timing) start = std::chrono::high_resolution_clock::now();
     this->get_vertex_face_adjacency();
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_vertex_face_adjacency: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
@@ -535,39 +544,39 @@ std::tuple<int, int> CuMesh::simplify_step(float lambda_edge_length, float lambd
     this->get_edges();
     this->get_boundary_info();
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }

     if (timing) start = std::chrono::high_resolution_clock::now();
-    get_qem(*this);
+    get_qem(*this, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_qem: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }

     if (timing) start = std::chrono::high_resolution_clock::now();
-    get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny);
+    get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "get_edge_collapse_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }

     if (timing) start = std::chrono::high_resolution_clock::now();
-    propagate_cost(*this);
+    propagate_cost(*this, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "propagate_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }

     if (timing) start = std::chrono::high_resolution_clock::now();
-    collapse_edges(*this, threshold);
+    collapse_edges(*this, threshold, stream);
     if (timing) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaStreamSynchronize(stream));
         end = std::chrono::high_resolution_clock::now();
         std::cout << "collapse_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
     }
diff --git a/src/utils.h b/src/utils.h
index f15823b..aeaf2d8 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -4,6 +4,7 @@
 #include <cuda_runtime.h>
 #include <cstdio>
 #include <vector>
+#include <ATen/cuda/CUDAContext.h>

 #define CUDA_CHECK(call) \
     do { \
@@ -43,7 +44,11 @@ struct Buffer {
     }

     void free() {
-        if (ptr != nullptr) CUDA_CHECK(cudaFree(ptr));
+        if (ptr != nullptr) {
+            cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+            CUDA_CHECK(cudaStreamSynchronize(stream));
+            CUDA_CHECK(cudaFree(ptr));
+        }
         ptr = nullptr;
         size = 0;
         capacity = 0;
@@ -62,7 +67,9 @@ struct Buffer {
         if (new_size > capacity) {
             T* new_ptr;
             CUDA_CHECK(cudaMalloc(&new_ptr, new_size * sizeof(T)));
-            CUDA_CHECK(cudaMemcpy(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice));
+            cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+            CUDA_CHECK(cudaMemcpyAsync(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaStreamSynchronize(stream));
             CUDA_CHECK(cudaFree(ptr));
             ptr = new_ptr;
             this->capacity = new_size;
@@ -71,12 +78,15 @@ struct Buffer {
     }

     void zero() {
-        CUDA_CHECK(cudaMemset(ptr, 0, size * sizeof(T)));
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+        CUDA_CHECK(cudaMemsetAsync(ptr, 0, size * sizeof(T), stream));
     }

     void fill(T val) {
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
         std::vector<T> tmp(size, val);
-        CUDA_CHECK(cudaMemcpy(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     }
 };