diff --git a/forge_cute_py/kernels/reduce_sum.py b/forge_cute_py/kernels/reduce_sum.py
new file mode 100644
index 0000000..f02fc77
--- /dev/null
+++ b/forge_cute_py/kernels/reduce_sum.py
@@ -0,0 +1,164 @@
+from concurrent.futures import thread
+import os
+# Defaults only: setdefault keeps any value the caller already exported, so
+# importing this module no longer overrides the user's GPU selection or
+# target architecture. These must still be set before the imports below.
+os.environ.setdefault('CUDA_VISIBLE_DEVICES', '0')
+os.environ.setdefault('CUTLASS_CUDA_ARCH', '86')
+os.environ.setdefault('CUTE_DSL_ENABLE_TVM_FFI', '1')
+
+import math
+import torch
+import time
+from cutlass.cute.runtime import from_dlpack
+import cuda.bindings.driver as cuda
+import cutlass.cute as cute
+from cutlass import const_expr
+import cutlass
+
+from cutlass import dsl_user_op
+from cutlass.cute.arch import nvvm
+from cutlass._mlir.dialects.nvvm import AtomicOpKind, MemOrderKind, MemScopeKind
+from cutlass.base_dsl.typing import T
+
+_reduce_sum_last_cache = {}
+_reduce_sum_first_cache = {}
+
+
+# Legacy helper: not called anywhere in this file, kept for future reference.
+@dsl_user_op
+def atomicAddF32(dst_ptr: cute.Pointer, val: cute.Float32, loc=None, ip=None) -> cute.Float32:
+    """Atomically add ``val`` to the float32 value at ``dst_ptr``.
+
+    Emits one ``nvvm.atomicrmw`` FADD with relaxed memory ordering and
+    system-wide scope, and returns that op's result.
+
+    Args:
+        dst_ptr: Destination pointer; its ``llvm_ptr`` is passed to the op.
+        val: Value to add.
+        loc: Forwarded to the op builder unchanged.
+        ip: Forwarded to the op builder unchanged.
+
+    Returns:
+        The result of the atomic op (presumably the value held at
+        ``dst_ptr`` before the add -- confirm against the NVVM docs).
+    """
+    return nvvm.atomicrmw(
+        T.f32(),
+        AtomicOpKind.FADD,
+        dst_ptr.llvm_ptr,
+        val.ir_value(loc=loc, ip=ip),
+        mem_order=MemOrderKind.RELAXED,
+        syncscope=MemScopeKind.SYS,
+        loc=loc,
+        ip=ip,
+    )
+
+
+class ReduceSumLast:
+    """Sum reduction along last dimension using CuTe DSL.
+
+    One thread block is launched per row of the 2D input, and each block
+    writes a single element of ``output``.
+    """
+
+    def __init__(self, dtype: type):
+        # dtype is stored but not read by any method of this class.
+        self.dtype = dtype
+        # 8 warps * 32 threads = 256 threads per block.
+        self.num_warps = 8
+        self.threads_per_block = self.num_warps * 32
+
+    # Host-side entry point: tiles the input and launches one block per row.
+    @cute.jit
+    def __call__(
+        self,
+        input: cute.Tensor,
+        output: cute.Tensor,
+        # stream: cuda.CUstream = None,
+    ):
+        M, N = input.shape
+        # Each tile is one row high and threads_per_block * 4 columns wide,
+        # i.e. 4 elements per thread per tile.
+        # NOTE(review): no tail handling is visible for N that is not a
+        # multiple of the tile width -- confirm how zipped_divide treats it.
+        tiler_mn = (1, self.threads_per_block * 4)
+        gX = cute.zipped_divide(input, tiler_mn)
+
+        # Thread/value layout: threads_per_block threads with 4 values each.
+        thr_layout = cute.make_layout((self.threads_per_block,), stride=(1,))
+        val_layout = cute.make_layout((4,), stride=(1,))
+        _, tv_layout = cute.make_layout_tv(thr_layout, val_layout)
+
+        # grid.x == M: block index doubles as the row index in the kernel.
+        self.kernel(gX, output, tv_layout).launch(
+            grid=(M, 1, 1),
+            block=(self.threads_per_block, 1, 1),
+            # stream=stream,
+        )
+
+    # Device kernel: per-thread partial sum -> warp reduction -> one value per
+    # warp in shared memory -> final reduction by warp 0.
+    @cute.kernel
+    def kernel(self, input: cute.Tensor, output: cute.Tensor, tv_layout):
+        num_warps = const_expr(self.num_warps)
+
+        # 32-element Float32 scratch buffer; only the first num_warps slots
+        # are written below (one per warp).
+        smem_alloc = cutlass.utils.SmemAllocator()
+        shmem = smem_alloc.allocate_tensor(cute.Float32, cute.make_layout((32,)))
+
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, _, _ = cute.arch.block_idx()
+        lane = cute.arch.lane_idx()
+        warp = cute.arch.warp_idx()
+
+        # op = cute.nvgpu.CopyUniversalOp()
+        # atom = cute.make_copy_atom(op, cute.Float32, num_bits_per_copy=128)
+        acc = cute.Float32(0.0)
+
+        # `input` is the zipped_divide result from __call__. Its second mode
+        # is indexed below as (bidx, tile_idx), so mode1[1] is the number of
+        # tiles along the reduced dimension.
+        _, mode1 = input.shape
+        ntiles = mode1[1]
+
+        for tile_idx in range(ntiles):
+            # Select this block's row (bidx) and the current column tile.
+            blk_coord = ((0, None), (bidx, tile_idx))
+            cta_tile = input[blk_coord]
+            thr_frag = cute.composition(cta_tile, tv_layout)
+            thr_src = thr_frag[(tidx, None)]
+            # r = cute.make_rmem_tensor(cute.make_layout((4,), stride=(1,)), cute.Float32)
+            # cute.copy(atom, thr_src, r)
+            # acc += r[0] + r[1] + r[2] + r[3]
+            # NOTE(review): the four elements are added in the input's element
+            # type before being folded into the Float32 accumulator -- confirm
+            # this is acceptable for float16/bfloat16 inputs.
+            acc += thr_src[0] + thr_src[1] + thr_src[2] + thr_src[3]
+
+
+        # Stage 1: reduce within each warp; lane 0 publishes the warp total.
+        acc = cute.arch.warp_reduction_sum(acc)
+        if lane == 0:
+            shmem[warp] = acc
+        cute.arch.sync_threads()
+
+        # Stage 2: warp 0 reduces the per-warp totals; lanes beyond num_warps
+        # contribute 0.0 so unused shared-memory slots are never read.
+        if warp == 0:
+            acc2 = shmem[lane] if lane < num_warps else 0.0
+            acc2 = cute.arch.warp_reduction_sum(acc2)
+            if lane == 0:
+                output[bidx] = acc2
+
+
+class ReduceSumFirst:
+    """Sum reduction along first dimension using CuTe DSL.
+
+    Each thread block reduces a group of 4 adjacent columns of the 2D input.
+    Thread ``tidx`` accumulates column ``tidx % 4`` of the group, starting at
+    row ``tidx // 4``. The partial sums are then exchanged through shared
+    memory so that each warp finishes one column with a warp-level reduction.
+    """
+
+    def __init__(self, dtype: type):
+        # dtype is stored but not read by any method of this class.
+        self.dtype = dtype
+        # 4 warps * 32 threads = 128 threads: 4 columns x 32 row lanes.
+        self.num_warps = 4
+        self.threads_per_block = self.num_warps * 32
+
+    # Host-side entry point: one block per group of 4 columns, rounded up so a
+    # ragged final group is still covered.
+    @cute.jit
+    def __call__(
+        self,
+        input: cute.Tensor,
+        output: cute.Tensor,
+        # stream: cuda.CUstream = None,
+    ):
+        M, N = input.shape
+        yblocks = cute.ceil_div(N, 4)
+        self.kernel(input, output, self.threads_per_block // 4).launch(
+            grid=(yblocks, 1, 1),
+            block=(self.threads_per_block, 1, 1),
+            # stream=stream,
+        )
+
+    # Device kernel. `stride` is the number of rows the block covers per loop
+    # iteration (threads_per_block // 4 at the call site).
+    @cute.kernel
+    def kernel(self, input: cute.Tensor, output: cute.Tensor, stride: int):
+        smem_alloc = cutlass.utils.SmemAllocator()
+        # (4, 32) staging buffer indexed as [column within group, row lane].
+        smem_layout = cute.make_layout((4, 32))
+        shmem = smem_alloc.allocate_tensor(cute.Float32, smem_layout)
+
+        M, N = input.shape
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, _, _ = cute.arch.block_idx()
+        lane_idx = cute.arch.lane_idx()
+        warp_idx = cute.arch.warp_idx()
+
+        max_iters = cute.ceil_div(M, stride)
+        col_offset = tidx % 4
+        row_offset = tidx // 4
+        col = 4 * bidx + col_offset
+        acc = cute.Float32(0)
+
+        row = row_offset
+        for _ in range(max_iters):
+            # Threads past the last row or last column simply keep acc as is.
+            if row < M and col < N:
+                acc = acc + input[row, col]
+            # Advance by the same row step that max_iters was derived from
+            # (previously a hard-coded 32).
+            row = row + stride
+
+        shmem[col_offset, row_offset] = acc
+        cute.arch.sync_threads()
+        # Read back transposed: warp w now holds the 32 partials of column w.
+        acc = shmem[warp_idx, lane_idx]
+
+        acc = cute.arch.warp_reduction_sum(acc)
+        # The grid rounds N up to a multiple of 4, so the last block can own
+        # columns >= N; skip the store for those to stay inside `output`.
+        out_col = bidx * 4 + warp_idx
+        if lane_idx == 0 and out_col < N:
+            output[out_col] = acc
diff --git a/forge_cute_py/ops/reduce_sum.py b/forge_cute_py/ops/reduce_sum.py
index d501eca..e7252c9 100644
--- a/forge_cute_py/ops/reduce_sum.py
+++ b/forge_cute_py/ops/reduce_sum.py
@@ -1,65 +1,58 @@
+import cutlass.cute as cute
 import torch
+from cutlass import BFloat16, Float16, Float32
+from cutlass.cute.runtime import from_dlpack
 
+from forge_cute_py.kernels.reduce_sum import ReduceSumLast, ReduceSumFirst
+
+_compile_cache = {}
 
-@torch.library.custom_op("forge_cute_py::_reduce_sum", mutates_args={"out"})
-def _reduce_sum(x: torch.Tensor, out: torch.Tensor, dim: int = -1, variant: str = "shfl") -> None:
-    """Row/column sum reduction (reference implementation stub).
 
-    Args:
-        x: Input tensor of shape (M, N)
-        out: Output tensor (mutated in-place)
-        dim: Dimension to reduce over (-1, 0, or 1)
-        variant: Reduction variant (naive, improved, shfl) - currently unused
-    """
+@torch.library.custom_op("forge_cute_py::_reduce_sum", mutates_args={"out"})
+def _reduce_sum(x: torch.Tensor, out: torch.Tensor, dim: int = -1) -> None:
+    """Sum a 2D CUDA tensor along ``dim`` with a CuTe DSL kernel.
+
+    Args:
+        x: Input tensor of shape (M, N); float16, bfloat16 or float32.
+        out: Output tensor (mutated in-place); shape (M,) for dim=1 or (N,)
+            for dim=0.
+        dim: Dimension to reduce over (-1, 0, or 1).
+
+    The compiled kernel is cached in ``_compile_cache`` per (dtype, dim).
+    """
     assert x.dim() == 2, "reduce_sum expects a 2D tensor"
     assert x.is_cuda, f"reduce_sum is CUDA-only, got device={x.device}"
     assert dim in (-1, 0, 1), f"reduce_sum expects dim in {{-1, 0, 1}}, got {dim}"
     assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], (
         f"Unsupported dtype: {x.dtype}"
     )
 
     # Normalize dim to positive index
     dim = dim if dim >= 0 else x.ndim + dim
 
-    # For now, use reference implementation
-    # Future: call kernel implementation based on variant when available
-    from forge_cute_py.ref import reduce_sum as reduce_sum_ref
-
-    result = reduce_sum_ref(x, dim=dim)
-    out.copy_(result)
-
-
-_reduce_sum.compile_cache = {}
-
-
-def reduce_sum(x: torch.Tensor, dim: int = -1, variant: str = "shfl") -> torch.Tensor:
-    """Row/column sum reduction.
-
-    Args:
-        x: Input tensor of shape (M, N)
-        dim: Dimension to reduce over (-1 for last dim, 0 or 1)
-        variant: Reduction variant (naive, improved, shfl) - currently unused
-
-    Returns:
-        Reduced tensor of shape (M,) if dim=1 or (N,) if dim=0
-
-    Examples:
-        >>> x = torch.randn(32, 128, device='cuda', dtype=torch.float16)
-        >>> y = reduce_sum(x, dim=-1)  # Sum over columns, result shape: (32,)
-        >>> y.shape
-        torch.Size([32])
-    """
-    # Normalize dim to positive index
+    # The assertions above guarantee both lookups below succeed: x.dtype is a
+    # key of dtype_map and dim is 0 or 1 after normalization.
+    dtype_map = {
+        torch.float16: Float16,
+        torch.float32: Float32,
+        torch.bfloat16: BFloat16,
+    }
+    cute_dtype = dtype_map[x.dtype]
+    compile_key = (cute_dtype, dim)
+
+    if compile_key not in _compile_cache:
+        # Compile once per (dtype, dim) against symbolic extents so the same
+        # compiled function is reused for every (M, N).
+        m = cute.sym_int()
+        n = cute.sym_int()
+        input_cute = cute.runtime.make_fake_compact_tensor(cute_dtype, (m, n), stride_order=(1, 0))
+
+        if dim == 1:  # Reduce last dim
+            output_cute = cute.runtime.make_fake_compact_tensor(cute_dtype, (m,))
+            kernel_class = ReduceSumLast(cute_dtype)
+        else:  # dim == 0
+            output_cute = cute.runtime.make_fake_compact_tensor(cute_dtype, (n,))
+            kernel_class = ReduceSumFirst(cute_dtype)
+
+        _compile_cache[compile_key] = cute.compile(
+            kernel_class,
+            input_cute,
+            output_cute,
+            # cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=False),
+            options="--enable-tvm-ffi",
+        )
+
+    # The kernel is compiled for a compact input layout, so materialize
+    # non-contiguous views (e.g. a transpose) first; this is a no-op for
+    # tensors that are already contiguous.
+    x = x.contiguous()
+    x_cute = from_dlpack(x, assumed_align=16)
+    out_cute = from_dlpack(out, assumed_align=16)
+    _compile_cache[compile_key](x_cute, out_cute)
+
+
+def reduce_sum(x: torch.Tensor, dim: int = -1, variant: str = "") -> torch.Tensor:
+    """Row/column sum reduction with the CuTe DSL kernel.
+
+    Args:
+        x: Input tensor of shape (M, N)
+        dim: Dimension to reduce over (-1 for last dim, 0 or 1)
+        variant: Accepted for backward compatibility; currently unused
+
+    Returns:
+        Reduced tensor of shape (M,) if dim=1 or (N,) if dim=0
+
+    Raises:
+        ValueError: If ``dim`` does not select one of the two dimensions.
+    """
     dim = dim if dim >= 0 else x.ndim + dim
 
     # Determine output shape
     if dim == 0:
         out_shape = (x.shape[1],)
     elif dim == 1:
         out_shape = (x.shape[0],)
     else:
         raise ValueError(f"Invalid dim={dim} for 2D tensor")
 
     out = torch.empty(out_shape, dtype=x.dtype, device=x.device)
-    _reduce_sum(x, out, dim, variant)
+    _reduce_sum(x, out, dim)
     return out
diff --git a/forge_cute_py/util/profile_launch.py b/forge_cute_py/util/profile_launch.py
new file mode 100644
index 0000000..0098364
--- /dev/null
+++ b/forge_cute_py/util/profile_launch.py
@@ -0,0 +1,32 @@
+import torch
+
+from forge_cute_py.ops.reduce_sum import reduce_sum
+
+def main():
+    """Profile ``reduce_sum`` against ``torch.sum`` on a 4096x4096 float32 tensor.
+
+    Both implementations are warmed up together, then each is run 100 times
+    inside its own NVTX range so they can be told apart in a profiler.
+    """
+    M, N = 4096, 4096
+    warmup_iters, profile_iters = 10, 100
+    x = torch.randn(M, N, device='cuda', dtype=torch.float32)
+
+    # Warmup
+    print("Warming up...")
+    for _ in range(warmup_iters):
+        reduce_sum(x, dim=-1)
+        x.sum(dim=-1)
+    torch.cuda.synchronize()
+    print("Warmup complete")
+
+    # Profile cute. The context manager pops the NVTX range even if a launch
+    # raises; the synchronize stays inside so the range covers the GPU work.
+    with torch.cuda.nvtx.range("cute_reduce_sum"):
+        for _ in range(profile_iters):
+            reduce_sum(x, dim=-1)
+        torch.cuda.synchronize()
+
+    # Profile torch
+    with torch.cuda.nvtx.range("torch_sum"):
+        for _ in range(profile_iters):
+            x.sum(dim=-1)
+        torch.cuda.synchronize()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/ncu_output/sum_reduce.txt b/ncu_output/sum_reduce.txt
new file mode 100644
index 0000000..2ec3fa6
--- /dev/null
+++ b/ncu_output/sum_reduce.txt
@@ -0,0 +1,411 @@
+ kernel_cutlass_reduce_sum_kernel_last_tensorptrf32gmemo4096409640961_tensorptrf32gmemo40961_4_0 (4096, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ----------- ------------
+    Metric Name             Metric Unit Metric Value
+    ----------------------- ----------- ------------
+    DRAM Frequency                  Ghz         8.99
+    SM Frequency                    Ghz         1.06
+    Elapsed Cycles                cycle       100174
+    Memory Throughput                 %        93.81
+    DRAM Throughput                   %        93.81
+    Duration                         us        94.11
+    L1/TEX Cache Throughput           %        15.18
+    L2 Cache Throughput               %        33.00
+    SM Active Cycles              cycle     97308.30
+    Compute (SM) Throughput           %         9.62
+    ----------------------- ----------- ------------
+
+    INF   This workload is utilizing greater than 80.0% of the available compute or memory performance of the device.
+          To further improve performance, work will likely need to be shifted from the most utilized to another unit.
+          Start by analyzing DRAM in the Memory Workload Analysis section.
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 64:1. The workload achieved
+          close to 1% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel
+          Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details
+          on roofline analysis.
+
+    Section: PM Sampling
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Maximum Buffer Size             Mbyte        50.33
+    Dropped Samples                sample            0
+    Maximum Sampling Interval          us            1
+    # Pass Groups                                    2
+    ------------------------- ----------- ------------
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.33
+    Executed Ipc Elapsed  inst/cycle         0.32
+    Issue Slots Busy               %         8.18
+    Issued Ipc Active     inst/cycle         0.33
+    SM Busy                        %         8.18
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.37%
+          All compute pipelines are under-utilized. Either this workload is very small or it doesn't issue enough warps
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.
+
+    Section: Memory Workload Analysis
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    Memory Throughput                Gbyte/s       809.32
+    Mem Busy                               %        17.07
+    Max Bandwidth                          %        93.81
+    L1/TEX Hit Rate                        %         0.02
+    L2 Compression Success Rate            %            0
+    L2 Compression Ratio                                0
+    L2 Compression Input Sectors      sector            0
+    L2 Hit Rate                            %         0.58
+    Mem Pipes Busy                         %         9.62
+    ---------------------------- ----------- ------------
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 25.74%
+          The memory access pattern for global stores to L2 might not be optimal. On average, only 4.0 of the 32 bytes
+          transmitted per sector are utilized by each thread. This applies to the 89.2% of sectors missed in L1TEX.
+          This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced
+          global stores.
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         8.20
+    Issued Warp Per Scheduler                        0.08
+    No Eligible                            %        91.80
+    Active Warps Per Scheduler          warp         9.36
+    Eligible Warps Per Scheduler        warp         0.10
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 6.188%
+          Every scheduler is capable of issuing one instruction per cycle, but for this workload each scheduler only
+          issues an instruction every 12.2 cycles. This might leave hardware resources underutilized and may lead to
+          less optimal performance. Out of the maximum of 12 warps per scheduler, this workload allocates an average
+          of 9.36 active warps per scheduler, but only an average of 0.10 warps were eligible per cycle. Eligible
+          warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no
+          eligible warp results in no instruction being issued and the issue slot remains unused. To increase the
+          number of eligible warps, avoid possible load imbalances due to highly different execution durations per
+          warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle       114.14
+    Warp Cycles Per Executed Instruction           cycle       114.69
+    Avg. Active Threads Per Warp                                31.89
+    Avg. Not Predicated Off Threads Per Warp                    31.29
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.188%
+          On average, each warp of this workload spends 106.6 cycles being stalled waiting for a scoreboard dependency
+          on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently
+          used data to shared memory. This stall type represents about 93.4% of the total average of 114.1 cycles
+          between issuing two instructions.
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on
+          sampling data. The Kernel Profiling Guide
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details
+          on each stall reason.
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      7917.97
+    Executed Instructions                           inst      4497408
+    Avg. Issued Instructions Per Scheduler          inst      7955.98
+    Issued Instructions                             inst      4518996
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.056%
+          This kernel executes 0 fused and 675840 non-fused FP32 instructions. By converting pairs of non-fused
+          instructions to their fused (https://docs.nvidia.com/cuda/floating-point/#cuda-and-floating-point),
+          higher-throughput equivalent, the achieved FP32 performance could be increased by up to 50% (relative to its
+          current performance). Check the Source page to identify where this kernel executes FP32 instructions.
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   4096
+    Registers Per Thread             register/thread              42
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block       Kbyte/block            1.02
+    Dynamic Shared Memory Per Block       byte/block             128
+    Static Shared Memory Per Block        byte/block               0
+    # SMs                                         SM             142
+    Stack Size                                                  1024
+    Threads                                   thread          524288
+    # TPCs                                                        71
+    Enabled TPC IDs                                              all
+    Uses Green Context                                             0
+    Waves Per SM                                                2.88
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 33.33%
+          A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the
+          target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical
+          occupancy of the kernel. This kernel launch results in 2 full waves and a partial wave of 1257 thread
+          blocks. Under the assumption of a uniform execution duration of all thread blocks, this partial wave may
+          account for up to 33.3% of the total runtime of this kernel. Try launching a grid with no partial wave. The
+          overall impact of this tail effect also lessens with the number of full waves executed for a grid. See the
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)
+          description for more details on launch configurations.
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           24
+    Block Limit Registers                 block           10
+    Block Limit Shared Mem                block           28
+    Block Limit Warps                     block           12
+    Theoretical Active Warps per SM        warp           40
+    Theoretical Occupancy                     %        83.33
+    Achieved Occupancy                        %        77.93
+    Achieved Active Warps Per SM           warp        37.41
+    ------------------------------- ----------- ------------
+
+    Section: GPU and Memory Workload Distribution
+    -------------------------- ----------- ------------
+    Metric Name                Metric Unit Metric Value
+    -------------------------- ----------- ------------
+    Average DRAM Active Cycles       cycle    793401.33
+    Total DRAM Elapsed Cycles        cycle     10148864
+    Average L1 Active Cycles         cycle     97308.30
+    Total L1 Elapsed Cycles          cycle     14224546
+    Average L2 Active Cycles         cycle    127385.48
+    Total L2 Elapsed Cycles          cycle      6356352
+    Average SM Active Cycles         cycle     97308.30
+    Total SM Elapsed Cycles          cycle     14224546
+    Average SMSP Active Cycles       cycle     97001.06
+    Total SMSP Elapsed Cycles        cycle     56898184
+    -------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.02
+    Branch Instructions              inst       106496
+    Branch Efficiency                   %          100
+    Avg. Divergent Branches                          0
+    ------------------------- ----------- ------------
+
+  kernel_cutlass_reduce_sum_kernel_last_tensorptrf32gmemo4096409640961_tensorptrf32gmemo40961_4_0 (4096, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ----------- ------------
+    Metric Name             Metric Unit Metric Value
+    ----------------------- ----------- ------------
+    DRAM Frequency                  Ghz         8.99
+    SM Frequency                    Ghz         1.06
+    Elapsed Cycles                cycle       100250
+    Memory Throughput                 %        93.71
+    DRAM Throughput                   %        93.71
+    Duration                         us        94.18
+    L1/TEX Cache Throughput           %        15.20
+    L2 Cache Throughput               %        33.26
+    SM Active Cycles              cycle     97148.04
+    Compute (SM) Throughput           %         9.61
+    ----------------------- ----------- ------------
+
+    INF   This workload is utilizing greater than 80.0% of the available compute or memory performance of the device.
+          To further improve performance, work will likely need to be shifted from the most utilized to another unit.
+          Start by analyzing DRAM in the Memory Workload Analysis section.
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 64:1. The workload achieved
+          close to 1% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel
+          Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details
+          on roofline analysis.
+
+    Section: PM Sampling
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Maximum Buffer Size             Mbyte        50.33
+    Dropped Samples                sample            0
+    Maximum Sampling Interval          us            1
+    # Pass Groups                                    2
+    ------------------------- ----------- ------------
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.33
+    Executed Ipc Elapsed  inst/cycle         0.32
+    Issue Slots Busy               %         8.19
+    Issued Ipc Active     inst/cycle         0.33
+    SM Busy                        %         8.19
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.36%
+          All compute pipelines are under-utilized. Either this workload is very small or it doesn't issue enough warps
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.
+
+    Section: Memory Workload Analysis
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    Memory Throughput                Gbyte/s       808.46
+    Mem Busy                               %        17.15
+    Max Bandwidth                          %        93.71
+    L1/TEX Hit Rate                        %         0.02
+    L2 Compression Success Rate            %            0
+    L2 Compression Ratio                                0
+    L2 Compression Input Sectors      sector            0
+    L2 Hit Rate                            %         0.58
+    Mem Pipes Busy                         %         9.61
+    ---------------------------- ----------- ------------
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 25.87%
+          The memory access pattern for global stores to L2 might not be optimal. On average, only 4.0 of the 32 bytes
+          transmitted per sector are utilized by each thread. This applies to the 88.9% of sectors missed in L1TEX.
+          This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced
+          global stores.
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         8.20
+    Issued Warp Per Scheduler                        0.08
+    No Eligible                            %        91.80
+    Active Warps Per Scheduler          warp         9.36
+    Eligible Warps Per Scheduler        warp         0.10
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 6.29%
+          Every scheduler is capable of issuing one instruction per cycle, but for this workload each scheduler only
+          issues an instruction every 12.2 cycles. This might leave hardware resources underutilized and may lead to
+          less optimal performance. Out of the maximum of 12 warps per scheduler, this workload allocates an average
+          of 9.36 active warps per scheduler, but only an average of 0.10 warps were eligible per cycle. Eligible
+          warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no
+          eligible warp results in no instruction being issued and the issue slot remains unused. To increase the
+          number of eligible warps, avoid possible load imbalances due to highly different execution durations per
+          warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle       114.07
+    Warp Cycles Per Executed Instruction           cycle       114.62
+    Avg. Active Threads Per Warp                                31.89
+    Avg. Not Predicated Off Threads Per Warp                    31.29
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.29%
+          On average, each warp of this workload spends 106.6 cycles being stalled waiting for a scoreboard dependency
+          on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently
+          used data to shared memory. This stall type represents about 93.5% of the total average of 114.1 cycles
+          between issuing two instructions.
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on
+          sampling data. The Kernel Profiling Guide
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details
+          on each stall reason.
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      7917.97
+    Executed Instructions                           inst      4497408
+    Avg. Issued Instructions Per Scheduler          inst      7956.46
+    Issued Instructions                             inst      4519271
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.06%
+          This kernel executes 0 fused and 675840 non-fused FP32 instructions. By converting pairs of non-fused
+          instructions to their fused (https://docs.nvidia.com/cuda/floating-point/#cuda-and-floating-point),
+          higher-throughput equivalent, the achieved FP32 performance could be increased by up to 50% (relative to its
+          current performance). Check the Source page to identify where this kernel executes FP32 instructions.
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   4096
+    Registers Per Thread             register/thread              42
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block       Kbyte/block            1.02
+    Dynamic Shared Memory Per Block       byte/block             128
+    Static Shared Memory Per Block        byte/block               0
+    # SMs                                         SM             142
+    Stack Size                                                  1024
+    Threads                                   thread          524288
+    # TPCs                                                        71
+    Enabled TPC IDs                                              all
+    Uses Green Context                                             0
+    Waves Per SM                                                2.88
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 33.33%
+          A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the
+          target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical
+          occupancy of the kernel. This kernel launch results in 2 full waves and a partial wave of 1257 thread
+          blocks. Under the assumption of a uniform execution duration of all thread blocks, this partial wave may
+          account for up to 33.3% of the total runtime of this kernel. Try launching a grid with no partial wave. The
+          overall impact of this tail effect also lessens with the number of full waves executed for a grid. See the
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)
+          description for more details on launch configurations.
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           24
+    Block Limit Registers                 block           10
+    Block Limit Shared Mem                block           28
+    Block Limit Warps                     block           12
+    Theoretical Active Warps per SM        warp           40
+    Theoretical Occupancy                     %        83.33
+    Achieved Occupancy                        %        78.04
+    Achieved Active Warps Per SM           warp        37.46
+    ------------------------------- ----------- ------------
+
+    Section: GPU and Memory Workload Distribution
+    -------------------------- ----------- ------------
+    Metric Name                Metric Unit Metric Value
+    -------------------------- ----------- ------------
+    Average DRAM Active Cycles       cycle    793101.33
+    Total DRAM Elapsed Cycles        cycle     10156032
+    Average L1 Active Cycles         cycle     97148.04
+    Total L1 Elapsed Cycles          cycle     14235170
+    Average L2 Active Cycles         cycle    126206.75
+    Total L2 Elapsed Cycles          cycle      6306576
+    Average SM Active Cycles         cycle     97148.04
+    Total SM Elapsed Cycles          cycle     14235170
+    Average SMSP Active Cycles       cycle     97011.88
+    Total SMSP Elapsed Cycles        cycle     56940680
+    -------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.02
+    Branch Instructions              inst       106496
+    Branch Efficiency                   %          100
+    Avg. Divergent Branches                          0
+    ------------------------- ----------- ------------
diff --git a/pyproject.toml b/pyproject.toml
index a44eeec..e2d98fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,12 +18,12 @@ dependencies = [
 
 [tool.uv.sources]
 torch = [
-  { index = "pytorch-cu130", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+  { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
 ]
 
 [[tool.uv.index]]
-name = "pytorch-cu130"
-url = "https://download.pytorch.org/whl/cu130"
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
 explicit = true