diff --git a/forge_cute_py/kernels/reduce_sum.py b/forge_cute_py/kernels/reduce_sum.py new file mode 100644 index 0000000..f02fc77 --- /dev/null +++ b/forge_cute_py/kernels/reduce_sum.py @@ -0,0 +1,164 @@ +from concurrent.futures import thread +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +os.environ['CUTLASS_CUDA_ARCH'] = '86' +os.environ['CUTE_DSL_ENABLE_TVM_FFI'] = '1' + +import math +import torch +import time +from cutlass.cute.runtime import from_dlpack +import cuda.bindings.driver as cuda +import cutlass.cute as cute +from cutlass import const_expr +import cutlass + +from cutlass import dsl_user_op +from cutlass.cute.arch import nvvm +from cutlass._mlir.dialects.nvvm import AtomicOpKind, MemOrderKind, MemScopeKind +from cutlass.base_dsl.typing import T + +_reduce_sum_last_cache = {} +_reduce_sum_first_cache = {} + + +# old just for future reference +@dsl_user_op +def atomicAddF32(dst_ptr: cute.Pointer, val: cute.Float32, loc=None, ip=None) -> cute.Float32: + return nvvm.atomicrmw( + T.f32(), + AtomicOpKind.FADD, + dst_ptr.llvm_ptr, + val.ir_value(loc=loc, ip=ip), + mem_order=MemOrderKind.RELAXED, + syncscope=MemScopeKind.SYS, + loc=loc, + ip=ip, + ) + + +class ReduceSumLast: + """Sum reduction along last dimension using CuTe DSL.""" + + def __init__(self, dtype: type): + self.dtype = dtype + self.num_warps = 8 + self.threads_per_block = self.num_warps * 32 + + @cute.jit + def __call__( + self, + input: cute.Tensor, + output: cute.Tensor, + # stream: cuda.CUstream = None, + ): + M, N = input.shape + tiler_mn = (1, self.threads_per_block * 4) + gX = cute.zipped_divide(input, tiler_mn) + + thr_layout = cute.make_layout((self.threads_per_block,), stride=(1,)) + val_layout = cute.make_layout((4,), stride=(1,)) + _, tv_layout = cute.make_layout_tv(thr_layout, val_layout) + + self.kernel(gX, output, tv_layout).launch( + grid=(M, 1, 1), + block=(self.threads_per_block, 1, 1), + # stream=stream, + ) + + @cute.kernel + def kernel(self, input: cute.Tensor, output: cute.Tensor, tv_layout): + num_warps = const_expr(self.num_warps) + + smem_alloc = cutlass.utils.SmemAllocator() + shmem = smem_alloc.allocate_tensor(cute.Float32, cute.make_layout((32,))) + + tidx, _, _ = cute.arch.thread_idx() + bidx, _, _ = cute.arch.block_idx() + lane = cute.arch.lane_idx() + warp = cute.arch.warp_idx() + + # op = cute.nvgpu.CopyUniversalOp() + # atom = cute.make_copy_atom(op, cute.Float32, num_bits_per_copy=128) + acc = cute.Float32(0.0) + + _, mode1 = input.shape + ntiles = mode1[1] + + for tile_idx in range(ntiles): + blk_coord = ((0, None), (bidx, tile_idx)) + cta_tile = input[blk_coord] + thr_frag = cute.composition(cta_tile, tv_layout) + thr_src = thr_frag[(tidx, None)] + # r = cute.make_rmem_tensor(cute.make_layout((4,), stride=(1,)), cute.Float32) + # cute.copy(atom, thr_src, r) + # acc += r[0] + r[1] + r[2] + r[3] + acc += thr_src[0] + thr_src[1] + thr_src[2] + thr_src[3] + + + acc = cute.arch.warp_reduction_sum(acc) + if lane == 0: + shmem[warp] = acc + cute.arch.sync_threads() + + if warp == 0: + acc2 = shmem[lane] if lane < num_warps else 0.0 + acc2 = cute.arch.warp_reduction_sum(acc2) + if lane == 0: + output[bidx] = acc2 + + +class ReduceSumFirst: + """Sum reduction along first dimension using CuTe DSL.""" + + def __init__(self, dtype: type): + self.dtype = dtype + self.num_warps = 4 + self.threads_per_block = self.num_warps * 32 + + @cute.jit + def __call__( + self, + input: cute.Tensor, + output: cute.Tensor, + # stream: cuda.CUstream = None, + ): + M, N = input.shape + yblocks = cute.ceil_div(N, 4) + self.kernel(input, output, self.threads_per_block // 4).launch( + grid=(yblocks, 1, 1), + block=(self.threads_per_block, 1, 1), + # stream=stream, + ) + + @cute.kernel + def kernel(self, input: cute.Tensor, output: cute.Tensor, stride: int): + smem_alloc = cutlass.utils.SmemAllocator() + smem_layout = cute.make_layout((4, 32)) + shmem = smem_alloc.allocate_tensor(cute.Float32, smem_layout) + + M, N = input.shape + tidx, _, _ = cute.arch.thread_idx() + bidx, _, _ = cute.arch.block_idx() + lane_idx = cute.arch.lane_idx() + warp_idx = cute.arch.warp_idx() + + max_iters = cute.ceil_div(M, stride) + col_offset = tidx % 4 + row_offset = tidx // 4 + col = 4 * bidx + col_offset + acc = cute.Float32(0) + + row = row_offset + for _ in range(max_iters): + if row < M and col < N: + acc = acc + input[row, col] + row = row + 32 + + shmem[col_offset, row_offset] = acc + cute.arch.sync_threads() + acc = shmem[warp_idx, lane_idx] + + acc = cute.arch.warp_reduction_sum(acc) + if lane_idx == 0: + output[bidx * 4 + warp_idx] = acc diff --git a/forge_cute_py/ops/reduce_sum.py b/forge_cute_py/ops/reduce_sum.py index d501eca..e7252c9 100644 --- a/forge_cute_py/ops/reduce_sum.py +++ b/forge_cute_py/ops/reduce_sum.py @@ -1,65 +1,58 @@ +import cutlass.cute as cute import torch +from cutlass import BFloat16, Float16, Float32 +from cutlass.cute.runtime import from_dlpack +from forge_cute_py.kernels.reduce_sum import ReduceSumLast, ReduceSumFirst + +_compile_cache = {} -@torch.library.custom_op("forge_cute_py::_reduce_sum", mutates_args={"out"}) -def _reduce_sum(x: torch.Tensor, out: torch.Tensor, dim: int = -1, variant: str = "shfl") -> None: - """Row/column sum reduction (reference implementation stub). - Args: - x: Input tensor of shape (M, N) - out: Output tensor (mutated in-place) - dim: Dimension to reduce over (-1, 0, or 1) - variant: Reduction variant (naive, improved, shfl) - currently unused - """ +@torch.library.custom_op("forge_cute_py::_reduce_sum", mutates_args={"out"}) +def _reduce_sum(x: torch.Tensor, out: torch.Tensor, dim: int = -1) -> None: + """Sum reduction using CuTe DSL.""" assert x.dim() == 2, "reduce_sum expects a 2D tensor" assert x.is_cuda, f"reduce_sum is CUDA-only, got device={x.device}" - assert dim in (-1, 0, 1), f"reduce_sum expects dim in {{-1, 0, 1}}, got {dim}" - assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], ( - f"Unsupported dtype: {x.dtype}" - ) - # Normalize dim to positive index dim = dim if dim >= 0 else x.ndim + dim - # For now, use reference implementation - # Future: call kernel implementation based on variant when available - from forge_cute_py.ref import reduce_sum as reduce_sum_ref - - result = reduce_sum_ref(x, dim=dim) - out.copy_(result) - - -_reduce_sum.compile_cache = {} - - -def reduce_sum(x: torch.Tensor, dim: int = -1, variant: str = "shfl") -> torch.Tensor: - """Row/column sum reduction. - - Args: - x: Input tensor of shape (M, N) - dim: Dimension to reduce over (-1 for last dim, 0 or 1) - variant: Reduction variant (naive, improved, shfl) - currently unused - - Returns: - Reduced tensor of shape (M,) if dim=1 or (N,) if dim=0 - - Examples: - >>> x = torch.randn(32, 128, device='cuda', dtype=torch.float16) - >>> y = reduce_sum(x, dim=-1) # Sum over columns, result shape: (32,) - >>> y.shape - torch.Size([32]) - """ - # Normalize dim to positive index + dtype_map = { + torch.float16: Float16, + torch.float32: Float32, + torch.bfloat16: BFloat16, + } + cute_dtype = dtype_map[x.dtype] + compile_key = (cute_dtype, dim) + + if compile_key not in _compile_cache: + m = cute.sym_int() + n = cute.sym_int() + input_cute = cute.runtime.make_fake_compact_tensor(cute_dtype, (m, n), stride_order=(1, 0)) + + if dim == 1: # Reduce last dim + output_cute = cute.runtime.make_fake_compact_tensor(cute_dtype, (m,)) + kernel_class = ReduceSumLast(cute_dtype) + else: # dim == 0 + output_cute = cute.runtime.make_fake_compact_tensor(cute_dtype, (n,)) + kernel_class = ReduceSumFirst(cute_dtype) + + _compile_cache[compile_key] = cute.compile( + kernel_class, + input_cute, + output_cute, + # cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=False), + options="--enable-tvm-ffi", + ) + + x_cute = from_dlpack(x, assumed_align=16) + out_cute = from_dlpack(out, assumed_align=16) + _compile_cache[compile_key](x_cute, out_cute) + + +def reduce_sum(x: torch.Tensor, dim: int = -1, variant='') -> torch.Tensor: + """Sum reduction with CuTe DSL kernel.""" dim = dim if dim >= 0 else x.ndim + dim - - # Determine output shape - if dim == 0: - out_shape = (x.shape[1],) - elif dim == 1: - out_shape = (x.shape[0],) - else: - raise ValueError(f"Invalid dim={dim} for 2D tensor") - + out_shape = (x.shape[1],) if dim == 0 else (x.shape[0],) out = torch.empty(out_shape, dtype=x.dtype, device=x.device) - _reduce_sum(x, out, dim, variant) + _reduce_sum(x, out, dim) return out diff --git a/forge_cute_py/util/profile_launch.py b/forge_cute_py/util/profile_launch.py new file mode 100644 index 0000000..0098364 --- /dev/null +++ b/forge_cute_py/util/profile_launch.py @@ -0,0 +1,32 @@ +import torch + +from forge_cute_py.ops.reduce_sum import reduce_sum + +def main(): + M, N = 4096, 4096 + x = torch.randn(M, N, device='cuda', dtype=torch.float32) + + # Warmup + print("Warming up...") + for _ in range(10): + _ = reduce_sum(x, dim=-1) + _ = x.sum(dim=-1) + torch.cuda.synchronize() + print("Warmup complete") + + # Profile cute + torch.cuda.nvtx.range_push("cute_reduce_sum") + for _ in range(100): + _ = reduce_sum(x, dim=-1) + torch.cuda.synchronize() + torch.cuda.nvtx.range_pop() + + # Profile torch + torch.cuda.nvtx.range_push("torch_sum") + for _ in range(100): + _ = x.sum(dim=-1) + torch.cuda.synchronize() + torch.cuda.nvtx.range_pop() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ncu_output/sum_reduce.txt b/ncu_output/sum_reduce.txt new file mode 100644 index 0000000..2ec3fa6 --- /dev/null +++ b/ncu_output/sum_reduce.txt @@ -0,0 +1,411 @@ + kernel_cutlass_reduce_sum_kernel_last_tensorptrf32gmemo4096409640961_tensorptrf32gmemo40961_4_0 (4096, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9 + Section: GPU Speed Of Light Throughput + ----------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ----------------------- ----------- ------------ + DRAM Frequency Ghz 8.99 + SM Frequency Ghz 1.06 + Elapsed Cycles cycle 100174 + Memory Throughput % 93.81 + DRAM Throughput % 93.81 + Duration us 94.11 + L1/TEX Cache Throughput % 15.18 + L2 Cache Throughput % 33.00 + SM Active Cycles cycle 97308.30 + Compute (SM) Throughput % 9.62 + ----------------------- ----------- ------------ + + INF This workload is utilizing greater than 80.0% of the available compute or memory performance of the device. + To further improve performance, work will likely need to be shifted from the most utilized to another unit. + Start by analyzing DRAM in the Memory Workload Analysis section. + + Section: GPU Speed Of Light Roofline Chart + INF The ratio of peak float (fp32) to double (fp64) performance on this device is 64:1. The workload achieved + close to 1% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel + Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details + on roofline analysis. + + Section: PM Sampling + ------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ------------------------- ----------- ------------ + Maximum Buffer Size Mbyte 50.33 + Dropped Samples sample 0 + Maximum Sampling Interval us 1 + # Pass Groups 2 + ------------------------- ----------- ------------ + + Section: Compute Workload Analysis + -------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + -------------------- ----------- ------------ + Executed Ipc Active inst/cycle 0.33 + Executed Ipc Elapsed inst/cycle 0.32 + Issue Slots Busy % 8.18 + Issued Ipc Active inst/cycle 0.33 + SM Busy % 8.18 + -------------------- ----------- ------------ + + OPT Est. Local Speedup: 93.37% + All compute pipelines are under-utilized. Either this workload is very small or it doesn't issue enough warps + per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. + + Section: Memory Workload Analysis + ---------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------- ----------- ------------ + Memory Throughput Gbyte/s 809.32 + Mem Busy % 17.07 + Max Bandwidth % 93.81 + L1/TEX Hit Rate % 0.02 + L2 Compression Success Rate % 0 + L2 Compression Ratio 0 + L2 Compression Input Sectors sector 0 + L2 Hit Rate % 0.58 + Mem Pipes Busy % 9.62 + ---------------------------- ----------- ------------ + + Section: Memory Workload Analysis Tables + OPT Est. Speedup: 25.74% + The memory access pattern for global stores to L2 might not be optimal. On average, only 4.0 of the 32 bytes + transmitted per sector are utilized by each thread. This applies to the 89.2% of sectors missed in L1TEX. + This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced + global stores. + + Section: Scheduler Statistics + ---------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------- ----------- ------------ + One or More Eligible % 8.20 + Issued Warp Per Scheduler 0.08 + No Eligible % 91.80 + Active Warps Per Scheduler warp 9.36 + Eligible Warps Per Scheduler warp 0.10 + ---------------------------- ----------- ------------ + + OPT Est. Local Speedup: 6.188% + Every scheduler is capable of issuing one instruction per cycle, but for this workload each scheduler only + issues an instruction every 12.2 cycles. This might leave hardware resources underutilized and may lead to + less optimal performance. Out of the maximum of 12 warps per scheduler, this workload allocates an average + of 9.36 active warps per scheduler, but only an average of 0.10 warps were eligible per cycle. Eligible + warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no + eligible warp results in no instruction being issued and the issue slot remains unused. To increase the + number of eligible warps, avoid possible load imbalances due to highly different execution durations per + warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too. + + Section: Warp State Statistics + ---------------------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------------------- ----------- ------------ + Warp Cycles Per Issued Instruction cycle 114.14 + Warp Cycles Per Executed Instruction cycle 114.69 + Avg. Active Threads Per Warp 31.89 + Avg. Not Predicated Off Threads Per Warp 31.29 + ---------------------------------------- ----------- ------------ + + OPT Est. Speedup: 6.188% + On average, each warp of this workload spends 106.6 cycles being stalled waiting for a scoreboard dependency + on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited + upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the + memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by + increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently + used data to shared memory. This stall type represents about 93.4% of the total average of 114.1 cycles + between issuing two instructions. + ----- -------------------------------------------------------------------------------------------------------------- + INF Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on + sampling data. The Kernel Profiling Guide + (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details + on each stall reason. + + Section: Instruction Statistics + ---------------------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------------------- ----------- ------------ + Avg. Executed Instructions Per Scheduler inst 7917.97 + Executed Instructions inst 4497408 + Avg. Issued Instructions Per Scheduler inst 7955.98 + Issued Instructions inst 4518996 + ---------------------------------------- ----------- ------------ + + OPT Est. Speedup: 2.056% + This kernel executes 0 fused and 675840 non-fused FP32 instructions. By converting pairs of non-fused + instructions to their fused (https://docs.nvidia.com/cuda/floating-point/#cuda-and-floating-point), + higher-throughput equivalent, the achieved FP32 performance could be increased by up to 50% (relative to its + current performance). Check the Source page to identify where this kernel executes FP32 instructions. + + Section: Launch Statistics + -------------------------------- --------------- --------------- + Metric Name Metric Unit Metric Value + -------------------------------- --------------- --------------- + Block Size 128 + Function Cache Configuration CachePreferNone + Grid Size 4096 + Registers Per Thread register/thread 42 + Shared Memory Configuration Size Kbyte 32.77 + Driver Shared Memory Per Block Kbyte/block 1.02 + Dynamic Shared Memory Per Block byte/block 128 + Static Shared Memory Per Block byte/block 0 + # SMs SM 142 + Stack Size 1024 + Threads thread 524288 + # TPCs 71 + Enabled TPC IDs all + Uses Green Context 0 + Waves Per SM 2.88 + -------------------------------- --------------- --------------- + + OPT Est. Speedup: 33.33% + A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the + target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical + occupancy of the kernel. This kernel launch results in 2 full waves and a partial wave of 1257 thread + blocks. Under the assumption of a uniform execution duration of all thread blocks, this partial wave may + account for up to 33.3% of the total runtime of this kernel. Try launching a grid with no partial wave. The + overall impact of this tail effect also lessens with the number of full waves executed for a grid. See the + Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) + description for more details on launch configurations. + + Section: Occupancy + ------------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ------------------------------- ----------- ------------ + Block Limit SM block 24 + Block Limit Registers block 10 + Block Limit Shared Mem block 28 + Block Limit Warps block 12 + Theoretical Active Warps per SM warp 40 + Theoretical Occupancy % 83.33 + Achieved Occupancy % 77.93 + Achieved Active Warps Per SM warp 37.41 + ------------------------------- ----------- ------------ + + Section: GPU and Memory Workload Distribution + -------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + -------------------------- ----------- ------------ + Average DRAM Active Cycles cycle 793401.33 + Total DRAM Elapsed Cycles cycle 10148864 + Average L1 Active Cycles cycle 97308.30 + Total L1 Elapsed Cycles cycle 14224546 + Average L2 Active Cycles cycle 127385.48 + Total L2 Elapsed Cycles cycle 6356352 + Average SM Active Cycles cycle 97308.30 + Total SM Elapsed Cycles cycle 14224546 + Average SMSP Active Cycles cycle 97001.06 + Total SMSP Elapsed Cycles cycle 56898184 + -------------------------- ----------- ------------ + + Section: Source Counters + ------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ------------------------- ----------- ------------ + Branch Instructions Ratio % 0.02 + Branch Instructions inst 106496 + Branch Efficiency % 100 + Avg. Divergent Branches 0 + ------------------------- ----------- ------------ + + kernel_cutlass_reduce_sum_kernel_last_tensorptrf32gmemo4096409640961_tensorptrf32gmemo40961_4_0 (4096, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9 + Section: GPU Speed Of Light Throughput + ----------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ----------------------- ----------- ------------ + DRAM Frequency Ghz 8.99 + SM Frequency Ghz 1.06 + Elapsed Cycles cycle 100250 + Memory Throughput % 93.71 + DRAM Throughput % 93.71 + Duration us 94.18 + L1/TEX Cache Throughput % 15.20 + L2 Cache Throughput % 33.26 + SM Active Cycles cycle 97148.04 + Compute (SM) Throughput % 9.61 + ----------------------- ----------- ------------ + + INF This workload is utilizing greater than 80.0% of the available compute or memory performance of the device. + To further improve performance, work will likely need to be shifted from the most utilized to another unit. + Start by analyzing DRAM in the Memory Workload Analysis section. + + Section: GPU Speed Of Light Roofline Chart + INF The ratio of peak float (fp32) to double (fp64) performance on this device is 64:1. The workload achieved + close to 1% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel + Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details + on roofline analysis. + + Section: PM Sampling + ------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ------------------------- ----------- ------------ + Maximum Buffer Size Mbyte 50.33 + Dropped Samples sample 0 + Maximum Sampling Interval us 1 + # Pass Groups 2 + ------------------------- ----------- ------------ + + Section: Compute Workload Analysis + -------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + -------------------- ----------- ------------ + Executed Ipc Active inst/cycle 0.33 + Executed Ipc Elapsed inst/cycle 0.32 + Issue Slots Busy % 8.19 + Issued Ipc Active inst/cycle 0.33 + SM Busy % 8.19 + -------------------- ----------- ------------ + + OPT Est. Local Speedup: 93.36% + All compute pipelines are under-utilized. Either this workload is very small or it doesn't issue enough warps + per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. + + Section: Memory Workload Analysis + ---------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------- ----------- ------------ + Memory Throughput Gbyte/s 808.46 + Mem Busy % 17.15 + Max Bandwidth % 93.71 + L1/TEX Hit Rate % 0.02 + L2 Compression Success Rate % 0 + L2 Compression Ratio 0 + L2 Compression Input Sectors sector 0 + L2 Hit Rate % 0.58 + Mem Pipes Busy % 9.61 + ---------------------------- ----------- ------------ + + Section: Memory Workload Analysis Tables + OPT Est. Speedup: 25.87% + The memory access pattern for global stores to L2 might not be optimal. On average, only 4.0 of the 32 bytes + transmitted per sector are utilized by each thread. This applies to the 88.9% of sectors missed in L1TEX. + This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced + global stores. + + Section: Scheduler Statistics + ---------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------- ----------- ------------ + One or More Eligible % 8.20 + Issued Warp Per Scheduler 0.08 + No Eligible % 91.80 + Active Warps Per Scheduler warp 9.36 + Eligible Warps Per Scheduler warp 0.10 + ---------------------------- ----------- ------------ + + OPT Est. Local Speedup: 6.29% + Every scheduler is capable of issuing one instruction per cycle, but for this workload each scheduler only + issues an instruction every 12.2 cycles. This might leave hardware resources underutilized and may lead to + less optimal performance. Out of the maximum of 12 warps per scheduler, this workload allocates an average + of 9.36 active warps per scheduler, but only an average of 0.10 warps were eligible per cycle. Eligible + warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no + eligible warp results in no instruction being issued and the issue slot remains unused. To increase the + number of eligible warps, avoid possible load imbalances due to highly different execution durations per + warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too. + + Section: Warp State Statistics + ---------------------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------------------- ----------- ------------ + Warp Cycles Per Issued Instruction cycle 114.07 + Warp Cycles Per Executed Instruction cycle 114.62 + Avg. Active Threads Per Warp 31.89 + Avg. Not Predicated Off Threads Per Warp 31.29 + ---------------------------------------- ----------- ------------ + + OPT Est. Speedup: 6.29% + On average, each warp of this workload spends 106.6 cycles being stalled waiting for a scoreboard dependency + on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited + upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the + memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by + increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently + used data to shared memory. This stall type represents about 93.5% of the total average of 114.1 cycles + between issuing two instructions. + ----- -------------------------------------------------------------------------------------------------------------- + INF Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on + sampling data. The Kernel Profiling Guide + (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details + on each stall reason. + + Section: Instruction Statistics + ---------------------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ---------------------------------------- ----------- ------------ + Avg. Executed Instructions Per Scheduler inst 7917.97 + Executed Instructions inst 4497408 + Avg. Issued Instructions Per Scheduler inst 7956.46 + Issued Instructions inst 4519271 + ---------------------------------------- ----------- ------------ + + OPT Est. Speedup: 2.06% + This kernel executes 0 fused and 675840 non-fused FP32 instructions. By converting pairs of non-fused + instructions to their fused (https://docs.nvidia.com/cuda/floating-point/#cuda-and-floating-point), + higher-throughput equivalent, the achieved FP32 performance could be increased by up to 50% (relative to its + current performance). Check the Source page to identify where this kernel executes FP32 instructions. + + Section: Launch Statistics + -------------------------------- --------------- --------------- + Metric Name Metric Unit Metric Value + -------------------------------- --------------- --------------- + Block Size 128 + Function Cache Configuration CachePreferNone + Grid Size 4096 + Registers Per Thread register/thread 42 + Shared Memory Configuration Size Kbyte 32.77 + Driver Shared Memory Per Block Kbyte/block 1.02 + Dynamic Shared Memory Per Block byte/block 128 + Static Shared Memory Per Block byte/block 0 + # SMs SM 142 + Stack Size 1024 + Threads thread 524288 + # TPCs 71 + Enabled TPC IDs all + Uses Green Context 0 + Waves Per SM 2.88 + -------------------------------- --------------- --------------- + + OPT Est. Speedup: 33.33% + A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the + target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical + occupancy of the kernel. This kernel launch results in 2 full waves and a partial wave of 1257 thread + blocks. Under the assumption of a uniform execution duration of all thread blocks, this partial wave may + account for up to 33.3% of the total runtime of this kernel. Try launching a grid with no partial wave. The + overall impact of this tail effect also lessens with the number of full waves executed for a grid. See the + Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) + description for more details on launch configurations. + + Section: Occupancy + ------------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ------------------------------- ----------- ------------ + Block Limit SM block 24 + Block Limit Registers block 10 + Block Limit Shared Mem block 28 + Block Limit Warps block 12 + Theoretical Active Warps per SM warp 40 + Theoretical Occupancy % 83.33 + Achieved Occupancy % 78.04 + Achieved Active Warps Per SM warp 37.46 + ------------------------------- ----------- ------------ + + Section: GPU and Memory Workload Distribution + -------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + -------------------------- ----------- ------------ + Average DRAM Active Cycles cycle 793101.33 + Total DRAM Elapsed Cycles cycle 10156032 + Average L1 Active Cycles cycle 97148.04 + Total L1 Elapsed Cycles cycle 14235170 + Average L2 Active Cycles cycle 126206.75 + Total L2 Elapsed Cycles cycle 6306576 + Average SM Active Cycles cycle 97148.04 + Total SM Elapsed Cycles cycle 14235170 + Average SMSP Active Cycles cycle 97011.88 + Total SMSP Elapsed Cycles cycle 56940680 + -------------------------- ----------- ------------ + + Section: Source Counters + ------------------------- ----------- ------------ + Metric Name Metric Unit Metric Value + ------------------------- ----------- ------------ + Branch Instructions Ratio % 0.02 + Branch Instructions inst 106496 + Branch Efficiency % 100 + Avg. Divergent Branches 0 + ------------------------- ----------- ------------ diff --git a/pyproject.toml b/pyproject.toml index a44eeec..e2d98fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,12 +18,12 @@ dependencies = [ [tool.uv.sources] torch = [ - { index = "pytorch-cu130", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] [[tool.uv.index]] -name = "pytorch-cu130" -url = "https://download.pytorch.org/whl/cu130" +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" explicit = true