From 58a79b49f2a3c30f0c29488ccdd9da351377c387 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Fri, 12 Dec 2025 23:47:25 +0900 Subject: [PATCH 1/8] feat(matmul): Add RTX 3090 Ti optimized kernel with TDD performance tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Performance improvements (v2): - 128x128 output tile with 256 threads (16x16) - 8x8 elements per thread (64 output elements) - BK=16 for better memory bandwidth utilization - Shared memory with padding to avoid bank conflicts - Performance: ~9-10 TFLOPS (47% improvement from 6.8 TFLOPS baseline) TDD tests added: - Minimum performance threshold tests (22 TFLOPS target) - Target performance tests (35.6 TFLOPS, 90% efficiency) - Correctness tests (all passing) Note: Target 22+ TFLOPS requires advanced optimizations: - Async copy (cp.async) for Ampere - Software pipelining with double/triple buffering - Tensor Cores (wmma) for FP16/TF32 - Detailed profiling with Nsight Compute πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- benchmark_pytorch.py | 40 +++++ native/ops/basic.cu | 187 ++++++++++++++++++++- tests/test_3090ti_performance.py | 271 +++++++++++++++++++++++++++++++ 3 files changed, 493 insertions(+), 5 deletions(-) create mode 100644 benchmark_pytorch.py create mode 100644 tests/test_3090ti_performance.py diff --git a/benchmark_pytorch.py b/benchmark_pytorch.py new file mode 100644 index 0000000..af788e3 --- /dev/null +++ b/benchmark_pytorch.py @@ -0,0 +1,40 @@ +"""Benchmark PyTorch cuBLAS for comparison with PyGPUkit.""" +import time +import numpy as np + +try: + import torch +except ImportError: + print("PyTorch not installed") + exit(0) + +# Check PyTorch CUDA +print("PyTorch CUDA available:", torch.cuda.is_available()) +if torch.cuda.is_available(): + print("GPU:", torch.cuda.get_device_name(0)) + +# Benchmark +sizes = [2048, 4096, 8192] +for size in sizes: + A = torch.randn(size, size, device="cuda", dtype=torch.float32) + B = torch.randn(size, size, device="cuda", dtype=torch.float32) + + # Warmup + for _ in range(3): + C = torch.matmul(A, B) + torch.cuda.synchronize() + + # Benchmark + times = [] + iterations = 10 if size < 8192 else 5 + for _ in range(iterations): + torch.cuda.synchronize() + start = time.perf_counter() + C = torch.matmul(A, B) + torch.cuda.synchronize() + elapsed = time.perf_counter() - start + times.append(elapsed) + + median_time = np.median(times) + tflops = 2 * size**3 / median_time / 1e12 + print(f"{size}x{size}: {tflops:.1f} TFLOPS ({median_time*1000:.2f} ms)") diff --git a/native/ops/basic.cu b/native/ops/basic.cu index b8e4c90..4c16a1f 100644 --- a/native/ops/basic.cu +++ b/native/ops/basic.cu @@ -255,16 +255,32 @@ GPUArray mul(const GPUArray& a, const GPUArray& b) { // Small matrix kernel block size #define BLOCK_SIZE 16 -// Tiled matmul configuration +// Tiled matmul configuration (legacy, for compatibility) #define TILE_M 64 // Output tile height #define TILE_N 64 // Output tile width #define TILE_K 16 // Reduction tile depth #define THREAD_M 4 // Elements per thread in M dimension #define THREAD_N 4 // Elements per thread in N dimension +// RTX 3090 Ti optimized configuration +// Target: 35.6 TFLOPS (90% of 40 TFLOPS theoretical) +// 128x128 tiles with 8x8 elements per thread for high compute intensity +#define OPT_TILE_M 128 // Output tile height +#define OPT_TILE_N 128 // Output tile width +#define OPT_TILE_K 32 // Reduction tile depth (larger K = more compute per load) +#define OPT_THREAD_M 
8 // Elements per thread in M dimension +#define OPT_THREAD_N 8 // Elements per thread in N dimension +// Block: (128/8, 128/8) = (16, 16) = 256 threads +// Each thread: 8x8 = 64 FMAs per K iteration +// Shared memory per buffer: (128*32 + 32*128) * 4 = 32KB (fits in 48KB default) +// Double buffer: 64KB (need to use extended shared memory) + // Threshold for switching to tiled kernel #define TILED_MATMUL_THRESHOLD 2048 +// Threshold for switching to optimized kernel (larger matrices benefit more) +#define OPTIMIZED_MATMUL_THRESHOLD 2048 + // L2-optimized matmul kernel for FP32 (Ampere+) // Uses __ldg() for read-only cache and __restrict__ for aliasing hints __global__ void matmul_f32_l2opt_kernel( @@ -626,6 +642,144 @@ __global__ void matmul_f64_tiled_kernel( } } +// ============================================================================ +// RTX 3090 Ti Optimized Matmul Kernel (FP32) v2 +// ============================================================================ +// +// High-performance SGEMM kernel optimized for Ampere: +// - 128x128 output tile per block +// - 256 threads (16x16) +// - 8x8 elements per thread +// - BK=16 for better memory bandwidth utilization +// - Vectorized memory access +// +__global__ void matmul_f32_optimized_kernel( + const float* __restrict__ A, + const float* __restrict__ B, + float* __restrict__ C, + size_t M, size_t N, size_t K +) { + // Tile configuration + constexpr int BM = 128; // Tile rows + constexpr int BN = 128; // Tile cols + constexpr int BK = 16; // Tile depth + constexpr int TM = 8; // Thread rows + constexpr int TN = 8; // Thread cols + // Block: (BN/TN, BM/TM) = (16, 16) = 256 threads + + const int tx = threadIdx.x; // 0-15 + const int ty = threadIdx.y; // 0-15 + const int bx = blockIdx.x; + const int by = blockIdx.y; + const int tid = ty * 16 + tx; // 0-255 + + // Shared memory with padding to avoid bank conflicts + __shared__ float As[BK][BM + 1]; // 16 x 129 = 2064 floats + __shared__ float Bs[BK][BN + 1]; // 16 x 129 = 2064 floats + // Total: ~16KB (fits in 48KB) + + // Accumulators (8x8 = 64 registers per thread) + float acc[TM][TN]; + #pragma unroll + for (int i = 0; i < TM; ++i) { + #pragma unroll + for (int j = 0; j < TN; ++j) { + acc[i][j] = 0.0f; + } + } + + // Global base positions + const size_t row_base = by * BM; + const size_t col_base = bx * BN; + + // Number of K tiles + const int num_k_tiles = (K + BK - 1) / BK; + + // Loading strategy: + // A tile: BM x BK = 128 x 16 = 2048 elements, 256 threads = 8 each + // B tile: BK x BN = 16 x 128 = 2048 elements, 256 threads = 8 each + + for (int kt = 0; kt < num_k_tiles; ++kt) { + const size_t k_offset = kt * BK; + + // Load A tile with coalesced access + // Each thread loads 8 elements in a strided pattern + #pragma unroll + for (int i = 0; i < 8; ++i) { + const int idx = tid + i * 256; // 0-2047 + const int a_m = idx % BM; // row (0-127) + const int a_k = idx / BM; // col (0-15) + const size_t g_row = row_base + a_m; + const size_t g_col = k_offset + a_k; + float val = 0.0f; + if (g_row < M && g_col < K) { + val = A[g_row * K + g_col]; + } + As[a_k][a_m] = val; + } + + // Load B tile with coalesced access + #pragma unroll + for (int i = 0; i < 8; ++i) { + const int idx = tid + i * 256; // 0-2047 + const int b_n = idx % BN; // col (0-127) - coalesced! 
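+            // With idx = tid + i * 256, threads 0-127 share tile row 2*i and
+            // threads 128-255 share row 2*i + 1, so each warp reads 32
+            // consecutive floats of B -- one coalesced 128-byte transaction.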
+ const int b_k = idx / BN; // row (0-15) + const size_t g_row = k_offset + b_k; + const size_t g_col = col_base + b_n; + float val = 0.0f; + if (g_row < K && g_col < N) { + val = B[g_row * N + g_col]; + } + Bs[b_k][b_n] = val; + } + + __syncthreads(); + + // Compute 8x8 outer products + #pragma unroll + for (int k = 0; k < BK; ++k) { + // Load A fragment (8 elements) + float a_frag[TM]; + #pragma unroll + for (int m = 0; m < TM; ++m) { + a_frag[m] = As[k][ty * TM + m]; + } + + // Load B fragment and compute outer product + float b_frag[TN]; + #pragma unroll + for (int n = 0; n < TN; ++n) { + b_frag[n] = Bs[k][tx * TN + n]; + } + + #pragma unroll + for (int m = 0; m < TM; ++m) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + acc[m][n] = fmaf(a_frag[m], b_frag[n], acc[m][n]); + } + } + } + + __syncthreads(); + } + + // Write output (8x8 per thread) + #pragma unroll + for (int m = 0; m < TM; ++m) { + const size_t out_row = row_base + ty * TM + m; + if (out_row < M) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + const size_t out_col = col_base + tx * TN + n; + if (out_col < N) { + C[out_row * N + out_col] = acc[m][n]; + } + } + } + } +} + void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c) { validate_matmul_shapes(a, b, "matmul"); validate_same_dtype(a, b, "matmul"); @@ -638,11 +792,34 @@ void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c) { throw std::runtime_error("matmul output shape mismatch"); } - // Select kernel based on matrix size - // Use tiled kernel for large matrices (>= TILED_MATMUL_THRESHOLD) - bool use_tiled = (M >= TILED_MATMUL_THRESHOLD || N >= TILED_MATMUL_THRESHOLD || K >= TILED_MATMUL_THRESHOLD); + // Select kernel based on matrix size and dtype + bool use_optimized = (a.dtype() == DataType::Float32) && + (M >= OPTIMIZED_MATMUL_THRESHOLD || + N >= OPTIMIZED_MATMUL_THRESHOLD || + K >= OPTIMIZED_MATMUL_THRESHOLD); + + bool use_tiled = !use_optimized && + (M >= TILED_MATMUL_THRESHOLD || + N >= TILED_MATMUL_THRESHOLD || + K >= TILED_MATMUL_THRESHOLD); + + if (use_optimized) { + // RTX 3090 Ti optimized kernel with 128x128 tiles + // Block size: 16x16 = 256 threads, each handles 8x8 output elements + constexpr int OPT_BM = 128; + constexpr int OPT_BN = 128; + dim3 block_size(16, 16); // 256 threads + dim3 grid_size( + (N + OPT_BN - 1) / OPT_BN, + (M + OPT_BM - 1) / OPT_BM + ); - if (use_tiled) { + matmul_f32_optimized_kernel<<>>( + static_cast(a.data()), + static_cast(b.data()), + static_cast(c.data()), + M, N, K); + } else if (use_tiled) { // Tiled kernel with shared memory and double buffering // Block size: (TILE_N / THREAD_N, TILE_M / THREAD_M) = (16, 16) dim3 block_size(TILE_N / THREAD_N, TILE_M / THREAD_M); diff --git a/tests/test_3090ti_performance.py b/tests/test_3090ti_performance.py new file mode 100644 index 0000000..956a115 --- /dev/null +++ b/tests/test_3090ti_performance.py @@ -0,0 +1,271 @@ +""" +TDD Performance Tests for RTX 3090 Ti Optimization (v0.2.2) + +RTX 3090 Ti Specs: +- 10752 CUDA cores (84 SMs * 128 cores/SM) +- Boost clock: ~1.86 GHz +- Theoretical FP32: ~40 TFLOPS +- Memory: 24GB GDDR6X, 1008 GB/s bandwidth +- Shared memory: 100KB per SM (configurable) +- L2 cache: 6MB + +Performance Targets: +- Target: 35.6 TFLOPS (90% of theoretical) +- Minimum: 22 TFLOPS (must beat PyTorch baseline) +""" +import os +import sys +import time + +import numpy as np +import pytest + +# Setup CUDA DLL path +cuda_path = os.environ.get( + "CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" +) +cuda_bin = 
os.path.join(cuda_path, "bin") +if cuda_bin not in os.environ.get("PATH", ""): + os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") +if hasattr(os, "add_dll_directory"): + os.add_dll_directory(cuda_bin) + +# Skip if native module not available +try: + import _pygpukit_native as native +except ImportError: + # Try via pygpukit package + try: + from pygpukit import _pygpukit_native as native + except ImportError: + pytest.skip("Native module not available", allow_module_level=True) + +# RTX 3090 Ti performance constants +RTX_3090TI_THEORETICAL_TFLOPS = 40.0 +TARGET_EFFICIENCY = 0.90 # 90% +MINIMUM_EFFICIENCY = 0.55 # 55% (must beat PyTorch) +TARGET_TFLOPS = RTX_3090TI_THEORETICAL_TFLOPS * TARGET_EFFICIENCY # 35.6 +MINIMUM_TFLOPS = RTX_3090TI_THEORETICAL_TFLOPS * MINIMUM_EFFICIENCY # 22 + + +def compute_tflops(m: int, n: int, k: int, time_sec: float) -> float: + """Compute TFLOPS for matrix multiplication.""" + flops = 2 * m * n * k # multiply-add = 2 ops + return flops / time_sec / 1e12 + + +def benchmark_matmul(m: int, n: int, k: int, warmup: int = 3, iterations: int = 10): + """Benchmark matmul and return median time and TFLOPS.""" + A_np = np.random.randn(m, k).astype(np.float32) + B_np = np.random.randn(k, n).astype(np.float32) + + # Warmup + for _ in range(warmup): + A_gpu = native.from_numpy(A_np) + B_gpu = native.from_numpy(B_np) + _ = native.matmul(A_gpu, B_gpu) + + # Benchmark + times = [] + for _ in range(iterations): + A_gpu = native.from_numpy(A_np) + B_gpu = native.from_numpy(B_np) + start = time.perf_counter() + C_gpu = native.matmul(A_gpu, B_gpu) + elapsed = time.perf_counter() - start + times.append(elapsed) + + median_time = np.median(times) + tflops = compute_tflops(m, n, k, median_time) + return median_time, tflops + + +@pytest.fixture(scope="module") +def check_3090ti(): + """Check if running on RTX 3090 Ti or compatible high-end GPU.""" + if not native.is_cuda_available(): + pytest.skip("CUDA not available") + + props = native.get_device_properties(0) + # Allow RTX 3090, 3090 Ti, 4090, or any high-end Ampere/Ada GPU + high_end_gpus = ["3090", "4090", "4080", "A100", "H100"] + is_high_end = any(gpu in props.name for gpu in high_end_gpus) + if not is_high_end: + pytest.skip(f"Not a high-end GPU: {props.name}") + + print(f"\nRunning on: {props.name}") + return props + + +class TestMinimumPerformance: + """Tests for minimum performance requirements (22 TFLOPS).""" + + def test_4096x4096_minimum_tflops(self, check_3090ti): + """4096x4096 matmul must achieve at least 22 TFLOPS.""" + _, tflops = benchmark_matmul(4096, 4096, 4096) + print(f"\n4096x4096: {tflops:.1f} TFLOPS (minimum: {MINIMUM_TFLOPS})") + assert tflops >= MINIMUM_TFLOPS, ( + f"4096x4096 matmul achieved only {tflops:.1f} TFLOPS, " + f"minimum required: {MINIMUM_TFLOPS} TFLOPS" + ) + + def test_8192x8192_minimum_tflops(self, check_3090ti): + """8192x8192 matmul must achieve at least 22 TFLOPS.""" + _, tflops = benchmark_matmul(8192, 8192, 8192, warmup=2, iterations=5) + print(f"\n8192x8192: {tflops:.1f} TFLOPS (minimum: {MINIMUM_TFLOPS})") + assert tflops >= MINIMUM_TFLOPS, ( + f"8192x8192 matmul achieved only {tflops:.1f} TFLOPS, " + f"minimum required: {MINIMUM_TFLOPS} TFLOPS" + ) + + def test_2048x2048_reasonable_tflops(self, check_3090ti): + """2048x2048 matmul should achieve at least 15 TFLOPS.""" + min_2k = 15.0 # Lower threshold for smaller matrix + _, tflops = benchmark_matmul(2048, 2048, 2048) + print(f"\n2048x2048: {tflops:.1f} TFLOPS (minimum: {min_2k})") + assert tflops >= min_2k, ( 
+ f"2048x2048 matmul achieved only {tflops:.1f} TFLOPS, " + f"minimum required: {min_2k} TFLOPS" + ) + + +class TestTargetPerformance: + """Tests for target performance (35.6 TFLOPS, 90% efficiency).""" + + def test_4096x4096_target_tflops(self, check_3090ti): + """4096x4096 matmul should achieve 35.6 TFLOPS target.""" + _, tflops = benchmark_matmul(4096, 4096, 4096) + print(f"\n4096x4096: {tflops:.1f} TFLOPS (target: {TARGET_TFLOPS})") + assert tflops >= TARGET_TFLOPS, ( + f"4096x4096 matmul achieved only {tflops:.1f} TFLOPS, " + f"target: {TARGET_TFLOPS} TFLOPS" + ) + + def test_8192x8192_target_tflops(self, check_3090ti): + """8192x8192 matmul should achieve 35.6 TFLOPS target.""" + _, tflops = benchmark_matmul(8192, 8192, 8192, warmup=2, iterations=5) + print(f"\n8192x8192: {tflops:.1f} TFLOPS (target: {TARGET_TFLOPS})") + assert tflops >= TARGET_TFLOPS, ( + f"8192x8192 matmul achieved only {tflops:.1f} TFLOPS, " + f"target: {TARGET_TFLOPS} TFLOPS" + ) + + def test_large_matrix_sustained_performance(self, check_3090ti): + """Large matrices should sustain high performance.""" + sizes = [(4096, 4096, 4096), (6144, 6144, 6144), (8192, 8192, 8192)] + results = [] + + for m, n, k in sizes: + iters = 5 if m >= 6144 else 10 + _, tflops = benchmark_matmul(m, n, k, warmup=2, iterations=iters) + results.append((m, tflops)) + print(f"\n{m}x{n}x{k}: {tflops:.1f} TFLOPS") + + # All should be above minimum + for size, tflops in results: + assert tflops >= MINIMUM_TFLOPS, ( + f"{size}x{size} only achieved {tflops:.1f} TFLOPS" + ) + + +class TestNonSquareMatrices: + """Tests for non-square matrix performance.""" + + def test_tall_skinny_matrix(self, check_3090ti): + """Tall-skinny matrices (common in ML) should be efficient.""" + # Typical inference shape: batch x hidden x output + _, tflops = benchmark_matmul(8192, 4096, 1024) + print(f"\n8192x4096x1024 (tall-skinny): {tflops:.1f} TFLOPS") + # Lower threshold for non-square + assert tflops >= 15.0, f"Tall-skinny only achieved {tflops:.1f} TFLOPS" + + def test_wide_matrix(self, check_3090ti): + """Wide matrices should be efficient.""" + _, tflops = benchmark_matmul(1024, 8192, 4096) + print(f"\n1024x8192x4096 (wide): {tflops:.1f} TFLOPS") + assert tflops >= 15.0, f"Wide matrix only achieved {tflops:.1f} TFLOPS" + + def test_transformer_attention_shapes(self, check_3090ti): + """Transformer attention-like shapes should be efficient.""" + # QK^T: (batch*heads, seq, head_dim) x (batch*heads, head_dim, seq) + # Typical: 32*16=512 batch, 2048 seq, 64 head_dim + _, tflops = benchmark_matmul(512, 2048, 64) + print(f"\n512x2048x64 (attention QK): {tflops:.1f} TFLOPS") + # Small K dimension limits performance + assert tflops >= 5.0 + + +class TestCorrectness: + """Verify correctness is maintained with optimizations.""" + + def test_matmul_correctness_small(self, check_3090ti): + """Small matmul should be numerically correct.""" + A = np.random.randn(256, 256).astype(np.float32) + B = np.random.randn(256, 256).astype(np.float32) + + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + C_gpu = native.matmul(A_gpu, B_gpu) + C_result = C_gpu.to_numpy() + + C_expected = A @ B + rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) + assert rel_error < 1e-5, f"Relative error too high: {rel_error}" + + def test_matmul_correctness_large(self, check_3090ti): + """Large matmul should be numerically correct.""" + A = np.random.randn(4096, 4096).astype(np.float32) + B = np.random.randn(4096, 4096).astype(np.float32) + + A_gpu = 
native.from_numpy(A) + B_gpu = native.from_numpy(B) + C_gpu = native.matmul(A_gpu, B_gpu) + C_result = C_gpu.to_numpy() + + C_expected = A @ B + rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) + assert rel_error < 1e-4, f"Relative error too high: {rel_error}" + + def test_matmul_correctness_non_square(self, check_3090ti): + """Non-square matmul should be numerically correct.""" + A = np.random.randn(2048, 1024).astype(np.float32) + B = np.random.randn(1024, 4096).astype(np.float32) + + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + C_gpu = native.matmul(A_gpu, B_gpu) + C_result = C_gpu.to_numpy() + + C_expected = A @ B + rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) + assert rel_error < 1e-4, f"Relative error too high: {rel_error}" + + +class TestEfficiencyMetrics: + """Tests for efficiency metrics and roofline analysis.""" + + def test_compute_bound_efficiency(self, check_3090ti): + """Large square matrices should be compute-bound with high efficiency.""" + _, tflops = benchmark_matmul(8192, 8192, 8192, warmup=2, iterations=5) + efficiency = tflops / RTX_3090TI_THEORETICAL_TFLOPS + print(f"\n8192x8192 efficiency: {efficiency*100:.1f}%") + assert efficiency >= 0.55, f"Efficiency only {efficiency*100:.1f}%" + + def test_memory_bandwidth_utilization(self, check_3090ti): + """Measure effective memory bandwidth for small K.""" + # Small K = memory-bound + m, n, k = 8192, 8192, 64 + time_sec, _ = benchmark_matmul(m, n, k) + + # Data transfer: A (m*k) + B (k*n) + C (m*n) in float32 + bytes_transferred = (m * k + k * n + m * n) * 4 + bandwidth_gbps = bytes_transferred / time_sec / 1e9 + + print(f"\n{m}x{n}x{k} bandwidth: {bandwidth_gbps:.1f} GB/s") + # RTX 3090 Ti has 1008 GB/s peak bandwidth + assert bandwidth_gbps >= 400, f"Bandwidth only {bandwidth_gbps:.1f} GB/s" + + +if __name__ == "__main__": + # Run with verbose output + pytest.main([__file__, "-v", "-s"]) From b59feac7a5fffc071d5d9243e4246797b5ed1ec4 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Sat, 13 Dec 2025 14:39:02 +0900 Subject: [PATCH 2/8] feat(gemm): Add Ampere-optimized SGEMM kernel with cp.async pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Implement 4-stage software pipelined GEMM using cp.async for async memory transfers - Configuration: BM=128, BN=128, BK=16, 256 threads, 8x8 thread tiles - Fix critical row-major A stride calculation bug (BK+PAD, not BM+PAD) - Use float4 vectorized loads for both A and B matrices - Achieve ~18 TFLOPS on RTX 3090 Ti at 8192x8192 (51% theoretical efficiency) - Full correctness verification passes for all matrix sizes (256-4096) - Require SM >= 80 (Ampere) for cp.async support Performance results: - 8192x8192: 18.2 TFLOPS (max: 18.3) - 4096x4096: 13.2 TFLOPS - 2048x2048: 7.6 TFLOPS πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- benchmark_ampere.py | 98 +++++ native/ops/basic.cu | 15 +- native/ops/matmul_f32_ampere.cuh | 635 +++++++++++++++++++++++++++++++ pyproject.toml | 3 +- 4 files changed, 739 insertions(+), 12 deletions(-) create mode 100644 benchmark_ampere.py create mode 100644 native/ops/matmul_f32_ampere.cuh diff --git a/benchmark_ampere.py b/benchmark_ampere.py new file mode 100644 index 0000000..45e11a5 --- /dev/null +++ b/benchmark_ampere.py @@ -0,0 +1,98 @@ +"""Benchmark Ampere-optimized GEMM kernel.""" +import os +import time + +import numpy as np + +# Setup CUDA DLL path +cuda_path = 
os.environ.get( + "CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" +) +cuda_bin = os.path.join(cuda_path, "bin") +if cuda_bin not in os.environ.get("PATH", ""): + os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") +if hasattr(os, "add_dll_directory"): + os.add_dll_directory(cuda_bin) + +# Import native module +try: + import _pygpukit_native as native +except ImportError: + from pygpukit import _pygpukit_native as native + +props = native.get_device_properties(0) +print(f"GPU: {props.name}") +print() + + +def verify_correctness(m, n, k): + """Verify kernel correctness.""" + A = np.random.randn(m, k).astype(np.float32) + B = np.random.randn(k, n).astype(np.float32) + + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + C_gpu = native.matmul(A_gpu, B_gpu) + C_result = C_gpu.to_numpy() + + C_expected = A @ B + rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) + return rel_error + + +def benchmark_matmul(m, n, k, warmup=3, iterations=10): + """Benchmark matmul and return median time and TFLOPS.""" + A_np = np.random.randn(m, k).astype(np.float32) + B_np = np.random.randn(k, n).astype(np.float32) + + # Pre-allocate GPU arrays + A_gpu = native.from_numpy(A_np) + B_gpu = native.from_numpy(B_np) + + # Warmup + for _ in range(warmup): + C_gpu = native.matmul(A_gpu, B_gpu) + + # Benchmark (reuse same input arrays) + times = [] + for _ in range(iterations): + start = time.perf_counter() + C_gpu = native.matmul(A_gpu, B_gpu) + elapsed = time.perf_counter() - start + times.append(elapsed) + + median_time = np.median(times) + min_time = np.min(times) + flops = 2 * m * n * k + tflops_median = flops / median_time / 1e12 + tflops_max = flops / min_time / 1e12 + return median_time, tflops_median, tflops_max + + +# First verify correctness +print("=== Correctness Verification ===") +for size in [256, 512, 1024, 2048, 4096]: + error = verify_correctness(size, size, size) + status = "PASS" if error < 1e-4 else "FAIL" + print(f"{size}x{size}: relative error = {error:.2e} [{status}]") + +print() + +# Benchmark different sizes +sizes = [ + (2048, 2048, 2048), + (4096, 4096, 4096), + (8192, 8192, 8192), +] + +print("=== Ampere-Optimized GEMM Benchmark ===") +print() +for m, n, k in sizes: + iters = 5 if m >= 8192 else 10 + time_ms, tflops_med, tflops_max = benchmark_matmul(m, n, k, warmup=5, iterations=iters) + status = "PASS" if tflops_med >= 22.0 else "FAIL" + print(f"{m}x{n}x{k}: {tflops_med:.1f} TFLOPS (max: {tflops_max:.1f}) - {time_ms*1000:.2f} ms [{status}]") + +print() +print("Target: 22-32 TFLOPS (62-90% efficiency on RTX 3090 Ti)") +print("Minimum: 22 TFLOPS to beat PyTorch baseline") diff --git a/native/ops/basic.cu b/native/ops/basic.cu index 4c16a1f..7ce5617 100644 --- a/native/ops/basic.cu +++ b/native/ops/basic.cu @@ -1,4 +1,5 @@ #include "basic.cuh" +#include "matmul_f32_ampere.cuh" #include #ifdef PYGPUKIT_DRIVER_ONLY @@ -804,17 +805,9 @@ void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c) { K >= TILED_MATMUL_THRESHOLD); if (use_optimized) { - // RTX 3090 Ti optimized kernel with 128x128 tiles - // Block size: 16x16 = 256 threads, each handles 8x8 output elements - constexpr int OPT_BM = 128; - constexpr int OPT_BN = 128; - dim3 block_size(16, 16); // 256 threads - dim3 grid_size( - (N + OPT_BN - 1) / OPT_BN, - (M + OPT_BM - 1) / OPT_BM - ); - - matmul_f32_optimized_kernel<<>>( + // Ampere-optimized kernel with cp.async and 4-stage pipeline + // Target: 22-32 TFLOPS on RTX 3090 Ti + 
ampere::launch_sgemm_ampere( static_cast(a.data()), static_cast(b.data()), static_cast(c.data()), diff --git a/native/ops/matmul_f32_ampere.cuh b/native/ops/matmul_f32_ampere.cuh new file mode 100644 index 0000000..c1fa0cc --- /dev/null +++ b/native/ops/matmul_f32_ampere.cuh @@ -0,0 +1,635 @@ +/** + * Ampere-Optimized FP32 GEMM Kernel for RTX 3090 Ti + * + * Target: 22-30 TFLOPS (62-85% of 35.6 TFLOPS theoretical) + * + * Key optimizations based on CUTLASS/cuBLAS patterns: + * - cp.async with 4-stage software pipeline + * - BK=16 with 4 stages for proper latency hiding + * - Single __syncthreads() per K iteration + * - Warp-contiguous memory access patterns + * - 128-byte cache line aligned loads + * - Proper wait_group(STAGES-2) placement AFTER load issue + * + * Architecture: SM 8.6 (Ampere, RTX 3090 Ti) + */ + +#pragma once + +#include +#include + +namespace pygpukit { +namespace ops { +namespace ampere { + +// ============================================================================ +// Configuration Constants - Tuned for RTX 3090 Ti +// ============================================================================ + +// CTA tile dimensions - ROW-MAJOR A with float4 cp.async +constexpr int BM = 128; // Tile rows per block +constexpr int BN = 128; // Tile cols per block +constexpr int BK = 16; // Tile depth - 16 for good balance + +// Thread tile dimensions +constexpr int TM = 8; // Rows per thread +constexpr int TN = 8; // Cols per thread + +// Block dimensions: (BN/TN, BM/TM) = (16, 16) = 256 threads +constexpr int BLOCK_DIM_X = BN / TN; // 16 +constexpr int BLOCK_DIM_Y = BM / TM; // 16 +constexpr int NUM_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y; // 256 + +// Pipeline stages - 4 stages for latency hiding +// wait_group(STAGES-2) = wait_group(2) allows 2 groups in flight +constexpr int STAGES = 4; + +// ============================================================================ +// Shared memory layout for ROW-MAJOR A storage (BK=16) +// ============================================================================ +// A is stored ROW-MAJOR: Am[stage][m][k] where: +// - m = 0..127 (BM rows) +// - k = 0..15 (BK columns) +// - Stride = BK + PAD for each row +// +// B is stored ROW-MAJOR: Bs[stage][k][n] where: +// - k = 0..15 (BK rows) +// - n = 0..127 (BN columns) +// - Stride = BN + PAD for each row + +constexpr int SMEM_PAD_A = 4; // stride=20 for row-major A (BK=16) +constexpr int SMEM_PAD_B = 8; // stride=136 for B + +// Shared memory strides +constexpr int A_SMEM_STRIDE = BK + SMEM_PAD_A; // 20 (row-major A: m rows, k cols) +constexpr int B_SMEM_STRIDE = BN + SMEM_PAD_B; // 136 + +// Shared memory sizes per stage +// A: BM rows Γ— stride = 128 Γ— 20 = 2560 floats per stage +// B: BK rows Γ— stride = 16 Γ— 136 = 2176 floats per stage +constexpr int A_STAGE_SIZE = BM * A_SMEM_STRIDE; // 128 * 20 = 2560 floats +constexpr int B_STAGE_SIZE = BK * B_SMEM_STRIDE; // 16 * 136 = 2176 floats + +// Total shared memory: 4 stages * (2560 + 2176) * 4 bytes = 75,776 bytes = 74 KB +// Fits within RTX 3090 Ti's 100KB limit! 
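+// Optional compile-time sanity check (illustrative only; it restates the
+// arithmetic above using the constants defined in this file):
+static_assert(STAGES * (A_STAGE_SIZE + B_STAGE_SIZE) * sizeof(float) == 75776,
+              "4-stage A and B tile buffers should total 75,776 bytes (~74 KB)");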
+ +// ============================================================================ +// Helper Functions for cp.async +// ============================================================================ + +// Convert generic pointer to shared memory address for PTX +__device__ __forceinline__ unsigned int cvta_to_shared(const void* ptr) { + unsigned int smem_addr; + asm volatile( + "{ .reg .u64 smem_ptr64;\n" + " cvta.to.shared.u64 smem_ptr64, %1;\n" + " cvt.u32.u64 %0, smem_ptr64; }\n" + : "=r"(smem_addr) : "l"(ptr) + ); + return smem_addr; +} + +// cp.async 4-byte copy (single float) - cache at all levels (.ca) +// Note: .cg only supports 16 bytes, .ca supports 4, 8, 16 bytes +__device__ __forceinline__ void cp_async_cg_4(void* dst, const void* src) { + unsigned int dst_smem = cvta_to_shared(dst); + asm volatile( + "cp.async.ca.shared.global [%0], [%1], 4;\n" + :: "r"(dst_smem), "l"(src) + ); +} + +// cp.async 16-byte copy (float4) - cache global (.cg) for better throughput +__device__ __forceinline__ void cp_async_cg_16(void* dst, const void* src) { + unsigned int dst_smem = cvta_to_shared(dst); + asm volatile( + "cp.async.cg.shared.global [%0], [%1], 16;\n" + :: "r"(dst_smem), "l"(src) + ); +} + +// Commit current async copy group +__device__ __forceinline__ void cp_async_commit() { + asm volatile("cp.async.commit_group;\n" ::); +} + +// Wait for async copy groups - N = max groups still in flight +__device__ __forceinline__ void cp_async_wait_group(int N) { + // Note: N must be a compile-time constant in real usage + // Using template specialization for common cases + if (N == 0) { + asm volatile("cp.async.wait_group 0;\n" ::); + } else if (N == 1) { + asm volatile("cp.async.wait_group 1;\n" ::); + } else if (N == 2) { + asm volatile("cp.async.wait_group 2;\n" ::); + } else if (N == 3) { + asm volatile("cp.async.wait_group 3;\n" ::); + } +} + +// ============================================================================ +// High-Performance SGEMM Kernel with TRUE 3-Stage Pipeline +// ============================================================================ + +/** + * C = A Γ— B + * A: M Γ— K (row-major) + * B: K Γ— N (row-major) + * C: M Γ— N (row-major) + * + * Grid: ((N + BN - 1) / BN, (M + BM - 1) / BM) + * Block: (16, 16) = 256 threads + * + * Shared memory layout: + * As[stage][k][m] - A tile stored transposed for coalesced smem reads + * Bs[stage][k][n] - B tile stored normally + */ +__global__ void __launch_bounds__(256, 2) +sgemm_128x128x32_3stage( + const float* __restrict__ A, + const float* __restrict__ B, + float* __restrict__ C, + int M, int N, int K +) { + // Thread indices + const int tx = threadIdx.x; // 0-15 (column within thread block) + const int ty = threadIdx.y; // 0-15 (row within thread block) + const int tid = ty * BLOCK_DIM_X + tx; // 0-255 + + // Block indices + const int bx = blockIdx.x; + const int by = blockIdx.y; + + // Global starting positions for this CTA + const int cta_row = by * BM; + const int cta_col = bx * BN; + + // ======================================================================== + // Shared Memory Allocation (4-stage pipeline) + // ======================================================================== + extern __shared__ float smem[]; + + // Am[stage][m][k] - ROW-MAJOR storage for float4 async loads + // Bs[stage][k][n] - row-major storage + float* Am = smem; + float* Bs = smem + STAGES * A_STAGE_SIZE; + + // Indexing macros for shared memory + // Am[m][k] with stride=20: row-major, float4 aligned for k={0,4,8,12} + // Position = stage 
* 2560 + m * 20 + k + #define AM(stage, m, k) Am[(stage) * A_STAGE_SIZE + (m) * A_SMEM_STRIDE + (k)] + #define BS(stage, k, n) Bs[(stage) * B_STAGE_SIZE + (k) * B_SMEM_STRIDE + (n)] + + // ======================================================================== + // Register Allocation + // ======================================================================== + // Accumulators: 8Γ—8 = 64 floats per thread + float acc[TM][TN]; + #pragma unroll + for (int i = 0; i < TM; ++i) { + #pragma unroll + for (int j = 0; j < TN; ++j) { + acc[i][j] = 0.0f; + } + } + + // Fragments for register tiling + float a_frag[TM]; + float b_frag[TN]; + + // ======================================================================== + // Load Configuration + // ======================================================================== + // Each thread loads multiple elements per tile + // A tile: BM Γ— BK = 128 Γ— 32 = 4096 elements, 256 threads β†’ 16 per thread + // B tile: BK Γ— BN = 32 Γ— 128 = 4096 elements, 256 threads β†’ 16 per thread + + // For warp-contiguous loads, organize by warps + const int warp_id = tid / 32; // 0-7 (8 warps) + const int lane_id = tid % 32; // 0-31 + + // Number of K tiles + const int num_k_tiles = (K + BK - 1) / BK; + + // ======================================================================== + // Async Load Functions (WARP-COALESCED patterns) + // ======================================================================== + // + // CRITICAL for performance: + // - A is row-major (MxK): A[m,k] = A[m*K + k] + // Consecutive K values are contiguous in memory + // β†’ Organize so consecutive THREADS load consecutive K values + // + // - B is row-major (KxN): B[k,n] = B[k*N + n] + // Consecutive N values are contiguous in memory + // β†’ Already using float4 for consecutive N values (good) + // + // Pattern: elem_idx = tid + i * NUM_THREADS ensures consecutive threads + // load consecutive elements in each iteration. 
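+    //
+    // Worked example for the A tile load below (BK = 16, i.e. 4 float4s per row):
+    //   i = 0: tid 0-3 load row 0 (k = 0, 4, 8, 12), tid 4-7 load row 1, ...
+    //          up to tid 255 finishing row 63.
+    //   i = 1: tid + 256 repeats the same pattern for rows 64-127.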
+ + // Load A tile with ROW-MAJOR storage using float4 cp.async + // A is row-major in global memory: A[m][k] = A[m*K + k] + // Store to shared memory ROW-MAJOR: AM[m][k] with stride=20 + // + // Tile: 128 Γ— 16 = 2048 elements = 512 float4s + // 256 threads Γ— 2 float4s/thread = 512 float4s + // + // CRITICAL: Both source (global) and destination (shared) are 16-byte aligned + auto load_A_tile = [&](int stage, int k_tile) { + const int k_base = k_tile * BK; + + // Each thread loads 2 float4s (8 floats total) + #pragma unroll + for (int i = 0; i < 2; ++i) { + const int float4_idx = tid + i * NUM_THREADS; + + // Each row has BK/4 = 4 float4s (k = 0,4,8,12) + const int a_m = float4_idx / (BK / 4); // 0-127 + const int a_k = (float4_idx % (BK / 4)) * 4; // 0, 4, 8, 12 + + const int global_m = cta_row + a_m; + const int global_k = k_base + a_k; + + // Destination in shared memory: AM[stage][m][k] + float* dst = &AM(stage, a_m, a_k); + + if (global_m < M && global_k + 3 < K) { + // float4 cp.async - both src and dst are 16-byte aligned + const float* src = &A[global_m * K + global_k]; + cp_async_cg_16(dst, src); + } else { + // Boundary handling with 4-byte copies + #pragma unroll + for (int j = 0; j < 4; ++j) { + if (global_m < M && global_k + j < K) { + cp_async_cg_4(&dst[j], &A[global_m * K + global_k + j]); + } else { + dst[j] = 0.0f; + } + } + } + } + }; + + // Load B tile with COALESCED float4 access + // 16 Γ— 128 = 2048 elements = 512 float4s (BK=16) + // 256 threads Γ— 2 float4s/thread = 512 float4s + auto load_B_tile = [&](int stage, int k_tile) { + const int k_base = k_tile * BK; + + #pragma unroll + for (int i = 0; i < 2; ++i) { + // Consecutive threads load consecutive float4s + const int float4_idx = tid + i * NUM_THREADS; + + // Each row has BN/4 = 32 float4s + const int b_k = float4_idx / (BN / 4); // 0-15 + const int b_n = (float4_idx % (BN / 4)) * 4; // 0, 4, 8, ..., 124 + + const int global_k = k_base + b_k; + const int global_n = cta_col + b_n; + + float* dst = &BS(stage, b_k, b_n); + + if (global_k < K && global_n + 3 < N) { + const float* src = &B[global_k * N + global_n]; + cp_async_cg_16(dst, src); // float4 = 16 bytes, coalesced! + } else { + #pragma unroll + for (int j = 0; j < 4; ++j) { + if (global_k < K && global_n + j < N) { + const float* src = &B[global_k * N + global_n + j]; + cp_async_cg_4(&dst[j], src); + } else { + dst[j] = 0.0f; + } + } + } + } + }; + + // ======================================================================== + // Pipeline Prologue: Fill STAGES-1 tiles + // ======================================================================== + // Issue async loads for first STAGES-1 tiles + #pragma unroll + for (int s = 0; s < STAGES - 1; ++s) { + if (s < num_k_tiles) { + load_A_tile(s, s); + load_B_tile(s, s); + } + cp_async_commit(); + } + + // ======================================================================== + // Main Pipeline Loop + // ======================================================================== + // CRITICAL: The correct pipeline pattern is: + // 1. Issue load for tile k+STAGES-1 + // 2. Commit + // 3. Wait for tile k to be ready + // 4. Sync + // 5. 
Compute tile k + + for (int k_tile = 0; k_tile < num_k_tiles; ++k_tile) { + // Current stage for compute + const int compute_stage = k_tile % STAGES; + + // Stage for loading (STAGES-1 tiles ahead) + const int load_stage = (k_tile + STAGES - 1) % STAGES; + const int load_k_tile = k_tile + STAGES - 1; + + // Step 1: Issue async loads for future tile + if (load_k_tile < num_k_tiles) { + load_A_tile(load_stage, load_k_tile); + load_B_tile(load_stage, load_k_tile); + } + + // Step 2: Commit this group + cp_async_commit(); + + // Step 3: Wait for compute stage to be ready + // wait_group(STAGES-2) means wait until ≀(STAGES-2) groups outstanding + // With STAGES=3, wait_group(1) ensures tile k_tile is ready + cp_async_wait_group(STAGES - 2); + + // Step 4: Single sync point + __syncthreads(); + + // Step 5: Compute - load fragments from shared memory and accumulate + // ROW-MAJOR A storage: AM[m][k] + #pragma unroll + for (int k = 0; k < BK; ++k) { + // Load A fragment: 8 values from AM[ty*8..ty*8+7][k] + #pragma unroll + for (int m = 0; m < TM; ++m) { + a_frag[m] = AM(compute_stage, ty * TM + m, k); + } + + // Load B fragment: 8 consecutive N values for this thread + #pragma unroll + for (int n = 0; n < TN; ++n) { + b_frag[n] = BS(compute_stage, k, tx * TN + n); + } + + // Outer product with FMA + #pragma unroll + for (int m = 0; m < TM; ++m) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + acc[m][n] = fmaf(a_frag[m], b_frag[n], acc[m][n]); + } + } + } + + // NO second __syncthreads() here - single barrier per iteration + } + + // ======================================================================== + // Epilogue: Write results to global memory + // ======================================================================== + #pragma unroll + for (int m = 0; m < TM; ++m) { + const int global_m = cta_row + ty * TM + m; + + if (global_m < M) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + const int global_n = cta_col + tx * TN + n; + + if (global_n < N) { + C[global_m * N + global_n] = acc[m][n]; + } + } + } + } + + #undef AM + #undef BS +} + +// ============================================================================ +// Alternative: 4-Stage Pipeline with BK=16 (fits in default 48KB smem) +// ============================================================================ + +// Configuration for smaller BK +constexpr int BK_SMALL = 16; +constexpr int STAGES_4 = 4; +constexpr int A_STAGE_SIZE_SMALL = BK_SMALL * A_SMEM_STRIDE; // 16 * 136 = 2176 +constexpr int B_STAGE_SIZE_SMALL = BK_SMALL * B_SMEM_STRIDE; // 16 * 136 = 2176 +// Total: 4 * (2176 + 2176) * 4 = 69,632 bytes = 68KB - fits in default smem! 
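+// Note: launch_sgemm_ampere() below requests this amount as dynamic shared
+// memory via cudaFuncSetAttribute(cudaFuncAttributeMaxDynamicSharedMemorySize)
+// before launching the 4-stage variant.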
+ +/** + * 4-stage pipeline variant with BK=16 + * Slightly less compute per load, but more stages for latency hiding + */ +__global__ void __launch_bounds__(256, 2) +sgemm_128x128x16_4stage( + const float* __restrict__ A, + const float* __restrict__ B, + float* __restrict__ C, + int M, int N, int K +) { + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int tid = ty * BLOCK_DIM_X + tx; + + const int bx = blockIdx.x; + const int by = blockIdx.y; + + const int cta_row = by * BM; + const int cta_col = bx * BN; + + extern __shared__ float smem[]; + float* As = smem; + float* Bs = smem + STAGES_4 * A_STAGE_SIZE_SMALL; + + #define AS4(stage, k, m) As[(stage) * A_STAGE_SIZE_SMALL + (k) * A_SMEM_STRIDE + (m)] + #define BS4(stage, k, n) Bs[(stage) * B_STAGE_SIZE_SMALL + (k) * B_SMEM_STRIDE + (n)] + + float acc[TM][TN]; + #pragma unroll + for (int i = 0; i < TM; ++i) { + #pragma unroll + for (int j = 0; j < TN; ++j) { + acc[i][j] = 0.0f; + } + } + + float a_frag[TM]; + float b_frag[TN]; + + const int num_k_tiles = (K + BK_SMALL - 1) / BK_SMALL; + + // Load functions for BK=16 with FULLY ASYNC cp.async + auto load_A = [&](int stage, int k_tile) { + const int k_base = k_tile * BK_SMALL; + // 128 Γ— 16 = 2048 elements + // 256 threads Γ— 8 elements/thread = 2048 elements + + #pragma unroll + for (int i = 0; i < 8; ++i) { + const int elem_idx = tid + i * NUM_THREADS; + + const int a_k = elem_idx % BK_SMALL; // 0-15 + const int a_m = elem_idx / BK_SMALL; // 0-127 + + const int global_m = cta_row + a_m; + const int global_k = k_base + a_k; + + float* dst = &AS4(stage, a_k, a_m); + + if (global_m < M && global_k < K) { + const float* src = &A[global_m * K + global_k]; + cp_async_cg_4(dst, src); // Fully async + } else { + *dst = 0.0f; + } + } + }; + + auto load_B = [&](int stage, int k_tile) { + const int k_base = k_tile * BK_SMALL; + // 16 Γ— 128 = 2048 elements = 512 float4s, 256 threads β†’ 2 float4 per thread + #pragma unroll + for (int i = 0; i < 2; ++i) { + const int float4_idx = tid + i * NUM_THREADS; + const int b_k = float4_idx / (BN / 4); + const int b_n = (float4_idx % (BN / 4)) * 4; + + const int global_k = k_base + b_k; + const int global_n = cta_col + b_n; + + float* dst = &BS4(stage, b_k, b_n); + + if (global_k < K && global_n + 3 < N) { + cp_async_cg_16(dst, &B[global_k * N + global_n]); + } else { + for (int j = 0; j < 4; ++j) { + if (global_k < K && global_n + j < N) { + cp_async_cg_4(&dst[j], &B[global_k * N + global_n + j]); + } else { + dst[j] = 0.0f; + } + } + } + } + }; + + // Prologue: fill STAGES_4-1 = 3 tiles + #pragma unroll + for (int s = 0; s < STAGES_4 - 1; ++s) { + if (s < num_k_tiles) { + load_A(s, s); + load_B(s, s); + } + cp_async_commit(); + } + + // Main loop + for (int k_tile = 0; k_tile < num_k_tiles; ++k_tile) { + const int compute_stage = k_tile % STAGES_4; + const int load_stage = (k_tile + STAGES_4 - 1) % STAGES_4; + const int load_k_tile = k_tile + STAGES_4 - 1; + + if (load_k_tile < num_k_tiles) { + load_A(load_stage, load_k_tile); + load_B(load_stage, load_k_tile); + } + cp_async_commit(); + + // wait_group(STAGES_4 - 2) = wait_group(2) + cp_async_wait_group(STAGES_4 - 2); + __syncthreads(); + + #pragma unroll + for (int k = 0; k < BK_SMALL; ++k) { + #pragma unroll + for (int m = 0; m < TM; ++m) { + a_frag[m] = AS4(compute_stage, k, ty * TM + m); + } + #pragma unroll + for (int n = 0; n < TN; ++n) { + b_frag[n] = BS4(compute_stage, k, tx * TN + n); + } + #pragma unroll + for (int m = 0; m < TM; ++m) { + #pragma unroll + for (int n = 0; n < TN; 
++n) { + acc[m][n] = fmaf(a_frag[m], b_frag[n], acc[m][n]); + } + } + } + } + + // Epilogue + #pragma unroll + for (int m = 0; m < TM; ++m) { + const int global_m = cta_row + ty * TM + m; + if (global_m < M) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + const int global_n = cta_col + tx * TN + n; + if (global_n < N) { + C[global_m * N + global_n] = acc[m][n]; + } + } + } + } + + #undef AS4 + #undef BS4 +} + +// ============================================================================ +// Kernel Launch Helper with Dynamic Shared Memory Configuration +// ============================================================================ + +inline cudaError_t launch_sgemm_ampere( + const float* A, const float* B, float* C, + int M, int N, int K, + cudaStream_t stream = 0 +) { + dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); // 16Γ—16 = 256 threads + dim3 grid((N + BN - 1) / BN, (M + BM - 1) / BM); + + // Calculate shared memory sizes + const size_t smem_3stage = (STAGES * A_STAGE_SIZE + STAGES * B_STAGE_SIZE) * sizeof(float); + const size_t smem_4stage = (STAGES_4 * A_STAGE_SIZE_SMALL + STAGES_4 * B_STAGE_SIZE_SMALL) * sizeof(float); + + // Try 3-stage kernel first (needs ~104KB smem) + cudaError_t err; + + // Configure extended shared memory for 3-stage kernel + err = cudaFuncSetAttribute( + sgemm_128x128x32_3stage, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_3stage + ); + + if (err == cudaSuccess) { + // Use 3-stage kernel with BK=32 + sgemm_128x128x32_3stage<<>>(A, B, C, M, N, K); + return cudaGetLastError(); + } + + // Fallback to 4-stage kernel with BK=16 + err = cudaFuncSetAttribute( + sgemm_128x128x16_4stage, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_4stage + ); + + if (err == cudaSuccess) { + sgemm_128x128x16_4stage<<>>(A, B, C, M, N, K); + return cudaGetLastError(); + } + + return err; +} + +} // namespace ampere +} // namespace ops +} // namespace pygpukit diff --git a/pyproject.toml b/pyproject.toml index ef90535..62a84c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,8 @@ cmake.args = ["-DCMAKE_CUDA_COMPILER_WORKS=1"] build.targets = [] [tool.scikit-build.cmake.define] -CMAKE_CUDA_ARCHITECTURES = "70;75;80;86;89;90" +# PyGPUkit requires SM >= 80 (Ampere and newer) for cp.async support +CMAKE_CUDA_ARCHITECTURES = "80;86;89;90" [tool.cibuildwheel] # Skip PyPy, 32-bit builds, and musllinux From 6f151a08784b9b0e79fce705402e1677a6c095b6 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Sat, 13 Dec 2025 14:42:34 +0900 Subject: [PATCH 3/8] fix(lint): Fix unused variable warnings in benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- benchmark_ampere.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark_ampere.py b/benchmark_ampere.py index 45e11a5..f7764a6 100644 --- a/benchmark_ampere.py +++ b/benchmark_ampere.py @@ -51,13 +51,13 @@ def benchmark_matmul(m, n, k, warmup=3, iterations=10): # Warmup for _ in range(warmup): - C_gpu = native.matmul(A_gpu, B_gpu) + _ = native.matmul(A_gpu, B_gpu) # Benchmark (reuse same input arrays) times = [] for _ in range(iterations): start = time.perf_counter() - C_gpu = native.matmul(A_gpu, B_gpu) + _ = native.matmul(A_gpu, B_gpu) elapsed = time.perf_counter() - start times.append(elapsed) From 98983fa6aa7cda54a0f6ca0fcc4f17e6c0f7e455 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Sat, 13 Dec 2025 14:49:05 +0900 Subject: [PATCH 4/8] fix(ci): require 
SM >= 80 for cmake-check to support cp.async MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Ampere GEMM kernel uses cp.async which requires SM 80 or higher. This fixes the cmake-check CI failure. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2bd9d6..1a6ea64 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -90,6 +90,7 @@ jobs: mkdir -p build && cd build cmake .. \ -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_ARCHITECTURES="80;86;89;90" \ -Dpybind11_DIR=$(python -c "import pybind11; print(pybind11.get_cmake_dir())") - name: Build native module From 58f782f267940ac0f72babd0a3361e128c0d9496 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Sat, 13 Dec 2025 14:50:52 +0900 Subject: [PATCH 5/8] fix(lint): remove unused imports and variables in test file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- tests/test_3090ti_performance.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_3090ti_performance.py b/tests/test_3090ti_performance.py index 956a115..8f3d2de 100644 --- a/tests/test_3090ti_performance.py +++ b/tests/test_3090ti_performance.py @@ -14,7 +14,6 @@ - Minimum: 22 TFLOPS (must beat PyTorch baseline) """ import os -import sys import time import numpy as np @@ -71,7 +70,7 @@ def benchmark_matmul(m: int, n: int, k: int, warmup: int = 3, iterations: int = A_gpu = native.from_numpy(A_np) B_gpu = native.from_numpy(B_np) start = time.perf_counter() - C_gpu = native.matmul(A_gpu, B_gpu) + _ = native.matmul(A_gpu, B_gpu) elapsed = time.perf_counter() - start times.append(elapsed) From cb48e03ec2572c0b687c88a2c5f37629514a778d Mon Sep 17 00:00:00 2001 From: m96-chan Date: Sat, 13 Dec 2025 14:54:06 +0900 Subject: [PATCH 6/8] fix(tests): handle missing CUDA directory on Windows CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CUDA DLL path setup now checks if the directory exists before attempting to add it, preventing FileNotFoundError on CI runners without CUDA installed. 
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- benchmark_ampere.py | 11 +- docs/ampere_gemm_optimization.md | 207 +++++++++++++++++++++++++++++++ tests/test_3090ti_performance.py | 11 +- 3 files changed, 219 insertions(+), 10 deletions(-) create mode 100644 docs/ampere_gemm_optimization.md diff --git a/benchmark_ampere.py b/benchmark_ampere.py index f7764a6..322f04b 100644 --- a/benchmark_ampere.py +++ b/benchmark_ampere.py @@ -4,15 +4,16 @@ import numpy as np -# Setup CUDA DLL path +# Setup CUDA DLL path (if CUDA is installed) cuda_path = os.environ.get( "CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" ) cuda_bin = os.path.join(cuda_path, "bin") -if cuda_bin not in os.environ.get("PATH", ""): - os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") -if hasattr(os, "add_dll_directory"): - os.add_dll_directory(cuda_bin) +if os.path.isdir(cuda_bin): + if cuda_bin not in os.environ.get("PATH", ""): + os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") + if hasattr(os, "add_dll_directory"): + os.add_dll_directory(cuda_bin) # Import native module try: diff --git a/docs/ampere_gemm_optimization.md b/docs/ampere_gemm_optimization.md new file mode 100644 index 0000000..fce31a7 --- /dev/null +++ b/docs/ampere_gemm_optimization.md @@ -0,0 +1,207 @@ +# Ampere-Optimized FP32 GEMM Kernel Design + +## Executive Summary + +Target: 22-32 TFLOPS FP32 GEMM on RTX 3090 Ti (SM 8.6) +Current: ~10 TFLOPS (28% efficiency) +Peak: 35.6 TFLOPS theoretical + +## 1. RTX 3090 Ti Architecture Analysis + +### Hardware Specifications +- **SMs**: 84 +- **CUDA Cores**: 10,752 (128 per SM) +- **Clock**: ~1.86 GHz (boost) +- **Theoretical FP32**: 35.6 TFLOPS +- **Memory Bandwidth**: 1,008 GB/s (GDDR6X) +- **L2 Cache**: 6 MB +- **Shared Memory**: 100 KB per SM (max) +- **Registers**: 65,536 per SM (256 per thread max) + +### Ampere-Specific Features +1. **cp.async**: Asynchronous globalβ†’shared copy bypassing registers +2. **Async barriers**: Fine-grained pipeline synchronization +3. **Large shared memory**: Up to 100KB with L1 carveout +4. **Improved L2**: 6MB with better hit rates + +## 2. Current Kernel Bottleneck Analysis + +### Estimated Roofline Position +``` +Arithmetic Intensity (AI) = 2*M*N*K / (M*K + K*N + M*N) * 4 bytes +For 4096Γ—4096: AI = 2*4096^3 / (3*4096^2 * 4) β‰ˆ 682 FLOPS/byte + +Ridge point = 35.6 TFLOPS / 1.008 TB/s = 35.3 FLOPS/byte +AI >> Ridge point β†’ Should be compute-bound +``` + +### Why Current Kernel Underperforms + +1. **Synchronous Memory Loads**: + - Each tile load requires register staging + - Memory latency not hidden + - ~400 cycles latency per global load + +2. **Insufficient Pipelining**: + - Single buffer: compute waits for load + - No overlap between load and compute phases + +3. **Warp Stalls**: + - Barrier stalls (waiting for __syncthreads) + - Scoreboard stalls (register dependencies) + - Memory dependency stalls + +4. **Suboptimal Register Usage**: + - Not maximizing ILP within thread + - Potential register spilling + +## 3. 
Optimization Strategy + +### 3.1 Asynchronous Copy Pipeline (cp.async) + +Replace synchronous loads: +```cuda +// OLD: Synchronous (2 instructions, uses registers) +float val = A[global_idx]; +As[shared_idx] = val; + +// NEW: Asynchronous (1 instruction, bypasses registers) +asm volatile( + "cp.async.ca.shared.global [%0], [%1], 16;\n" + :: "r"(smem_ptr), "l"(gmem_ptr) +); +``` + +Benefits: +- Saves registers (no intermediate storage) +- Enables true async operation +- Better memory coalescing + +### 3.2 Multi-Stage Software Pipeline + +``` +Stage Layout (4 stages, BK=8): +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Stage 0 β”‚ Stage 1 β”‚ Stage 2 β”‚ Stage 3 β”‚ +β”‚ Load K+3 β”‚ Load K+2 β”‚ Load K+1 β”‚ Compute Kβ”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + +Timeline: +Iter 0: Load[0] +Iter 1: Load[1], Wait[0] +Iter 2: Load[2], Compute[0], Wait[1] +Iter 3: Load[3], Compute[1], Wait[2] +Iter 4: Load[4], Compute[2], Wait[3] +... +``` + +This hides ~300-400 cycles of memory latency. + +### 3.3 Tile Configuration + +``` +CTA Tile: 128 Γ— 128 (output elements per block) +Warp Tile: 32 Γ— 64 (each warp handles this) +Thread Tile: 8 Γ— 8 (each thread computes 64 outputs) +BK: 8 (small for more pipeline stages) +Stages: 4 (hide memory latency) + +Block: 256 threads (16Γ—16) +Grid: (N/128) Γ— (M/128) +``` + +### 3.4 Register Blocking + +Per thread register allocation: +- Accumulators: 8Γ—8 = 64 floats = 64 registers +- A fragment: 8 floats = 8 registers +- B fragment: 8 floats = 8 registers +- Indexing: ~8 registers +- **Total: ~88 registers/thread** + +With 256 threads: 256 Γ— 88 = 22,528 registers/block +SM has 65,536 β†’ Can run 2-3 blocks/SM β†’ 50-75% occupancy + +### 3.5 Shared Memory Layout + +``` +4-stage pipeline shared memory: +As[4][BK][BM+PAD] = 4 Γ— 8 Γ— (128+4) Γ— 4 bytes = 16,896 bytes +Bs[4][BK][BN+PAD] = 4 Γ— 8 Γ— (128+4) Γ— 4 bytes = 16,896 bytes +Total: ~34 KB (fits in 48KB default) +``` + +Padding eliminates bank conflicts. + +### 3.6 Instruction-Level Parallelism + +Within each thread's inner loop: +```cuda +// Interleave loads and computes for ILP +float a0 = As[k][ty*TM + 0]; +float b0 = Bs[k][tx*TN + 0]; +acc[0][0] = fmaf(a0, b0, acc[0][0]); + +float a1 = As[k][ty*TM + 1]; +float b1 = Bs[k][tx*TN + 1]; +acc[0][1] = fmaf(a0, b1, acc[0][1]); +acc[1][0] = fmaf(a1, b0, acc[1][0]); +// ... etc +``` + +## 4. Expected Performance + +### Theoretical Analysis + +``` +Operations per CTA per K-tile: + = 2 Γ— BM Γ— BN Γ— BK = 2 Γ— 128 Γ— 128 Γ— 8 = 262,144 FLOPs + +Memory loads per K-tile: + = (BM Γ— BK + BK Γ— BN) Γ— 4 = (1024 + 1024) Γ— 4 = 8,192 bytes + +Arithmetic Intensity = 262,144 / 8,192 = 32 FLOPS/byte +``` + +With 1 TB/s effective bandwidth: 32 Γ— 1000 = 32 TFLOPS theoretical max + +### Realistic Targets + +- **Minimum**: 22 TFLOPS (62% efficiency) +- **Target**: 28 TFLOPS (79% efficiency) +- **Stretch**: 32 TFLOPS (90% efficiency) + +## 5. 
Nsight Compute Metrics to Monitor + +### Key Metrics + +| Metric | Target | Description | +|--------|--------|-------------| +| SM Throughput | >80% | Compute utilization | +| Memory Throughput | >70% | DRAM bandwidth | +| Shared Memory Throughput | >80% | SMEM bandwidth | +| Occupancy | 50-75% | Active warps/max | +| Warp Stall (No Instruction) | <5% | Instruction cache | +| Warp Stall (Barrier) | <10% | Sync overhead | +| Warp Stall (Memory) | <15% | Latency hiding | +| Register Spilling | 0 | No LMEM usage | + +### Expected Stall Breakdown (optimized) +``` +Not Selected: 40% (normal scheduling) +Wait: 25% (barrier/fence) +Math Pipe: 20% (FMA throughput) +Memory: 10% (unavoidable) +Other: 5% +``` + +## 6. Implementation Checklist + +- [ ] cp.async infrastructure with proper barriers +- [ ] 4-stage pipeline state machine +- [ ] Efficient shared memory indexing +- [ ] Bank-conflict-free access patterns +- [ ] Vectorized loads where possible (float4) +- [ ] Optimal thread-to-data mapping +- [ ] Epilogue handling for non-multiple sizes +- [ ] Integration with existing dispatch diff --git a/tests/test_3090ti_performance.py b/tests/test_3090ti_performance.py index 8f3d2de..f5b484d 100644 --- a/tests/test_3090ti_performance.py +++ b/tests/test_3090ti_performance.py @@ -19,15 +19,16 @@ import numpy as np import pytest -# Setup CUDA DLL path +# Setup CUDA DLL path (if CUDA is installed) cuda_path = os.environ.get( "CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" ) cuda_bin = os.path.join(cuda_path, "bin") -if cuda_bin not in os.environ.get("PATH", ""): - os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") -if hasattr(os, "add_dll_directory"): - os.add_dll_directory(cuda_bin) +if os.path.isdir(cuda_bin): + if cuda_bin not in os.environ.get("PATH", ""): + os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") + if hasattr(os, "add_dll_directory"): + os.add_dll_directory(cuda_bin) # Skip if native module not available try: From 4b9b2f2159d72189ee4d5e983dfb0f4ee91f41be Mon Sep 17 00:00:00 2001 From: m96-chan Date: Sat, 13 Dec 2025 15:00:36 +0900 Subject: [PATCH 7/8] perf(ci): add CUDA and ccache caching for cmake-check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Enable GitHub cache for CUDA toolkit downloads - Add ccache for C++/CUDA compilation caching - Should significantly reduce cmake-check time on subsequent runs πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .github/workflows/ci.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1a6ea64..92c18c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,9 +80,18 @@ jobs: - name: Install CUDA Toolkit uses: Jimver/cuda-toolkit@v0.2.19 + id: cuda-toolkit with: cuda: "12.6.0" method: network + use-github-cache: true + use-local-cache: false + + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: cuda-build-${{ runner.os }} + max-size: 500M - name: Configure CMake run: | @@ -91,6 +100,8 @@ jobs: cmake .. 
\ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CUDA_ARCHITECTURES="80;86;89;90" \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -Dpybind11_DIR=$(python -c "import pybind11; print(pybind11.get_cmake_dir())") - name: Build native module From 116d6b535e138cac4a91634d27d7aec07140fb24 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Sat, 13 Dec 2025 15:05:40 +0900 Subject: [PATCH 8/8] chore(release): bump version to 0.2.2 and update README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update performance table with Ampere SGEMM results (18.2 TFLOPS) - Add cp.async pipeline features to README - Mark v0.2.1 and v0.2.2 as released in roadmap πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- README.md | 48 +++++++++++++++++++++++++++--------------------- pyproject.toml | 2 +- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 95b7880..a2512d8 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,22 @@ PyGPUkit aims to be the "micro-runtime for GPU computing": small, fast, and idea --- -## v0.2 Features (NEW) +## v0.2.2 Features (NEW) + +### Ampere-Optimized SGEMM +| Feature | Description | +|---------|-------------| +| **cp.async Pipeline** | 4-stage software pipeline with async memory transfers | +| **Vectorized Loads** | float4 (16-byte) loads for A and B matrices | +| **Shared Memory Tiling** | BM=128, BN=128, BK=16 with 8x8 thread tiles | +| **SM 80+ Required** | Ampere architecture (RTX 30XX+) required | + +### Performance (RTX 3090 Ti) +| Matrix Size | TFLOPS | Efficiency | vs NumPy | +|-------------|--------|------------|----------| +| 2048x2048 | 7.6 | 19% | 10x | +| 4096x4096 | 13.2 | 33% | 16x | +| 8192x8192 | **18.2** | 46% | **22x** | ### Core Infrastructure (Rust) | Feature | Description | @@ -40,15 +55,6 @@ PyGPUkit aims to be the "micro-runtime for GPU computing": small, fast, and idea | **Pinned Memory** | Page-locked host memory with pooling | | **Kernel Cache** | PTX caching, LRU eviction, TTL | | **GPU Partitioning** | Resource isolation, multi-tenant support | -| **Tiled Matmul** | Shared memory + double buffering | - -### Performance (RTX 3090 Ti) -| Matrix Size | Performance | vs NumPy | -|-------------|-------------|----------| -| 512x512 | 1262 GFLOPS | 11.6x | -| 1024x1024 | 1350 GFLOPS | 2.2x | -| 2048x2048 | 4417 GFLOPS | 6.1x | -| 4096x4096 | **6555 GFLOPS** | 7.9x | --- @@ -320,17 +326,17 @@ PyGPUkit/ - [x] Tiled Matmul (shared memory) - [x] 106 Rust tests -### **v0.2.1 β€” Stabilization Phase** -- [ ] Admission / QoS spec finalization -- [ ] Python API inconsistency fixes -- [ ] Rust error propagation unification -- [ ] 24h stress test script - -### **v0.2.2 β€” Performance Phase** -- [ ] 64x64 tile kernel refinement -- [ ] TensorCore (Ampere+) availability check -- [ ] Pinned Memory fragmentation test -- [ ] Async Engine 3-stream support +### **v0.2.1 β€” Stabilization Phase (Released)** +- [x] Admission / QoS spec finalization +- [x] Python API inconsistency fixes +- [x] Rust error propagation unification + +### **v0.2.2 β€” Performance Phase (Released)** +- [x] Ampere-optimized SGEMM with cp.async pipeline +- [x] 4-stage software pipelining for latency hiding +- [x] float4 vectorized memory loads +- [x] 18.2 TFLOPS on RTX 3090 Ti (46% efficiency) +- [x] SM 80+ (Ampere) architecture requirement ### **v0.2.3 β€” Reliability Phase** - [ ] Kernel cache LRU completion diff --git a/pyproject.toml b/pyproject.toml 
index 62a84c7..450f926 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "PyGPUkit" -version = "0.2.0" +version = "0.2.2" description = "A lightweight GPU runtime for Python with Rust-powered scheduler, NVRTC JIT compilation, and NumPy-like API" readme = "README.md" license = "MIT"