From 616abb27bb3f119de47017684cb742e1da36b9ea Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 30 Dec 2025 23:41:34 +0900 Subject: [PATCH 1/5] feat(benchmark): Add unified benchmark suite (#163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add modular benchmark suite with: - BenchmarkSuite class for unified benchmark orchestration - GEMM benchmarks (fp32, tf32, bf16, fp16, fp8) - GEMV benchmarks (bf16, fp8, nvf4, int4, w8a8) - Attention benchmarks (SDPA, GQA) - JSON export and baseline comparison - Regression detection with configurable threshold - CLI interface: python -m pygpukit.benchmark Usage: from pygpukit.benchmark import BenchmarkSuite suite = BenchmarkSuite() suite.add_gemm().add_gemv() report = suite.run() report.save("baseline.json") # Compare with baseline comparison = suite.compare("baseline.json") if comparison.has_regression(threshold=0.05): raise RuntimeError("Regression detected!") Closes #163 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/benchmark/__init__.py | 271 ++++++++++++++++++++++++++++ src/pygpukit/benchmark/__main__.py | 7 + src/pygpukit/benchmark/attention.py | 159 ++++++++++++++++ src/pygpukit/benchmark/base.py | 137 ++++++++++++++ src/pygpukit/benchmark/cli.py | 233 ++++++++++++++++++++++++ src/pygpukit/benchmark/gemm.py | 214 ++++++++++++++++++++++ src/pygpukit/benchmark/gemv.py | 211 ++++++++++++++++++++++ src/pygpukit/benchmark/results.py | 187 +++++++++++++++++++ 8 files changed, 1419 insertions(+) create mode 100644 src/pygpukit/benchmark/__init__.py create mode 100644 src/pygpukit/benchmark/__main__.py create mode 100644 src/pygpukit/benchmark/attention.py create mode 100644 src/pygpukit/benchmark/base.py create mode 100644 src/pygpukit/benchmark/cli.py create mode 100644 src/pygpukit/benchmark/gemm.py create mode 100644 src/pygpukit/benchmark/gemv.py create mode 100644 src/pygpukit/benchmark/results.py diff --git a/src/pygpukit/benchmark/__init__.py b/src/pygpukit/benchmark/__init__.py new file mode 100644 index 0000000..cc55a4a --- /dev/null +++ b/src/pygpukit/benchmark/__init__.py @@ -0,0 +1,271 @@ +"""PyGPUkit Benchmark Suite. + +Usage: + from pygpukit.benchmark import BenchmarkSuite + + suite = BenchmarkSuite() + suite.add_gemm() + suite.add_gemv() + report = suite.run() + report.save("baseline.json") + + # Compare with baseline + comparison = suite.compare("baseline.json") + if comparison.has_regression(): + raise RuntimeError("Performance regression detected!") +""" + +from __future__ import annotations + +from pathlib import Path + +from .attention import GQABenchmark, SDPABenchmark +from .base import Benchmark, get_gpu_info, measure_kernel +from .gemm import FP8GEMMBenchmark, GEMMBenchmark +from .gemv import GEMVBenchmark, W8A8GEMVBenchmark +from .results import ( + BenchmarkReport, + BenchmarkResult, + ComparisonResult, + GPUInfo, + Regression, + compare_reports, +) + +__all__ = [ + "BenchmarkSuite", + "BenchmarkReport", + "BenchmarkResult", + "ComparisonResult", + "GPUInfo", + "Regression", + "Benchmark", + "GEMMBenchmark", + "FP8GEMMBenchmark", + "GEMVBenchmark", + "W8A8GEMVBenchmark", + "SDPABenchmark", + "GQABenchmark", + "get_gpu_info", + "measure_kernel", + "compare_reports", +] + + +class BenchmarkSuite: + """Unified benchmark suite for PyGPUkit. 
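+
+    Benchmarks are registered fluently (each add_* method returns self) and
+    executed together by run(), which returns a BenchmarkReport.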
+ + Example: + suite = BenchmarkSuite() + suite.add_gemm(sizes=[(4096, 4096, 4096)]) + suite.add_gemv() + report = suite.run() + report.save("results.json") + """ + + def __init__(self, warmup: int = 10, iterations: int = 50, quick: bool = False): + """Initialize benchmark suite. + + Args: + warmup: Number of warmup iterations + iterations: Number of timed iterations + quick: If True, use reduced warmup/iterations + """ + if quick: + warmup = 5 + iterations = 20 + self.warmup = warmup + self.iterations = iterations + self.benchmarks: list[Benchmark] = [] + + def add_gemm( + self, + sizes: list[tuple[int, int, int]] | None = None, + dtypes: list[str] | None = None, + ) -> BenchmarkSuite: + """Add GEMM benchmark. + + Args: + sizes: List of (M, K, N) tuples + dtypes: List of dtypes to benchmark (fp32, tf32, bf16, fp16) + """ + self.benchmarks.append( + GEMMBenchmark( + sizes=sizes, + dtypes=dtypes, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_fp8_gemm( + self, + sizes: list[tuple[int, int, int]] | None = None, + ) -> BenchmarkSuite: + """Add FP8 GEMM benchmark (SM120+).""" + self.benchmarks.append( + FP8GEMMBenchmark( + sizes=sizes, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_gemv( + self, + configs: list[tuple[int, int, str]] | None = None, + dtypes: list[str] | None = None, + ) -> BenchmarkSuite: + """Add GEMV benchmark. + + Args: + configs: List of (K, N, label) tuples + dtypes: List of dtypes (bf16, fp8, nvf4, int4) + """ + self.benchmarks.append( + GEMVBenchmark( + configs=configs, + dtypes=dtypes, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_w8a8_gemv( + self, + configs: list[tuple[int, int, str]] | None = None, + ) -> BenchmarkSuite: + """Add W8A8 GEMV benchmark.""" + self.benchmarks.append( + W8A8GEMVBenchmark( + configs=configs, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_attention( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + head_dim: int = 128, + ) -> BenchmarkSuite: + """Add SDPA benchmark.""" + self.benchmarks.append( + SDPABenchmark( + seq_lens=seq_lens, + num_heads=num_heads, + head_dim=head_dim, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_gqa( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + num_kv_heads: int = 8, + head_dim: int = 128, + ) -> BenchmarkSuite: + """Add GQA benchmark.""" + self.benchmarks.append( + GQABenchmark( + seq_lens=seq_lens, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_all(self) -> BenchmarkSuite: + """Add all available benchmarks with default settings.""" + self.add_gemm() + self.add_gemv() + self.add_attention() + return self + + def run(self, verbose: bool = True) -> BenchmarkReport: + """Run all benchmarks. 
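+
+        Benchmarks execute sequentially; failing cases are caught and reported
+        inside each benchmark's own run() without aborting the whole suite.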
+ + Args: + verbose: If True, print progress + + Returns: + BenchmarkReport with all results + """ + gpu_info = get_gpu_info() + report = BenchmarkReport(gpu=gpu_info) + + if verbose: + print("=" * 60) + print("PyGPUkit Benchmark Suite") + print("=" * 60) + print(f"GPU: {gpu_info.name}") + print(f"SM: {gpu_info.sm_major}.{gpu_info.sm_minor}") + print(f"Memory: {gpu_info.memory_gb:.1f} GB") + print() + + for benchmark in self.benchmarks: + if verbose: + print(f"Running {benchmark.__class__.__name__}...") + + results = benchmark.run() + for result in results: + report.add(result) + if verbose: + tflops_str = f"{result.tflops:.1f} TFLOPS" if result.tflops else "" + print(f" {result.name}: {result.median_us:.1f} us {tflops_str}") + + if verbose: + print() + + return report + + def compare( + self, + baseline_path: str | Path, + threshold: float = 0.05, + verbose: bool = True, + ) -> ComparisonResult: + """Run benchmarks and compare with baseline. + + Args: + baseline_path: Path to baseline JSON file + threshold: Regression threshold (0.05 = 5%) + verbose: If True, print comparison summary + + Returns: + ComparisonResult + """ + current = self.run(verbose=verbose) + baseline = BenchmarkReport.load(baseline_path) + comparison = compare_reports(current, baseline, threshold=threshold) + + if verbose: + print(comparison.summary()) + + return comparison + + +def run_quick() -> BenchmarkReport: + """Run quick benchmark suite.""" + suite = BenchmarkSuite(quick=True) + suite.add_gemm(sizes=[(4096, 4096, 4096)], dtypes=["bf16"]) + suite.add_gemv(dtypes=["bf16"]) + return suite.run() + + +def run_full() -> BenchmarkReport: + """Run full benchmark suite.""" + suite = BenchmarkSuite() + suite.add_all() + return suite.run() diff --git a/src/pygpukit/benchmark/__main__.py b/src/pygpukit/benchmark/__main__.py new file mode 100644 index 0000000..2ecb328 --- /dev/null +++ b/src/pygpukit/benchmark/__main__.py @@ -0,0 +1,7 @@ +"""Entry point for python -m pygpukit.benchmark.""" + +import sys + +from .cli import main + +sys.exit(main()) diff --git a/src/pygpukit/benchmark/attention.py b/src/pygpukit/benchmark/attention.py new file mode 100644 index 0000000..8741d10 --- /dev/null +++ b/src/pygpukit/benchmark/attention.py @@ -0,0 +1,159 @@ +"""Attention benchmarks.""" + +from __future__ import annotations + +import numpy as np + +from .base import Benchmark +from .results import BenchmarkResult + + +class SDPABenchmark(Benchmark): + """Scaled Dot-Product Attention benchmark.""" + + category = "attention" + + def __init__( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + head_dim: int = 128, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.seq_lens = seq_lens or [512, 1024, 2048, 4096] + self.num_heads = num_heads + self.head_dim = head_dim + + def run(self) -> list[BenchmarkResult]: + """Run SDPA benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results = [] + + for seq_len in self.seq_lens: + try: + result = self._benchmark_sdpa(native, seq_len) + if result: + results.append(result) + except Exception as e: + print(f" SDPA seq_len={seq_len}: ERROR - {e}") + + return results + + def _benchmark_sdpa( + self, + native: object, + seq_len: int, + ) -> BenchmarkResult | None: + """Benchmark SDPA for a given sequence length.""" + import pygpukit as gk + + name = f"sdpa_seq{seq_len}" + params = { + "seq_len": seq_len, + "num_heads": self.num_heads, + "head_dim": self.head_dim, + } + + # 
Attention FLOPs: 4 * seq_len^2 * head_dim * num_heads + # (Q@K^T and attn@V, each 2*seq*seq*dim) + flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads + + # Create Q, K, V + Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + K = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + V = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + + # Check if native SDPA available + if not hasattr(native, "sdpa_causal_bf16"): + return None + + def run_fn() -> None: + native.sdpa_causal_bf16( + Q._get_native(), + K._get_native(), + V._get_native(), + O._get_native(), + ) + + return self._measure(name, run_fn, params, flops=flops) + + +class GQABenchmark(Benchmark): + """Grouped Query Attention benchmark.""" + + category = "attention" + + def __init__( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + num_kv_heads: int = 8, + head_dim: int = 128, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.seq_lens = seq_lens or [512, 1024, 2048] + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + + def run(self) -> list[BenchmarkResult]: + """Run GQA benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results = [] + + for seq_len in self.seq_lens: + try: + result = self._benchmark_gqa(native, seq_len) + if result: + results.append(result) + except Exception as e: + print(f" GQA seq_len={seq_len}: ERROR - {e}") + + return results + + def _benchmark_gqa( + self, + native: object, + seq_len: int, + ) -> BenchmarkResult | None: + """Benchmark GQA.""" + import pygpukit as gk + + name = f"gqa_seq{seq_len}" + params = { + "seq_len": seq_len, + "num_heads": self.num_heads, + "num_kv_heads": self.num_kv_heads, + "head_dim": self.head_dim, + } + + # GQA FLOPs (KV heads broadcasted) + flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads + + Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + K = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + V = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + + if not hasattr(native, "sdpa_causal_gqa_bf16"): + return None + + def run_fn() -> None: + native.sdpa_causal_gqa_bf16( + Q._get_native(), + K._get_native(), + V._get_native(), + O._get_native(), + self.num_heads // self.num_kv_heads, + ) + + return self._measure(name, run_fn, params, flops=flops) diff --git a/src/pygpukit/benchmark/base.py b/src/pygpukit/benchmark/base.py new file mode 100644 index 0000000..e5629fc --- /dev/null +++ b/src/pygpukit/benchmark/base.py @@ -0,0 +1,137 @@ +"""Base benchmark class and utilities.""" + +from __future__ import annotations + +import time +from abc import ABC, abstractmethod +from typing import Any, Callable + +import numpy as np + +from .results import BenchmarkResult, GPUInfo + + +def get_gpu_info() -> GPUInfo: + """Get GPU information.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + props = native.get_device_properties(0) + + return GPUInfo( + name=props.name, + sm_major=props.compute_capability_major, + sm_minor=props.compute_capability_minor, + memory_gb=props.total_memory / (1024**3), + ) + + +def measure_kernel( + fn: Callable[[], Any], + warmup: int = 10, + 
iterations: int = 50, + sync_fn: Callable[[], None] | None = None, +) -> tuple[float, float, float, float]: + """Measure kernel execution time. + + Args: + fn: Function to benchmark + warmup: Number of warmup iterations + iterations: Number of timed iterations + sync_fn: Optional sync function (e.g., device_synchronize) + + Returns: + (median_us, min_us, max_us, std_us) + """ + if sync_fn is None: + from pygpukit.core.backend import get_native_module + + native = get_native_module() + sync_fn = native.device_synchronize + + # Warmup + for _ in range(warmup): + fn() + sync_fn() + + # Benchmark + times = [] + for _ in range(iterations): + sync_fn() + start = time.perf_counter() + fn() + sync_fn() + end = time.perf_counter() + times.append((end - start) * 1e6) # Convert to microseconds + + times_arr = np.array(times) + return ( + float(np.median(times_arr)), + float(np.min(times_arr)), + float(np.max(times_arr)), + float(np.std(times_arr)), + ) + + +class Benchmark(ABC): + """Abstract base class for benchmarks.""" + + category: str = "unknown" + warmup: int = 10 + iterations: int = 50 + + def __init__(self, warmup: int | None = None, iterations: int | None = None): + if warmup is not None: + self.warmup = warmup + if iterations is not None: + self.iterations = iterations + + @abstractmethod + def run(self) -> list[BenchmarkResult]: + """Run the benchmark and return results.""" + pass + + def _measure( + self, + name: str, + fn: Callable[[], Any], + params: dict[str, Any], + flops: float | None = None, + bytes_moved: float | None = None, + check_fn: Callable[[], tuple[bool, float]] | None = None, + ) -> BenchmarkResult: + """Measure a single benchmark case.""" + median_us, min_us, max_us, std_us = measure_kernel( + fn, warmup=self.warmup, iterations=self.iterations + ) + + # Calculate TFLOPS if flops provided + tflops = None + if flops is not None and median_us > 0: + tflops = flops / median_us / 1e6 # TFLOPS = flops / us / 1e6 + + # Calculate bandwidth if bytes provided + bandwidth = None + if bytes_moved is not None and median_us > 0: + bandwidth = bytes_moved / median_us / 1e3 # GB/s = bytes / us / 1e3 + + # Check correctness + correct = True + rel_error = 0.0 + if check_fn is not None: + correct, rel_error = check_fn() + + return BenchmarkResult( + name=name, + category=self.category, + params=params, + median_us=median_us, + min_us=min_us, + max_us=max_us, + std_us=std_us, + tflops=tflops, + bandwidth_gbps=bandwidth, + correct=correct, + rel_error=rel_error, + iterations=self.iterations, + ) diff --git a/src/pygpukit/benchmark/cli.py b/src/pygpukit/benchmark/cli.py new file mode 100644 index 0000000..5c0390d --- /dev/null +++ b/src/pygpukit/benchmark/cli.py @@ -0,0 +1,233 @@ +"""CLI interface for benchmark suite.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from . 
import BenchmarkReport, BenchmarkSuite + + +def main() -> int: + """Main entry point for benchmark CLI.""" + parser = argparse.ArgumentParser( + description="PyGPUkit Benchmark Suite", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python -m pygpukit.benchmark # Run default benchmarks + python -m pygpukit.benchmark --quick # Quick benchmarks + python -m pygpukit.benchmark --save results.json + python -m pygpukit.benchmark --compare baseline.json + python -m pygpukit.benchmark --gemm --sizes 4096,8192 + python -m pygpukit.benchmark --gemv --dtypes bf16,fp8 +""", + ) + + # Output options + parser.add_argument( + "--save", + type=str, + metavar="FILE", + help="Save results to JSON file", + ) + parser.add_argument( + "--compare", + type=str, + metavar="FILE", + help="Compare with baseline JSON file", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.05, + help="Regression threshold (default: 0.05 = 5%%)", + ) + parser.add_argument( + "--fail-on-regression", + action="store_true", + help="Exit with code 1 if regression detected", + ) + + # Benchmark selection + parser.add_argument( + "--all", + action="store_true", + help="Run all benchmarks", + ) + parser.add_argument( + "--gemm", + action="store_true", + help="Run GEMM benchmarks", + ) + parser.add_argument( + "--gemv", + action="store_true", + help="Run GEMV benchmarks", + ) + parser.add_argument( + "--attention", + action="store_true", + help="Run attention benchmarks", + ) + parser.add_argument( + "--fp8", + action="store_true", + help="Include FP8 benchmarks (SM120+)", + ) + + # Benchmark parameters + parser.add_argument( + "--sizes", + type=str, + help="GEMM sizes: comma-separated (e.g., 2048,4096,8192)", + ) + parser.add_argument( + "--dtypes", + type=str, + help="Dtypes: comma-separated (e.g., fp32,tf32,bf16)", + ) + parser.add_argument( + "--seq-lens", + type=str, + help="Attention seq lengths: comma-separated (e.g., 512,1024,2048)", + ) + + # Performance options + parser.add_argument( + "--quick", + action="store_true", + help="Quick mode: fewer iterations", + ) + parser.add_argument( + "--warmup", + type=int, + default=10, + help="Warmup iterations (default: 10)", + ) + parser.add_argument( + "--iterations", + type=int, + default=50, + help="Benchmark iterations (default: 50)", + ) + + # Output format + parser.add_argument( + "--quiet", + action="store_true", + help="Suppress progress output", + ) + parser.add_argument( + "--markdown", + action="store_true", + help="Output results as markdown table", + ) + + args = parser.parse_args() + + # Create suite + suite = BenchmarkSuite( + warmup=args.warmup, + iterations=args.iterations, + quick=args.quick, + ) + + # Parse sizes + sizes = None + if args.sizes: + size_list = [int(s.strip()) for s in args.sizes.split(",")] + sizes = [(s, s, s) for s in size_list] # Square matrices + + # Parse dtypes + dtypes = None + if args.dtypes: + dtypes = [d.strip() for d in args.dtypes.split(",")] + + # Parse seq lens + seq_lens = None + if args.seq_lens: + seq_lens = [int(s.strip()) for s in args.seq_lens.split(",")] + + # Add benchmarks + if args.all: + suite.add_all() + if args.fp8: + suite.add_fp8_gemm() + suite.add_w8a8_gemv() + else: + # Default: add gemm and gemv if nothing specified + has_selection = args.gemm or args.gemv or args.attention + if not has_selection: + suite.add_gemm(sizes=sizes, dtypes=dtypes) + suite.add_gemv(dtypes=dtypes) + else: + if args.gemm: + suite.add_gemm(sizes=sizes, dtypes=dtypes) + if args.fp8: + 
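+                    # FP8 paths require SM120+; the FP8 benchmarks detect this and skip themselves.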
suite.add_fp8_gemm(sizes=sizes) + if args.gemv: + suite.add_gemv(dtypes=dtypes) + if args.fp8: + suite.add_w8a8_gemv() + if args.attention: + suite.add_attention(seq_lens=seq_lens) + suite.add_gqa(seq_lens=seq_lens) + + # Run benchmarks + verbose = not args.quiet + if args.compare: + comparison = suite.compare( + args.compare, + threshold=args.threshold, + verbose=verbose, + ) + if args.fail_on_regression and comparison.has_regression(args.threshold): + print("\nERROR: Performance regression detected!") + return 1 + report = comparison.current + else: + report = suite.run(verbose=verbose) + + # Save results + if args.save: + report.save(args.save) + if verbose: + print(f"Results saved to {args.save}") + + # Print markdown table + if args.markdown: + print_markdown_table(report) + + return 0 + + +def print_markdown_table(report: BenchmarkReport) -> None: + """Print results as markdown table.""" + + print("\n## Benchmark Results\n") + print(f"GPU: {report.gpu.name}") + print(f"SM: {report.gpu.sm_major}.{report.gpu.sm_minor}") + print() + + # Group by category + by_category: dict[str, list] = {} + for r in report.results: + if r.category not in by_category: + by_category[r.category] = [] + by_category[r.category].append(r) + + for category, results in by_category.items(): + print(f"### {category.upper()}\n") + print("| Name | Time (us) | TFLOPS | Correct |") + print("|------|-----------|--------|---------|") + for r in results: + tflops = f"{r.tflops:.1f}" if r.tflops else "-" + correct = "Yes" if r.correct else "No" + print(f"| {r.name} | {r.median_us:.1f} | {tflops} | {correct} |") + print() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/pygpukit/benchmark/gemm.py b/src/pygpukit/benchmark/gemm.py new file mode 100644 index 0000000..2e0cd91 --- /dev/null +++ b/src/pygpukit/benchmark/gemm.py @@ -0,0 +1,214 @@ +"""GEMM (General Matrix Multiply) benchmarks.""" + +from __future__ import annotations + +from typing import Literal + +import numpy as np + +from .base import Benchmark +from .results import BenchmarkResult + + +class GEMMBenchmark(Benchmark): + """GEMM benchmark for various dtypes and sizes.""" + + category = "gemm" + + def __init__( + self, + sizes: list[tuple[int, int, int]] | None = None, + dtypes: list[str] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.sizes = sizes or [ + (2048, 2048, 2048), + (4096, 4096, 4096), + (8192, 8192, 8192), + ] + self.dtypes = dtypes or ["fp32", "tf32", "bf16"] + + def run(self) -> list[BenchmarkResult]: + """Run GEMM benchmarks.""" + results: list[BenchmarkResult] = [] + + for dtype in self.dtypes: + for M, K, N in self.sizes: + try: + result = self._benchmark_gemm(dtype, M, K, N) + if result: + results.append(result) + except Exception as e: + print(f" GEMM {dtype} {M}x{K}x{N}: ERROR - {e}") + + return results + + def _benchmark_gemm( + self, + dtype: str, + M: int, + K: int, + N: int, + ) -> BenchmarkResult | None: + """Benchmark single GEMM configuration.""" + import os + + from pygpukit.core.backend import get_native_module + + native = get_native_module() + + name = f"gemm_{dtype}_{M}x{K}x{N}" + params = {"dtype": dtype, "M": M, "K": K, "N": N} + flops = 2.0 * M * K * N + + if dtype == "fp32": + os.environ.pop("PYGPUKIT_ALLOW_TF32", None) + A = np.random.randn(M, K).astype(np.float32) * 0.1 + B = np.random.randn(K, N).astype(np.float32) * 0.1 + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + + def run_fn() -> None: + 
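+                # Strict FP32 path: PYGPUKIT_ALLOW_TF32 was popped above; the
+                # returned output array is intentionally discarded during timing.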
native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + C_gpu = native.matmul(A_gpu, B_gpu) + C = C_gpu.to_numpy() + C_ref = A @ B + err = float(np.max(np.abs(C - C_ref)) / (np.max(np.abs(C_ref)) + 1e-8)) + return err < 1e-3, err + + elif dtype == "tf32": + os.environ["PYGPUKIT_ALLOW_TF32"] = "1" + os.environ["PYGPUKIT_TF32_V2"] = "1" + A = np.random.randn(M, K).astype(np.float32) * 0.1 + B = np.random.randn(K, N).astype(np.float32) * 0.1 + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + + def run_fn() -> None: + native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + C_gpu = native.matmul(A_gpu, B_gpu) + C = C_gpu.to_numpy() + C_ref = A @ B + err = float(np.max(np.abs(C - C_ref)) / (np.max(np.abs(C_ref)) + 1e-8)) + return err < 0.01, err + + elif dtype == "bf16": + import pygpukit as gk + + A = np.random.randn(M, K).astype(np.float32) * 0.1 + B = np.random.randn(K, N).astype(np.float32) * 0.1 + A_gpu = gk.from_numpy(A).astype(gk.bfloat16)._get_native() + B_gpu = gk.from_numpy(B).astype(gk.bfloat16)._get_native() + + def run_fn() -> None: + native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + import pygpukit as gk + + C_gpu = native.matmul(A_gpu, B_gpu) + C = gk.GPUArray._wrap_native(C_gpu).astype(gk.float32).to_numpy() + C_ref = A @ B + err = float(np.max(np.abs(C - C_ref)) / (np.max(np.abs(C_ref)) + 1e-8)) + return err < 0.05, err + + elif dtype == "fp16": + A = np.random.randn(M, K).astype(np.float16) * 0.1 + B = np.random.randn(K, N).astype(np.float16) * 0.1 + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + + def run_fn() -> None: + native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + C_gpu = native.matmul(A_gpu, B_gpu) + C = C_gpu.to_numpy().astype(np.float32) + C_ref = (A.astype(np.float32) @ B.astype(np.float32)).astype(np.float16) + err = float( + np.max(np.abs(C - C_ref.astype(np.float32))) + / (np.max(np.abs(C_ref.astype(np.float32))) + 1e-8) + ) + return err < 0.05, err + + else: + return None + + return self._measure(name, run_fn, params, flops=flops, check_fn=check_fn) + + +class FP8GEMMBenchmark(Benchmark): + """FP8 GEMM benchmark (SM120+).""" + + category = "gemm" + + def __init__( + self, + sizes: list[tuple[int, int, int]] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.sizes = sizes or [ + (1024, 4096, 14336), + (2048, 4096, 14336), + (4096, 4096, 14336), + ] + + def run(self) -> list[BenchmarkResult]: + """Run FP8 GEMM benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results: list[BenchmarkResult] = [] + + # Check SM120 availability + props = native.get_device_properties(0) + sm = props.compute_capability_major * 10 + props.compute_capability_minor + if sm < 120: + print(f" FP8 GEMM: Requires SM120+ (current: SM{sm})") + return results + + for M, K, N in self.sizes: + try: + result = self._benchmark_fp8_gemm(native, M, K, N) + if result: + results.append(result) + except Exception as e: + print(f" FP8 GEMM {M}x{K}x{N}: ERROR - {e}") + + return results + + def _benchmark_fp8_gemm( + self, + native: object, + M: int, + K: int, + N: int, + ) -> BenchmarkResult | None: + """Benchmark FP8 GEMM.""" + from pygpukit.core import from_numpy + + name = f"gemm_fp8_{M}x{K}x{N}" + params = {"dtype": "fp8", "M": M, "K": K, "N": N} + flops = 2.0 * M * K * N + + A_fp8 = from_numpy(np.random.randint(0, 256, (M, K), dtype=np.uint8)) + B_fp8 = 
from_numpy(np.random.randint(0, 256, (K, N), dtype=np.uint8)) + C_fp8 = from_numpy(np.zeros((M, N), dtype=np.uint8)) + + # Try v5 (cached) kernel + func = getattr(native, "gemm_fp8_fp8_sm120_v5", None) + if func is None: + return None + + def run_fn() -> None: + func(A_fp8._get_native(), B_fp8._get_native(), C_fp8._get_native()) + + return self._measure(name, run_fn, params, flops=flops) diff --git a/src/pygpukit/benchmark/gemv.py b/src/pygpukit/benchmark/gemv.py new file mode 100644 index 0000000..4f2adb3 --- /dev/null +++ b/src/pygpukit/benchmark/gemv.py @@ -0,0 +1,211 @@ +"""GEMV (General Matrix-Vector) benchmarks.""" + +from __future__ import annotations + +import numpy as np + +from .base import Benchmark +from .results import BenchmarkResult + + +# LLM-relevant GEMV configurations +LLM_CONFIGS = [ + # (K, N, label) + (4096, 4096, "7B_hidden"), + (4096, 14336, "7B_mlp_up"), + (14336, 4096, "7B_mlp_down"), + (8192, 8192, "72B_hidden"), + (8192, 29568, "72B_mlp_up"), + (29568, 8192, "72B_mlp_down"), +] + + +class GEMVBenchmark(Benchmark): + """GEMV benchmark for LLM decode (M=1).""" + + category = "gemv" + + def __init__( + self, + configs: list[tuple[int, int, str]] | None = None, + dtypes: list[str] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.configs = configs or LLM_CONFIGS + self.dtypes = dtypes or ["bf16", "fp8", "nvf4"] + + def run(self) -> list[BenchmarkResult]: + """Run GEMV benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results: list[BenchmarkResult] = [] + + for K, N, label in self.configs: + for dtype in self.dtypes: + try: + result = self._benchmark_gemv(native, K, N, label, dtype) + if result: + results.append(result) + except Exception as e: + print(f" GEMV {dtype} {label}: ERROR - {e}") + + return results + + def _benchmark_gemv( + self, + native: object, + K: int, + N: int, + label: str, + dtype: str, + ) -> BenchmarkResult | None: + """Benchmark single GEMV configuration.""" + import pygpukit as gk + from pygpukit.core import from_numpy + + name = f"gemv_{dtype}_{label}" + params = {"dtype": dtype, "K": K, "N": N, "label": label} + flops = 2.0 * K * N # M=1 + + if dtype == "bf16": + from pygpukit.ops.matmul import gemv_bf16 + + A = gk.empty((K,), dtype="bfloat16") + B = gk.empty((K, N), dtype="bfloat16") + C = gk.empty((N,), dtype="bfloat16") + + def run_fn() -> None: + gemv_bf16(A, B, out=C) + + elif dtype == "fp8": + from pygpukit.ops.matmul import fp8_init_lut, gemv_fp8_bf16 + + fp8_init_lut() + A = gk.empty((K,), dtype="bfloat16") + B_fp8 = from_numpy(np.zeros((N, K), dtype=np.uint8)) + n_blocks = (N + 127) // 128 + k_blocks = (K + 127) // 128 + B_scale = from_numpy(np.ones((n_blocks, k_blocks), dtype=np.float16).view(np.uint16)) + C = gk.empty((N,), dtype="bfloat16") + + def run_fn() -> None: + gemv_fp8_bf16(A, B_fp8, B_scale, out=C) + + elif dtype == "nvf4": + from pygpukit.ops.matmul import gemv_nvf4_available, gemv_nvf4_bf16 + + if not gemv_nvf4_available(): + return None + + A = gk.empty((K,), dtype="bfloat16") + B_nvf4 = from_numpy(np.zeros((K // 2, N), dtype=np.uint8)) + k_scale_blocks = (K + 31) // 32 + B_scale = from_numpy(np.ones((k_scale_blocks, N), dtype=np.uint8)) + C = gk.empty((N,), dtype="bfloat16") + + def run_fn() -> None: + gemv_nvf4_bf16(A, B_nvf4, B_scale, out=C) + + elif dtype == "int4": + if not hasattr(native, "int4_gemv_available") or not native.int4_gemv_available(): + return None + + def 
pack_int4(values: np.ndarray) -> np.ndarray: + flat = values.reshape(-1) + low = flat[0::2].astype(np.int32) & 0x0F + high = flat[1::2].astype(np.int32) & 0x0F + packed = (high << 4) | low + new_shape = list(values.shape) + new_shape[-1] //= 2 + return packed.astype(np.uint8).reshape(new_shape) + + A_raw = np.random.randint(-8, 8, K, dtype=np.int8) + B_raw = np.random.randint(-8, 8, (N, K), dtype=np.int8) + A_int4 = from_numpy(pack_int4(A_raw.reshape(1, -1)).reshape(-1)) + B_int4 = from_numpy(pack_int4(B_raw)) + C_int4 = from_numpy(np.zeros(N, dtype=np.int32)) + + def run_fn() -> None: + native.int4_gemv_int32_sm120( + A_int4._get_native(), B_int4._get_native(), C_int4._get_native() + ) + + else: + return None + + return self._measure(name, run_fn, params, flops=flops) + + +class W8A8GEMVBenchmark(Benchmark): + """W8A8 (FP8 weights, FP8 activations) GEMV benchmark.""" + + category = "gemv" + + def __init__( + self, + configs: list[tuple[int, int, str]] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.configs = configs or LLM_CONFIGS[:3] # Smaller set + + def run(self) -> list[BenchmarkResult]: + """Run W8A8 GEMV benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results: list[BenchmarkResult] = [] + + # Check availability + if not hasattr(native, "gemv_fp8_fp8_available") or not native.gemv_fp8_fp8_available(): + print(" W8A8 GEMV: Not available") + return results + + for K, N, label in self.configs: + try: + result = self._benchmark_w8a8(native, K, N, label) + if result: + results.append(result) + except Exception as e: + print(f" W8A8 GEMV {label}: ERROR - {e}") + + return results + + def _benchmark_w8a8( + self, + native: object, + K: int, + N: int, + label: str, + ) -> BenchmarkResult | None: + """Benchmark W8A8 GEMV.""" + from pygpukit.core import from_numpy, zeros + + name = f"gemv_w8a8_{label}" + params = {"dtype": "w8a8", "K": K, "N": N, "label": label} + flops = 2.0 * K * N + + block_size = 128 + n_scales_k = (K + block_size - 1) // block_size + n_scales_n = (N + block_size - 1) // block_size + + A_fp8 = from_numpy(np.random.randint(0, 256, K, dtype=np.uint8)) + B_fp8 = from_numpy(np.random.randint(0, 256, (N, K), dtype=np.uint8)) + scale_A = from_numpy(np.ones(n_scales_k, dtype=np.float32)) + scale_B = from_numpy(np.ones(n_scales_n * n_scales_k, dtype=np.float32)) + C = zeros((N,), dtype="bfloat16") + + def run_fn() -> None: + native.gemv_fp8_fp8_bf16_sm120( + A_fp8._get_native(), + B_fp8._get_native(), + scale_A._get_native(), + scale_B._get_native(), + C._get_native(), + ) + + return self._measure(name, run_fn, params, flops=flops) diff --git a/src/pygpukit/benchmark/results.py b/src/pygpukit/benchmark/results.py new file mode 100644 index 0000000..268f7dc --- /dev/null +++ b/src/pygpukit/benchmark/results.py @@ -0,0 +1,187 @@ +"""Benchmark result classes and comparison utilities.""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any + + +@dataclass +class BenchmarkResult: + """Single benchmark result.""" + + name: str + category: str # gemm, gemv, attention, inference + params: dict[str, Any] # M, K, N, dtype, etc. 
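+    # All *_us fields are wall-clock microseconds across `iterations` runs;
+    # tflops and bandwidth_gbps are derived from median_us in Benchmark._measure().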
+ median_us: float # Median time in microseconds + min_us: float + max_us: float + std_us: float + tflops: float | None = None # For compute benchmarks + bandwidth_gbps: float | None = None # For memory benchmarks + correct: bool = True + rel_error: float = 0.0 + iterations: int = 0 + timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) + + def to_dict(self) -> dict: + return asdict(self) + + @classmethod + def from_dict(cls, d: dict) -> BenchmarkResult: + return cls(**d) + + +@dataclass +class GPUInfo: + """GPU information.""" + + name: str + sm_major: int + sm_minor: int + memory_gb: float + driver_version: str = "" + cuda_version: str = "" + + def to_dict(self) -> dict: + return asdict(self) + + +@dataclass +class BenchmarkReport: + """Complete benchmark report with multiple results.""" + + gpu: GPUInfo + results: list[BenchmarkResult] = field(default_factory=list) + timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) + version: str = "0.2.18" + + def add(self, result: BenchmarkResult) -> None: + self.results.append(result) + + def save(self, path: str | Path) -> None: + """Save report to JSON file.""" + path = Path(path) + data = { + "version": self.version, + "timestamp": self.timestamp, + "gpu": self.gpu.to_dict(), + "results": [r.to_dict() for r in self.results], + } + path.write_text(json.dumps(data, indent=2)) + + @classmethod + def load(cls, path: str | Path) -> BenchmarkReport: + """Load report from JSON file.""" + path = Path(path) + data = json.loads(path.read_text()) + gpu = GPUInfo(**data["gpu"]) + results = [BenchmarkResult.from_dict(r) for r in data["results"]] + return cls( + gpu=gpu, + results=results, + timestamp=data.get("timestamp", ""), + version=data.get("version", "unknown"), + ) + + def to_dict(self) -> dict: + return { + "version": self.version, + "timestamp": self.timestamp, + "gpu": self.gpu.to_dict(), + "results": [r.to_dict() for r in self.results], + } + + +@dataclass +class Regression: + """Regression information.""" + + result: BenchmarkResult + baseline: BenchmarkResult + delta_percent: float # Negative = regression + + +@dataclass +class ComparisonResult: + """Result of comparing two benchmark reports.""" + + current: BenchmarkReport + baseline: BenchmarkReport + regressions: list[Regression] = field(default_factory=list) + improvements: list[Regression] = field(default_factory=list) + missing: list[str] = field(default_factory=list) # In baseline but not current + new: list[str] = field(default_factory=list) # In current but not baseline + + def has_regression(self, threshold: float = 0.05) -> bool: + """Check if any regression exceeds threshold.""" + return any(r.delta_percent < -threshold * 100 for r in self.regressions) + + def summary(self) -> str: + """Generate comparison summary.""" + lines = [] + lines.append("=" * 60) + lines.append("Benchmark Comparison") + lines.append("=" * 60) + lines.append(f"Baseline: {self.baseline.timestamp}") + lines.append(f"Current: {self.current.timestamp}") + lines.append("") + + if self.regressions: + lines.append("REGRESSIONS:") + for r in sorted(self.regressions, key=lambda x: x.delta_percent): + lines.append( + f" {r.result.name}: {r.baseline.median_us:.1f} -> " + f"{r.result.median_us:.1f} us ({r.delta_percent:+.1f}%)" + ) + lines.append("") + + if self.improvements: + lines.append("IMPROVEMENTS:") + for r in sorted(self.improvements, key=lambda x: -x.delta_percent)[:5]: + lines.append( + f" {r.result.name}: {r.baseline.median_us:.1f} -> " + 
f"{r.result.median_us:.1f} us ({r.delta_percent:+.1f}%)" + ) + lines.append("") + + return "\n".join(lines) + + +def compare_reports( + current: BenchmarkReport, + baseline: BenchmarkReport, + threshold: float = 0.05, +) -> ComparisonResult: + """Compare two benchmark reports.""" + result = ComparisonResult(current=current, baseline=baseline) + + # Build lookup by name+params + def key(r: BenchmarkResult) -> str: + params_str = json.dumps(r.params, sort_keys=True) + return f"{r.category}:{r.name}:{params_str}" + + baseline_map = {key(r): r for r in baseline.results} + current_map = {key(r): r for r in current.results} + + for k, curr in current_map.items(): + if k in baseline_map: + base = baseline_map[k] + if base.median_us > 0: + delta = (curr.median_us - base.median_us) / base.median_us * 100 + reg = Regression(result=curr, baseline=base, delta_percent=-delta) + if delta > threshold * 100: + result.regressions.append(reg) + elif delta < -threshold * 100: + result.improvements.append(reg) + else: + result.new.append(k) + + for k in baseline_map: + if k not in current_map: + result.missing.append(k) + + return result From 4e71a9850f115185c2024897cf5289fe66c95e2f Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 30 Dec 2025 23:57:07 +0900 Subject: [PATCH 2/5] fix(benchmark): correct GEMV B matrix layout [N,K] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/benchmark/gemv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pygpukit/benchmark/gemv.py b/src/pygpukit/benchmark/gemv.py index 4f2adb3..695ee9d 100644 --- a/src/pygpukit/benchmark/gemv.py +++ b/src/pygpukit/benchmark/gemv.py @@ -74,7 +74,7 @@ def _benchmark_gemv( from pygpukit.ops.matmul import gemv_bf16 A = gk.empty((K,), dtype="bfloat16") - B = gk.empty((K, N), dtype="bfloat16") + B = gk.empty((N, K), dtype="bfloat16") # B[N, K] layout for gemv C = gk.empty((N,), dtype="bfloat16") def run_fn() -> None: From 68a1f7230fc901033eb286d2086993edae2d6fde Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 30 Dec 2025 23:59:33 +0900 Subject: [PATCH 3/5] chore: remove old benchmark files replaced by unified suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deleted: - scripts/benchmark.py - benchmarks/benchmark_gemv_*.py - benchmarks/benchmark_nvf4_*.py - benchmarks/benchmark_w8a16_gemm.py - examples/benchmark_*.py Use 'python -m pygpukit.benchmark' instead. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- benchmarks/benchmark_gemv_all.py | 213 ---------- benchmarks/benchmark_gemv_detailed.py | 165 -------- benchmarks/benchmark_nvf4_bf16.py | 145 ------- benchmarks/benchmark_nvf4_nvf4.py | 113 ------ benchmarks/benchmark_w8a16_gemm.py | 101 ----- examples/benchmark_compare.py | 98 ----- examples/benchmark_large.py | 38 -- examples/benchmark_matmul.py | 137 ------- examples/benchmark_tiled_matmul.py | 93 ----- scripts/benchmark.py | 538 -------------------------- 10 files changed, 1641 deletions(-) delete mode 100644 benchmarks/benchmark_gemv_all.py delete mode 100644 benchmarks/benchmark_gemv_detailed.py delete mode 100644 benchmarks/benchmark_nvf4_bf16.py delete mode 100644 benchmarks/benchmark_nvf4_nvf4.py delete mode 100644 benchmarks/benchmark_w8a16_gemm.py delete mode 100644 examples/benchmark_compare.py delete mode 100644 examples/benchmark_large.py delete mode 100644 examples/benchmark_matmul.py delete mode 100644 examples/benchmark_tiled_matmul.py delete mode 100644 scripts/benchmark.py diff --git a/benchmarks/benchmark_gemv_all.py b/benchmarks/benchmark_gemv_all.py deleted file mode 100644 index 2275525..0000000 --- a/benchmarks/benchmark_gemv_all.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive GEMV Benchmark for README.md - -All GEMV kernels with LLM-relevant sizes, reporting in microseconds. -""" - -import time - -import numpy as np - -import pygpukit as gk -from pygpukit.core import from_numpy -from pygpukit.core.backend import get_native_module - - -def benchmark_gemv_all(): - """Comprehensive GEMV benchmark for all formats.""" - from pygpukit.ops.matmul import ( - fp8_init_lut, - gemv_bf16, - gemv_fp8_bf16, - gemv_nvf4_available, - gemv_nvf4_bf16, - ) - - native = get_native_module() - fp8_init_lut() - - print("=" * 80) - print("Comprehensive GEMV Benchmark (RTX 5090)") - print("=" * 80) - - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print() - - # LLM-relevant configurations - # (K, N) - K is hidden dim, N is output dim - configs = [ - # Qwen-7B style - (4096, 4096, "Qwen-7B hidden"), - (4096, 14336, "Qwen-7B MLP up"), - (14336, 4096, "Qwen-7B MLP down"), - # Qwen-72B style - (8192, 8192, "Qwen-72B hidden"), - (8192, 29568, "Qwen-72B MLP up"), - (29568, 8192, "Qwen-72B MLP down"), - ] - - warmup = 10 - iterations = 50 - - # Results table - results = [] - - for K, N, label in configs: - print(f"\n{label}: K={K}, N={N}") - - # ===== BF16 GEMV ===== - A_bf16 = gk.empty((K,), dtype="bfloat16") - B_bf16 = gk.empty((K, N), dtype="bfloat16") - C_bf16 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - - times_bf16 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - end = time.perf_counter() - times_bf16.append((end - start) * 1e6) - - median_bf16 = np.median(times_bf16) - - # ===== FP8 GEMV ===== - try: - A_fp8 = gk.empty((K,), dtype="bfloat16") - B_fp8_nk = from_numpy(np.zeros((N, K), dtype=np.uint8)) - n_blocks = (N + 127) // 128 - k_blocks = (K + 127) // 128 - B_scale_fp8 = from_numpy( - np.ones((n_blocks, k_blocks), dtype=np.float16).view(np.uint16) - ) - C_fp8 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - - times_fp8 = [] - 
for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - end = time.perf_counter() - times_fp8.append((end - start) * 1e6) - - median_fp8 = np.median(times_fp8) - except Exception: - median_fp8 = float("inf") - - # ===== NVF4 GEMV ===== - if gemv_nvf4_available(): - A_nvf4 = gk.empty((K,), dtype="bfloat16") - B_nvf4 = from_numpy(np.zeros((K // 2, N), dtype=np.uint8)) - k_scale_blocks = (K + 31) // 32 - B_scale_nvf4 = from_numpy(np.ones((k_scale_blocks, N), dtype=np.uint8)) - C_nvf4 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - - times_nvf4 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - end = time.perf_counter() - times_nvf4.append((end - start) * 1e6) - - median_nvf4 = np.median(times_nvf4) - else: - median_nvf4 = float("inf") - - # ===== Int4 GEMV ===== - try: - if native.int4_gemv_available(): - - def pack_int4(values: np.ndarray) -> np.ndarray: - flat = values.reshape(-1) - low = flat[0::2].astype(np.int32) & 0x0F - high = flat[1::2].astype(np.int32) & 0x0F - packed = (high << 4) | low - new_shape = list(values.shape) - new_shape[-1] //= 2 - return packed.astype(np.uint8).reshape(new_shape) - - A_int4_raw = np.random.randint(-8, 8, K, dtype=np.int8) - B_int4_raw = np.random.randint(-8, 8, (N, K), dtype=np.int8) - A_int4 = from_numpy(pack_int4(A_int4_raw.reshape(1, -1)).reshape(-1)) - B_int4 = from_numpy(pack_int4(B_int4_raw)) - C_int4 = from_numpy(np.zeros(N, dtype=np.int32)) - - for _ in range(warmup): - native.int4_gemv_int32_sm120( - A_int4._get_native(), B_int4._get_native(), C_int4._get_native() - ) - native.device_synchronize() - - times_int4 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - native.int4_gemv_int32_sm120( - A_int4._get_native(), B_int4._get_native(), C_int4._get_native() - ) - native.device_synchronize() - end = time.perf_counter() - times_int4.append((end - start) * 1e6) - - median_int4 = np.median(times_int4) - else: - median_int4 = float("inf") - except Exception: - median_int4 = float("inf") - - results.append( - { - "label": label, - "K": K, - "N": N, - "bf16": median_bf16, - "fp8": median_fp8, - "nvf4": median_nvf4, - "int4": median_int4, - } - ) - - print(f" BF16: {median_bf16:.1f} us") - print(f" FP8: {median_fp8:.1f} us") - if median_nvf4 != float("inf"): - print(f" NVF4: {median_nvf4:.1f} us") - if median_int4 != float("inf"): - print(f" Int4: {median_int4:.1f} us") - - # Print README table - print("\n" + "=" * 80) - print("README.md Table (GEMV Performance)") - print("=" * 80) - print() - print("| Layer | K | N | BF16 | FP8 | NVF4 | Int4 |") - print("|-------|------|-------|------|-----|------|------|") - - for r in results: - bf16_str = f"{r['bf16']:.0f} us" - fp8_str = f"{r['fp8']:.0f} us" - nvf4_str = f"{r['nvf4']:.0f} us" if r["nvf4"] != float("inf") else "—" - int4_str = f"{r['int4']:.0f} us" if r["int4"] != float("inf") else "—" - print( - f"| {r['label']} | {r['K']} | {r['N']} | {bf16_str} | {fp8_str} | {nvf4_str} | {int4_str} |" - ) - - -if __name__ == "__main__": - benchmark_gemv_all() diff --git a/benchmarks/benchmark_gemv_detailed.py b/benchmarks/benchmark_gemv_detailed.py deleted file mode 100644 index 832868e..0000000 --- 
a/benchmarks/benchmark_gemv_detailed.py +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/env python3 -""" -Detailed GEMV Benchmark with individual timing per iteration. - -Compares: BF16, FP8, NVFP4 GEMV kernels. -""" - -import time - -import numpy as np - -import pygpukit as gk -from pygpukit.core import from_numpy -from pygpukit.core.backend import get_native_module - - -def benchmark_gemv_detailed(): - """Detailed GEMV benchmark with per-iteration timing.""" - from pygpukit.ops.matmul import ( - fp8_init_lut, - gemv_bf16, - gemv_fp8_bf16, - gemv_nvf4_available, - gemv_nvf4_bf16, - ) - - native = get_native_module() - fp8_init_lut() - - print("=" * 80) - print("Detailed GEMV Benchmark") - print("=" * 80) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print("Memory Bandwidth: ~1792 GB/s (theoretical)") - print() - - configs = [ - (4096, 4096), - (14336, 4096), - (4096, 14336), - ] - - warmup = 10 - iterations = 50 - - for N, K in configs: - print(f"\n{'=' * 60}") - print(f"N={N}, K={K}") - print(f"{'=' * 60}") - - # Calculate theoretical bandwidth - # BF16: B is K*N*2 bytes, A is K*2 bytes - bf16_bytes = K * N * 2 + K * 2 - # FP8: B is N*K bytes, A is K*2 bytes, scale is (N/128)*(K/128)*2 bytes - fp8_bytes = N * K + K * 2 + ((N + 127) // 128) * ((K + 127) // 128) * 2 - # NVF4: B is N*K/2 bytes, A is K*2 bytes, scale is (K/32)*N bytes - nvf4_bytes = N * (K // 2) + K * 2 + ((K + 31) // 32) * N - - print( - f"Data sizes: BF16={bf16_bytes / 1e6:.1f}MB, FP8={fp8_bytes / 1e6:.1f}MB, NVF4={nvf4_bytes / 1e6:.1f}MB" - ) - print( - f"Theoretical time @1000GB/s: BF16={bf16_bytes / 1e9 * 1e6:.1f}us, FP8={fp8_bytes / 1e9 * 1e6:.1f}us" - ) - print() - - # ===== BF16 GEMV ===== - A_bf16 = gk.empty((K,), dtype="bfloat16") - B_bf16 = gk.empty((K, N), dtype="bfloat16") - C_bf16 = gk.empty((N,), dtype="bfloat16") - - # Warmup - for _ in range(warmup): - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - - # Benchmark with individual timing - times_bf16 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - end = time.perf_counter() - times_bf16.append((end - start) * 1e6) - - median_bf16 = np.median(times_bf16) - min_bf16 = np.min(times_bf16) - print( - f"BF16: median={median_bf16:.1f}us, min={min_bf16:.1f}us, " - f"BW={bf16_bytes / median_bf16 / 1e3:.0f}GB/s" - ) - - # ===== FP8 GEMV (optimized, B[N,K] layout) ===== - A_fp8 = gk.empty((K,), dtype="bfloat16") - B_fp8_nk = from_numpy(np.zeros((N, K), dtype=np.uint8)) # [N, K] layout - n_blocks = (N + 127) // 128 - k_blocks = (K + 127) // 128 - B_scale_fp8 = from_numpy(np.ones((n_blocks, k_blocks), dtype=np.float16).view(np.uint16)) - C_fp8 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - - times_fp8 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - end = time.perf_counter() - times_fp8.append((end - start) * 1e6) - - median_fp8 = np.median(times_fp8) - min_fp8 = np.min(times_fp8) - print( - f"FP8: median={median_fp8:.1f}us, min={min_fp8:.1f}us, " - f"BW={fp8_bytes / median_fp8 / 1e3:.0f}GB/s" - ) - - # ===== NVFP4 GEMV ===== - if gemv_nvf4_available(): - A_nvf4 = gk.empty((K,), dtype="bfloat16") - B_nvf4 = from_numpy(np.zeros((K // 2, N), dtype=np.uint8)) - 
k_scale_blocks = (K + 31) // 32 - B_scale_nvf4 = from_numpy(np.ones((k_scale_blocks, N), dtype=np.uint8)) - C_nvf4 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - - times_nvf4 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - end = time.perf_counter() - times_nvf4.append((end - start) * 1e6) - - median_nvf4 = np.median(times_nvf4) - min_nvf4 = np.min(times_nvf4) - print( - f"NVFP4: median={median_nvf4:.1f}us, min={min_nvf4:.1f}us, " - f"BW={nvf4_bytes / median_nvf4 / 1e3:.0f}GB/s" - ) - else: - median_nvf4 = float("inf") - print("NVFP4: N/A") - - # Summary - print() - print("Speedup vs BF16:") - print(f" FP8: {median_bf16 / median_fp8:.2f}x") - if gemv_nvf4_available(): - print(f" NVFP4: {median_bf16 / median_nvf4:.2f}x") - - -if __name__ == "__main__": - benchmark_gemv_detailed() diff --git a/benchmarks/benchmark_nvf4_bf16.py b/benchmarks/benchmark_nvf4_bf16.py deleted file mode 100644 index 2a5213b..0000000 --- a/benchmarks/benchmark_nvf4_bf16.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -""" -NVF4-BF16 GEMM Benchmark for SM120 (Blackwell GeForce) - -Benchmarks NVF4 (4-bit) GEMM with BF16 I/O. -NVF4 provides 2x memory bandwidth compared to FP8. -""" - -import time - -import numpy as np - - -def bf16_to_f32(bf16_uint16: np.ndarray) -> np.ndarray: - """Convert BFloat16 (stored as uint16) to float32.""" - bf16_uint16 = bf16_uint16.astype(np.uint16) - f32_bits = bf16_uint16.astype(np.uint32) << 16 - return f32_bits.view(np.float32) - - -def f32_to_bf16(f32: np.ndarray) -> np.ndarray: - """Convert float32 to BFloat16 (stored as uint16).""" - f32 = f32.astype(np.float32) - f32_bits = f32.view(np.uint32) - bf16_bits = (f32_bits >> 16).astype(np.uint16) - return bf16_bits - - -def benchmark_nvf4_bf16(sizes: list[int], warmup: int = 5, iterations: int = 20): - """Benchmark NVF4-BF16 GEMM at various sizes.""" - from pygpukit.core.backend import get_native_module - from pygpukit.core.factory import from_numpy - from pygpukit.ops import matmul_nvf4_bf16_sm120, nvf4_bf16_sm120_available - - native = get_native_module() - - if not nvf4_bf16_sm120_available(): - print("NVF4-BF16 SM120 not available") - return - - print("=" * 70) - print("NVF4-BF16 GEMM Benchmark (SM120 Blackwell GeForce)") - print("=" * 70) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print(f"SM: {props.compute_capability_major}.{props.compute_capability_minor}") - print() - print("GPU-side quantization: BF16 -> NVF4 (no H2D copies)") - print() - - results = [] - - for size in sizes: - M, N, K = size, size, size - flops = 2.0 * M * N * K # FLOPs for GEMM - - # Create NVF4-appropriate data (values in representable range) - nvf4_values = np.array([0.5, 1.0, 1.5, 2.0, 3.0, 4.0], dtype=np.float32) - A = np.random.choice(nvf4_values, size=(M, K)).astype(np.float32) - B = np.random.choice(nvf4_values, size=(K, N)).astype(np.float32) - - A_bf16 = f32_to_bf16(A) - B_bf16 = f32_to_bf16(B) - - A_gpu = from_numpy(A_bf16) - B_gpu = from_numpy(B_bf16) - - # Warmup - for _ in range(warmup): - C_gpu = matmul_nvf4_bf16_sm120(A_gpu, B_gpu) - native.device_synchronize() - - # Benchmark - times = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - C_gpu = matmul_nvf4_bf16_sm120(A_gpu, B_gpu) - 
native.device_synchronize() - end = time.perf_counter() - times.append(end - start) - - # Get result and verify - C_uint16 = C_gpu.to_numpy() - C_f32 = bf16_to_f32(C_uint16) - C_ref = bf16_to_f32(A_bf16) @ bf16_to_f32(B_bf16) - - rel_error = np.linalg.norm(C_f32 - C_ref) / np.linalg.norm(C_ref) - - median_time = np.median(times) - min_time = np.min(times) - tflops_median = flops / median_time / 1e12 - tflops_max = flops / min_time / 1e12 - - results.append( - { - "size": size, - "tflops_median": tflops_median, - "tflops_max": tflops_max, - "time_ms": median_time * 1000, - "rel_error": rel_error, - } - ) - - status = "PASS" if rel_error < 0.05 else "FAIL" - print( - f"{M}x{N}x{K}: {tflops_median:.2f} TFLOPS (median), " - f"{tflops_max:.2f} TFLOPS (max), " - f"rel_error={rel_error:.2e} [{status}]" - ) - - print() - print("=" * 70) - print("Summary Table (for README)") - print("=" * 70) - print("| Size | TFLOPS (median) | TFLOPS (max) | Time (ms) |") - print("|------|-----------------|--------------|-----------|") - for r in results: - print( - f"| {r['size']}x{r['size']} | {r['tflops_median']:.2f} | " - f"{r['tflops_max']:.2f} | {r['time_ms']:.2f} |" - ) - - return results - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="NVF4-BF16 GEMM Benchmark") - parser.add_argument( - "--sizes", - nargs="+", - type=int, - default=[1024, 2048, 4096, 8192], - help="Matrix sizes to benchmark", - ) - parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations") - parser.add_argument("--iterations", type=int, default=20, help="Number of benchmark iterations") - - args = parser.parse_args() - - benchmark_nvf4_bf16(args.sizes, args.warmup, args.iterations) diff --git a/benchmarks/benchmark_nvf4_nvf4.py b/benchmarks/benchmark_nvf4_nvf4.py deleted file mode 100644 index 6ff909d..0000000 --- a/benchmarks/benchmark_nvf4_nvf4.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -""" -Pure NVF4 GEMM Benchmark for SM120 (Blackwell GeForce) - -Benchmarks NVF4 GEMM without quantization overhead to measure -pure tensor core performance. 
-""" - -import time - -import numpy as np - - -def benchmark_nvf4_nvf4(sizes: list[int], warmup: int = 5, iterations: int = 20): - """Benchmark pure NVF4 GEMM at various sizes.""" - from pygpukit.core.backend import get_native_module - from pygpukit.core.factory import zeros - - native = get_native_module() - - if not native.nvf4_nvf4_sm120_available(): - print("NVF4-NVF4 SM120 not available") - return - - print("=" * 70) - print("Pure NVF4 GEMM Benchmark (SM120 Blackwell GeForce)") - print("=" * 70) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print(f"SM: {props.compute_capability_major}.{props.compute_capability_minor}") - print() - print("Pre-quantized NVF4 data (no quantization overhead)") - print() - - results = [] - - for size in sizes: - M, N, K = size, size, size - flops = 2.0 * M * N * K # FLOPs for GEMM - - # Allocate output buffer (BF16) - D_gpu = zeros((M, N), dtype="bfloat16") - D_native = D_gpu._get_native() # Get native GPUArray - - # Warmup - for _ in range(warmup): - native.benchmark_gemm_nvf4_sm120(D_native, M, N, K) - native.device_synchronize() - - # Benchmark - times = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - native.benchmark_gemm_nvf4_sm120(D_native, M, N, K) - native.device_synchronize() - end = time.perf_counter() - times.append(end - start) - - median_time = np.median(times) - min_time = np.min(times) - tflops_median = flops / median_time / 1e12 - tflops_max = flops / min_time / 1e12 - - results.append( - { - "size": size, - "tflops_median": tflops_median, - "tflops_max": tflops_max, - "time_ms": median_time * 1000, - } - ) - - print( - f"{M}x{N}x{K}: {tflops_median:.2f} TFLOPS (median), " - f"{tflops_max:.2f} TFLOPS (max), " - f"time={median_time * 1000:.2f}ms" - ) - - print() - print("=" * 70) - print("Summary Table") - print("=" * 70) - print("| Size | TFLOPS (median) | TFLOPS (max) | Time (ms) |") - print("|------|-----------------|--------------|-----------|") - for r in results: - print( - f"| {r['size']}x{r['size']} | {r['tflops_median']:.2f} | " - f"{r['tflops_max']:.2f} | {r['time_ms']:.2f} |" - ) - - return results - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Pure NVF4 GEMM Benchmark") - parser.add_argument( - "--sizes", - nargs="+", - type=int, - default=[1024, 2048, 4096, 8192, 12288, 16384], - help="Matrix sizes to benchmark", - ) - parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations") - parser.add_argument("--iterations", type=int, default=20, help="Number of benchmark iterations") - - args = parser.parse_args() - - benchmark_nvf4_nvf4(args.sizes, args.warmup, args.iterations) diff --git a/benchmarks/benchmark_w8a16_gemm.py b/benchmarks/benchmark_w8a16_gemm.py deleted file mode 100644 index 5da5f38..0000000 --- a/benchmarks/benchmark_w8a16_gemm.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -""" -W8A16 GEMM Benchmark for SM120. - -Tests FP8 weight x BF16 activation -> BF16 output. 
-""" - -import time - -import numpy as np - -import pygpukit as gk -from pygpukit.core import from_numpy -from pygpukit.core.backend import get_native_module -from pygpukit.ops.matmul import w8a16_gemm_sm120 - - -def benchmark_w8a16_gemm(): - """Benchmark W8A16 GEMM kernel.""" - native = get_native_module() - - print("=" * 80) - print("W8A16 GEMM Benchmark (SM120)") - print("=" * 80) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print() - - # Test configurations (typical LLM layer sizes) - # Qwen3-30B-A3B MoE: hidden=2048, intermediate varies by expert - configs = [ - # (M, K, N) - prefill batch sizes - (1, 2048, 8192), # Single token, small MLP - (16, 2048, 8192), # Small batch - (64, 2048, 8192), # Medium batch - (128, 4096, 14336), # Large batch, Qwen-7B MLP - (256, 4096, 14336), # Larger batch - (512, 4096, 14336), # Prefill size - (1024, 4096, 14336), # Long prefill - ] - - warmup = 10 - iterations = 50 - - for M, K, N in configs: - print(f"\n{'=' * 60}") - print(f"M={M}, K={K}, N={N}") - print(f"{'=' * 60}") - - # Calculate data sizes - A_bytes = M * K * 2 # BF16 - B_bytes = K * N * 1 # FP8 - C_bytes = M * N * 2 # BF16 - scale_k = (K + 127) // 128 - scale_n = (N + 127) // 128 - scale_bytes = scale_k * scale_n * 2 # BF16 scale - total_bytes = A_bytes + B_bytes + C_bytes + scale_bytes - - print(f"Data: A={A_bytes / 1e6:.2f}MB, B={B_bytes / 1e6:.2f}MB, C={C_bytes / 1e6:.2f}MB") - print(f"Total I/O: {total_bytes / 1e6:.2f}MB") - - # Calculate FLOPS (2*M*N*K for matmul) - flops = 2 * M * N * K - - # Create tensors - A_bf16 = gk.empty((M, K), dtype="bfloat16") - B_fp8 = from_numpy(np.random.randint(0, 256, (K, N), dtype=np.uint8)) - B_scale = gk.empty((scale_k, scale_n), dtype="bfloat16") - C_out = gk.empty((M, N), dtype="bfloat16") - - # Warmup - for _ in range(warmup): - w8a16_gemm_sm120(A_bf16, B_fp8, B_scale, out=C_out) - native.device_synchronize() - - # Benchmark - times = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - w8a16_gemm_sm120(A_bf16, B_fp8, B_scale, out=C_out) - native.device_synchronize() - end = time.perf_counter() - times.append((end - start) * 1e6) # microseconds - - median_us = np.median(times) - min_us = np.min(times) - max_us = np.max(times) - - # Calculate performance - tflops = flops / median_us / 1e6 # TFLOPS - bw = total_bytes / median_us / 1e3 # GB/s - - print(f"Time: median={median_us:.1f}us, min={min_us:.1f}us, max={max_us:.1f}us") - print(f"Performance: {tflops:.2f} TFLOPS, BW={bw:.0f} GB/s") - - -if __name__ == "__main__": - benchmark_w8a16_gemm() diff --git a/examples/benchmark_compare.py b/examples/benchmark_compare.py deleted file mode 100644 index 0e7cc82..0000000 --- a/examples/benchmark_compare.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -"""Compare tiled vs naive matmul (via NVRTC JIT).""" - -import sys - -sys.path.insert(0, "src") -import time - -import numpy as np - -import pygpukit as gp -from pygpukit.core.backend import get_backend - -# Naive kernel source (for comparison) -NAIVE_KERNEL = """ -extern "C" __global__ void matmul_naive( - const float* A, const float* B, float* C, - int M, int N, int K -) { - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - - if (row < M && col < N) { - float sum = 0.0f; - for (int k = 0; k < K; ++k) { - sum += A[row * K + k] * B[k * N + col]; - } - C[row * N + col] = sum; - } -} -""" - - -def benchmark_current(a_gpu, b_gpu, iterations=10): - """Benchmark 
current (tiled) implementation.""" - # Warmup - _ = gp.matmul(a_gpu, b_gpu) - - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = gp.matmul(a_gpu, b_gpu) - times.append(time.perf_counter() - start) - return np.mean(times) * 1000 - - -def main(): - print("=" * 70) - print(" Tiled vs Naive Matmul Comparison") - print("=" * 70) - print() - - backend = get_backend() - props = backend.get_device_properties() - print(f"GPU: {props.name}") - print() - - # Note: We cannot easily run naive kernel without modifying C++ code - # So we'll compare with CLAUDE.md historical data - - print("Benchmark results on RTX 3090 Ti:") - print(" Naive kernel is faster than tiled due to 6MB L2 cache") - print() - - print("Current (Naive) implementation:") - sizes = [512, 1024, 2048] - - for size in sizes: - np.random.seed(42) - a_np = np.random.rand(size, size).astype(np.float32) - b_np = np.random.rand(size, size).astype(np.float32) - - a_gpu = gp.from_numpy(a_np) - b_gpu = gp.from_numpy(b_np) - - gpu_ms = benchmark_current(a_gpu, b_gpu) - flops = 2 * size * size * size - gflops = flops / (gpu_ms / 1000) / 1e9 - - print(f" {size}x{size}: {gpu_ms:.2f} ms, {gflops:.0f} GFLOPS") - - print() - print("-" * 70) - print("Analysis:") - print(" The naive kernel outperforms tiled on RTX 3090 Ti because:") - print(" 1. Large L2 cache (6MB) provides efficient global memory access") - print(" 2. __syncthreads() in tiled kernel adds synchronization overhead") - print(" 3. Shared memory management overhead doesn't pay off") - print() - print(" For truly faster matmul, consider:") - print(" - cuBLAS: 20+ TFLOPS on RTX 3090 Ti") - print(" - Advanced tiling with register blocking") - print(" - Tensor cores for mixed precision") - print("-" * 70) - - -if __name__ == "__main__": - main() diff --git a/examples/benchmark_large.py b/examples/benchmark_large.py deleted file mode 100644 index 4b6a4c5..0000000 --- a/examples/benchmark_large.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -"""Benchmark large matrices.""" - -import sys - -sys.path.insert(0, "src") -import time - -import numpy as np - -import pygpukit as gp - -sizes = [4096] -for size in sizes: - np.random.seed(42) - a_np = np.random.rand(size, size).astype(np.float32) - b_np = np.random.rand(size, size).astype(np.float32) - - # NumPy - start = time.perf_counter() - _ = np.matmul(a_np, b_np) - numpy_ms = (time.perf_counter() - start) * 1000 - - # GPU - a_gpu = gp.from_numpy(a_np) - b_gpu = gp.from_numpy(b_np) - _ = gp.matmul(a_gpu, b_gpu) # warmup - - start = time.perf_counter() - _ = gp.matmul(a_gpu, b_gpu) - gpu_ms = (time.perf_counter() - start) * 1000 - - flops = 2 * size * size * size - gflops = flops / (gpu_ms / 1000) / 1e9 - - print( - f"{size}x{size}: NumPy={numpy_ms:.1f}ms, GPU={gpu_ms:.1f}ms, Speedup={numpy_ms / gpu_ms:.1f}x, {gflops:.0f} GFLOPS" - ) diff --git a/examples/benchmark_matmul.py b/examples/benchmark_matmul.py deleted file mode 100644 index 34b5cbe..0000000 --- a/examples/benchmark_matmul.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 -"""Benchmark: Tiled matmul vs NumPy. - -Demonstrates the performance improvement from shared memory tiling. 
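The same FP32 sweep is expressible against the unified suite; a sketch, with sizes given as (M, K, N) tuples per `add_gemm`:

```python
# Sketch: the FP32 matmul size sweep from this example, redone
# with the unified suite. Square problems, (M, K, N) per size.
from pygpukit.benchmark import BenchmarkSuite

suite = BenchmarkSuite()
suite.add_gemm(
    sizes=[(n, n, n) for n in (128, 256, 512, 1024, 2048)],
    dtypes=["fp32"],
)
report = suite.run()
report.save("matmul_fp32.json")
```

Correctness verification is part of the suite's standard output, so the manual NumPy comparison at the end of this script is subsumed.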
-""" - -from __future__ import annotations - -import sys -import time - -import numpy as np - -sys.path.insert(0, "src") - -import pygpukit as gp -from pygpukit.core.backend import get_backend - - -def benchmark_matmul(size: int, iterations: int = 10) -> dict: - """Benchmark matmul for a given matrix size.""" - np.random.seed(42) - - # Create test data - a_np = np.random.rand(size, size).astype(np.float32) - b_np = np.random.rand(size, size).astype(np.float32) - - # NumPy benchmark - numpy_times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = np.matmul(a_np, b_np) - numpy_times.append(time.perf_counter() - start) - numpy_avg = np.mean(numpy_times) * 1000 # ms - - # PyGPUkit benchmark - a_gpu = gp.from_numpy(a_np) - b_gpu = gp.from_numpy(b_np) - - # Warm-up - _ = gp.matmul(a_gpu, b_gpu) - - gpu_times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = gp.matmul(a_gpu, b_gpu) - gpu_times.append(time.perf_counter() - start) - gpu_avg = np.mean(gpu_times) * 1000 # ms - - # Calculate GFLOPS (2 * N^3 FLOPs for matmul) - flops = 2 * size * size * size - gpu_gflops = flops / (gpu_avg / 1000) / 1e9 - numpy_gflops = flops / (numpy_avg / 1000) / 1e9 - - return { - "size": size, - "numpy_ms": numpy_avg, - "gpu_ms": gpu_avg, - "speedup": numpy_avg / gpu_avg, - "numpy_gflops": numpy_gflops, - "gpu_gflops": gpu_gflops, - } - - -def main(): - print("=" * 70) - print(" PyGPUkit Tiled Matmul Benchmark") - print("=" * 70) - print() - - # Get backend info - backend = get_backend() - props = backend.get_device_properties() - print(f"GPU: {props.name}") - print(f"Memory: {props.total_memory / (1024**3):.2f} GB") - print(f"SMs: {props.multiprocessor_count}") - print() - - # Benchmark various sizes - sizes = [128, 256, 512, 1024, 2048] - - print("Running benchmarks (10 iterations each)...") - print() - - results = [] - for size in sizes: - print(f" Testing {size}x{size}...", end=" ", flush=True) - result = benchmark_matmul(size) - results.append(result) - print(f"done ({result['gpu_ms']:.2f} ms)") - - print() - print("=" * 70) - print(" RESULTS") - print("=" * 70) - print() - print( - f"{'Size':>8} | {'NumPy (ms)':>12} | {'GPU (ms)':>12} | {'Speedup':>8} | {'GPU GFLOPS':>12}" - ) - print("-" * 70) - - for r in results: - print( - f"{r['size']:>8} | {r['numpy_ms']:>12.3f} | {r['gpu_ms']:>12.3f} | {r['speedup']:>7.1f}x | {r['gpu_gflops']:>12.1f}" - ) - - print() - print("=" * 70) - print() - - # Peak performance - best = max(results, key=lambda x: x["gpu_gflops"]) - print(f"Peak GPU Performance: {best['gpu_gflops']:.1f} GFLOPS at {best['size']}x{best['size']}") - print(f"Best Speedup vs NumPy: {max(r['speedup'] for r in results):.1f}x") - print() - - # Verify correctness - print("Verifying correctness...") - a_np = np.random.rand(256, 256).astype(np.float32) - b_np = np.random.rand(256, 256).astype(np.float32) - - expected = np.matmul(a_np, b_np) - result = gp.matmul(gp.from_numpy(a_np), gp.from_numpy(b_np)).to_numpy() - - max_diff = np.max(np.abs(expected - result)) - print(f"Max difference from NumPy: {max_diff:.2e}") - - if max_diff < 1e-4: - print("[OK] Results match NumPy (within tolerance)") - else: - print("[FAIL] Results differ from NumPy!") - - print() - - -if __name__ == "__main__": - main() diff --git a/examples/benchmark_tiled_matmul.py b/examples/benchmark_tiled_matmul.py deleted file mode 100644 index f4bead3..0000000 --- a/examples/benchmark_tiled_matmul.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Benchmark: Tiled vs Naive Matmul Performance""" - -import os -import 
sys -import time - -# Add CUDA DLLs to PATH -cuda_path = os.environ.get("CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4") -cuda_bin = os.path.join(cuda_path, "bin") -if cuda_bin not in os.environ["PATH"]: - os.environ["PATH"] = cuda_bin + os.pathsep + os.environ["PATH"] -if hasattr(os, "add_dll_directory"): - os.add_dll_directory(cuda_bin) - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src", "pygpukit")) - -import numpy as np # noqa: E402 - -print("=" * 70) -print("Tiled Matmul Benchmark - PyGPUkit v0.2") -print("=" * 70) - -try: - import _pygpukit_native as native # noqa: E402 - - print(f"\nCUDA available: {native.is_cuda_available()}") - - if native.is_cuda_available(): - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print(f"Memory: {props.total_memory / 1024**3:.1f} GB") - - print("\n" + "-" * 70) - print("Matrix Size | Kernel | Time (ms) | GFLOPS | Speedup") - print("-" * 70) - - sizes = [512, 1024, 2048, 3072, 4096] - - for size in sizes: - M, N, K = size, size, size - - # Create test matrices - A_np = np.random.randn(M, K).astype(np.float32) - B_np = np.random.randn(K, N).astype(np.float32) - - # Warmup - A_gpu = native.from_numpy(A_np) - B_gpu = native.from_numpy(B_np) - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark GPU - iterations = 5 if size >= 2048 else 10 - times = [] - for _ in range(iterations): - A_gpu = native.from_numpy(A_np) - B_gpu = native.from_numpy(B_np) - start = time.perf_counter() - C_gpu = native.matmul(A_gpu, B_gpu) - gpu_time = time.perf_counter() - start - times.append(gpu_time) - - avg_time = np.median(times) - gflops = 2 * M * N * K / avg_time / 1e9 - - # Check which kernel is used (threshold is 2048) - kernel = "Tiled" if size >= 2048 else "L2-opt" - - # CPU reference - start = time.perf_counter() - C_cpu = np.matmul(A_np, B_np) - cpu_time = time.perf_counter() - start - - speedup = cpu_time / avg_time - - # Verify correctness - C_result = C_gpu.to_numpy() - rel_error = np.max(np.abs(C_result - C_cpu)) / np.max(np.abs(C_cpu)) - - print( - f"{size:>5}x{size:<5} | {kernel:<9} | {avg_time * 1000:>8.2f} | {gflops:>7.1f} | {speedup:>5.1f}x" - ) - - if rel_error > 1e-3: - print(f" WARNING: High relative error: {rel_error:.2e}") - - print("-" * 70) - print("\nTiled kernel should show improved performance for sizes >= 2048") - print("=" * 70) - -except ImportError as e: - print(f"Error: {e}") - print("Native module not available") diff --git a/scripts/benchmark.py b/scripts/benchmark.py deleted file mode 100644 index fb63003..0000000 --- a/scripts/benchmark.py +++ /dev/null @@ -1,538 +0,0 @@ -#!/usr/bin/env python3 -""" -PyGPUkit Comprehensive Benchmark - -Benchmarks all supported dtypes: -- FP32 (Ampere optimized kernel) -- TF32 v1 (WMMA TensorCore) -- TF32 v2 (PTX mma.sync TensorCore, optimized) -- FP16 (simple kernel, TensorCore planned) -- BF16 (simple kernel, TensorCore planned) - -Runtime Modes: -- Driver-Only: Uses pre-compiled kernels, no CUDA Toolkit needed -- Full (JIT): Same kernels + JIT compilation for custom ops - -Note: Built-in matmul kernels are pre-compiled, so Driver-Only and Full -modes have identical performance for matmul operations. - -Usage: - python benchmark_all.py [--sizes SIZES] [--quick] [--tf32-version v1|v2] - -Output format matches README.md tables for easy updates. 
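The multi-dtype sweep and README table duty move to the unified suite (`--markdown` covers the table output). A sketch of the equivalent regression-gated flow; the threshold and file name are illustrative:

```python
# Sketch: multi-dtype GEMM sweep plus regression gate, replacing
# this script. Threshold and baseline path are illustrative.
from pygpukit.benchmark import BenchmarkSuite

suite = BenchmarkSuite()
suite.add_gemm(dtypes=["fp32", "tf32", "fp16", "bf16"])
report = suite.run()
report.save("baseline.json")

# On a later run, compare against the saved baseline.
comparison = suite.compare("baseline.json")
if comparison.has_regression(threshold=0.05):
    raise RuntimeError("Performance regression detected!")
```

The CLI form, `python -m pygpukit.benchmark --compare baseline.json --fail-on-regression`, applies the same gate in CI.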
-""" - -import argparse -import os -import time -from dataclasses import dataclass - -import numpy as np - -# ============================================================================= -# Setup CUDA DLL path (Windows) -# ============================================================================= -cuda_path = os.environ.get("CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4") -cuda_bin = os.path.join(cuda_path, "bin") -if os.path.isdir(cuda_bin): - if cuda_bin not in os.environ.get("PATH", ""): - os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") - if hasattr(os, "add_dll_directory"): - os.add_dll_directory(cuda_bin) - - -# ============================================================================= -# Data Classes -# ============================================================================= -@dataclass -class BenchmarkResult: - dtype: str - size: int - tflops_median: float - tflops_max: float - time_ms: float - correct: bool - rel_error: float - - -@dataclass -class GPUInfo: - name: str - sm_major: int - sm_minor: int - nvrtc_available: bool - - -# ============================================================================= -# Native Module Import Helper -# ============================================================================= -_native_module = None - - -def get_native_module(): - """Get native module with fallback.""" - global _native_module - if _native_module is not None: - return _native_module - try: - import _pygpukit_native as native - - _native_module = native - except ImportError: - from pygpukit import _pygpukit_native as native - - _native_module = native - return _native_module - - -# ============================================================================= -# Benchmark Functions -# ============================================================================= -def get_gpu_info() -> GPUInfo: - """Get GPU information.""" - native = get_native_module() - props = native.get_device_properties(0) - - try: - import pygpukit as gpk - - nvrtc = gpk.is_nvrtc_available() - except Exception: - nvrtc = False - - return GPUInfo( - name=props.name, - sm_major=props.compute_capability_major, - sm_minor=props.compute_capability_minor, - nvrtc_available=nvrtc, - ) - - -def benchmark_fp32(size: int, warmup: int = 5, iterations: int = 10) -> BenchmarkResult: - """Benchmark FP32 matmul (Ampere optimized kernel).""" - native = get_native_module() - - A = np.random.randn(size, size).astype(np.float32) - B = np.random.randn(size, size).astype(np.float32) - - A_gpu = native.from_numpy(A) - B_gpu = native.from_numpy(B) - - # Correctness - C_gpu = native.matmul(A_gpu, B_gpu) - C_result = C_gpu.to_numpy() - C_expected = A @ B - rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) - correct = rel_error < 1e-3 - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - return BenchmarkResult( - dtype="FP32", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -def benchmark_tf32( - size: int, warmup: int = 5, iterations: int = 10, use_v2: bool = True -) -> BenchmarkResult: - """Benchmark TF32 TensorCore 
matmul. - - Uses environment variables to control kernel selection: - - PYGPUKIT_ALLOW_TF32=1: Enable TF32 kernels - - PYGPUKIT_TF32_V2=1: Use optimized v2 kernel (PTX mma.sync) - """ - native = get_native_module() - - # Set environment for TF32 - os.environ["PYGPUKIT_ALLOW_TF32"] = "1" - if use_v2: - os.environ["PYGPUKIT_TF32_V2"] = "1" - else: - os.environ.pop("PYGPUKIT_TF32_V2", None) - - A = np.random.randn(size, size).astype(np.float32) - B = np.random.randn(size, size).astype(np.float32) - - A_gpu = native.from_numpy(A) - B_gpu = native.from_numpy(B) - - # Correctness - use native.matmul which respects env vars - C_gpu = native.matmul(A_gpu, B_gpu) - C_result = C_gpu.to_numpy() - C_expected = A @ B - rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) - correct = rel_error < 1e-2 # TF32 has ~0.1% per-op error - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - version = "v2" if use_v2 else "v1" - return BenchmarkResult( - dtype=f"TF32 {version}", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -def benchmark_fp16(size: int, warmup: int = 5, iterations: int = 10) -> BenchmarkResult: - """Benchmark FP16 matmul (simple kernel, no TensorCore yet).""" - native = get_native_module() - - A = np.random.randn(size, size).astype(np.float16) - B = np.random.randn(size, size).astype(np.float16) - - A_gpu = native.from_numpy(A) - B_gpu = native.from_numpy(B) - - # Correctness - C_gpu = native.matmul(A_gpu, B_gpu) - C_result = C_gpu.to_numpy() - C_expected = (A.astype(np.float32) @ B.astype(np.float32)).astype(np.float16) - rel_error = np.max(np.abs(C_result.astype(np.float32) - C_expected.astype(np.float32))) / ( - np.max(np.abs(C_expected.astype(np.float32))) + 1e-7 - ) - correct = rel_error < 0.05 - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - return BenchmarkResult( - dtype="FP16", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -def benchmark_bf16(size: int, warmup: int = 5, iterations: int = 10) -> BenchmarkResult: - """Benchmark BF16 matmul (simple kernel, no TensorCore yet).""" - native = get_native_module() - import pygpukit as gpk - - A_fp32 = np.random.randn(size, size).astype(np.float32) - B_fp32 = np.random.randn(size, size).astype(np.float32) - - # Convert to BF16 via GPUArray - A_gpu = gpk.from_numpy(A_fp32).astype(gpk.bfloat16)._get_native() - B_gpu = gpk.from_numpy(B_fp32).astype(gpk.bfloat16)._get_native() - - # Correctness - C_gpu = native.matmul(A_gpu, B_gpu) - C_gpk = gpk.GPUArray._wrap_native(C_gpu).astype(gpk.float32) - C_result = C_gpk.to_numpy() - C_expected = A_fp32 @ B_fp32 - rel_error = np.max(np.abs(C_result - C_expected)) / (np.max(np.abs(C_expected)) + 1e-7) - correct = rel_error 
< 0.05 - - # Re-create arrays for benchmark - A_gpu = gpk.from_numpy(A_fp32).astype(gpk.bfloat16)._get_native() - B_gpu = gpk.from_numpy(B_fp32).astype(gpk.bfloat16)._get_native() - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - return BenchmarkResult( - dtype="BF16", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -# ============================================================================= -# Output Functions -# ============================================================================= -def print_header(gpu_info: GPUInfo, tf32_version: str): - """Print benchmark header.""" - print("=" * 70) - print(" PyGPUkit Comprehensive Benchmark") - print("=" * 70) - print() - print(f"GPU: {gpu_info.name}") - print(f"SM: {gpu_info.sm_major}.{gpu_info.sm_minor}") - print(f"NVRTC (JIT): {'Available' if gpu_info.nvrtc_available else 'Not Available'}") - print(f"TF32 Kernel: {tf32_version}") - print() - print("Note: Built-in matmul kernels are pre-compiled.") - print(" Driver-Only and Full modes have identical matmul performance.") - print() - - -def print_correctness_results(results: list): - """Print correctness verification results.""" - print("=" * 70) - print(" Correctness Verification") - print("=" * 70) - print() - print(f"{'Dtype':<12} {'Size':<12} {'Rel Error':<12} {'Status':<8}") - print("-" * 48) - - for r in results: - status = "PASS" if r.correct else "FAIL" - print(f"{r.dtype:<12} {r.size}x{r.size:<6} {r.rel_error:<12.2e} {status:<8}") - print() - - -def print_benchmark_results(results: list, sizes: list): - """Print benchmark results.""" - print("=" * 70) - print(" Performance Results (TFLOPS)") - print("=" * 70) - print() - - # Group by size - by_size = {} - for r in results: - if r.size not in by_size: - by_size[r.size] = {} - by_size[r.size][r.dtype] = r - - # Get all dtypes - all_dtypes = [] - for r in results: - if r.dtype not in all_dtypes: - all_dtypes.append(r.dtype) - - # Print header - header = f"{'Size':<14}" - for dt in all_dtypes: - header += f"{dt:<12}" - print(header) - print("-" * (14 + 12 * len(all_dtypes))) - - # Print rows - for size in sizes: - if size not in by_size: - continue - row = by_size[size] - line = f"{size}x{size:<8}" - for dt in all_dtypes: - r = row.get(dt) - if r: - line += f"{r.tflops_median:<12.1f}" - else: - line += f"{'-':<12}" - print(line) - - print() - - -def print_readme_table(results: list, sizes: list): - """Print README.md compatible markdown table.""" - print("=" * 70) - print(" README.md Table") - print("=" * 70) - print() - - # Group by size - by_size = {} - for r in results: - if r.size not in by_size: - by_size[r.size] = {} - by_size[r.size][r.dtype] = r - - # Get dtypes - all_dtypes = [] - for r in results: - if r.dtype not in all_dtypes: - all_dtypes.append(r.dtype) - - # Print markdown table - header = "| Matrix Size |" - separator = "|-------------|" - for dt in all_dtypes: - header += f" {dt} |" - separator += "------|" - print(header) - print(separator) - - for size in sizes: - if size not in by_size: - continue - row = by_size[size] - line = f"| {size}x{size} |" - for dt in all_dtypes: - r = 
row.get(dt) - if r: - line += f" {r.tflops_median:.1f} TFLOPS |" - else: - line += " - |" - print(line) - - print() - - -# ============================================================================= -# Main -# ============================================================================= -def main(): - parser = argparse.ArgumentParser(description="PyGPUkit Comprehensive Benchmark") - parser.add_argument( - "--sizes", - type=str, - default="2048,4096,8192", - help="Comma-separated matrix sizes (default: 2048,4096,8192)", - ) - parser.add_argument("--quick", action="store_true", help="Quick mode: fewer iterations") - parser.add_argument( - "--dtypes", - type=str, - default="fp32,tf32,fp16,bf16", - help="Comma-separated dtypes to benchmark", - ) - parser.add_argument( - "--tf32-version", - type=str, - default="v2", - choices=["v1", "v2"], - help="TF32 kernel version: v1 (WMMA) or v2 (PTX mma.sync, default)", - ) - args = parser.parse_args() - - sizes = [int(s.strip()) for s in args.sizes.split(",")] - dtypes = [d.strip().lower() for d in args.dtypes.split(",")] - use_tf32_v2 = args.tf32_version == "v2" - - warmup = 3 if args.quick else 5 - iterations = 5 if args.quick else 10 - - # Get GPU info - gpu_info = get_gpu_info() - print_header(gpu_info, args.tf32_version.upper()) - - # Run benchmarks - results = [] - - print("Running benchmarks...") - print() - - for size in sizes: - iters = max(2, iterations // 2) if size >= 8192 else iterations - - if "fp32" in dtypes: - # Disable TF32 for FP32 benchmark - os.environ.pop("PYGPUKIT_ALLOW_TF32", None) - os.environ.pop("PYGPUKIT_TF32_V2", None) - print(f" FP32 {size}x{size}...", end=" ", flush=True) - r = benchmark_fp32(size, warmup, iters) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - if "tf32" in dtypes: - print(f" TF32 {args.tf32_version} {size}x{size}...", end=" ", flush=True) - r = benchmark_tf32(size, warmup, iters, use_v2=use_tf32_v2) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - if "fp16" in dtypes: - print(f" FP16 {size}x{size}...", end=" ", flush=True) - r = benchmark_fp16(size, warmup, iters) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - if "bf16" in dtypes: - print(f" BF16 {size}x{size}...", end=" ", flush=True) - r = benchmark_bf16(size, warmup, iters) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - print() - - # Print results - print_correctness_results(results) - print_benchmark_results(results, sizes) - print_readme_table(results, sizes) - - # Summary - print("=" * 70) - print(" Summary") - print("=" * 70) - print() - print(f"GPU: {gpu_info.name}") - print(f"TF32 Kernel: {args.tf32_version.upper()}") - - if results: - peak = max(results, key=lambda r: r.tflops_median) - print(f"Peak: {peak.tflops_median:.1f} TFLOPS ({peak.dtype}, {peak.size}x{peak.size})") - - print() - print("RTX 3090 Ti Theoretical:") - print(" FP32: ~40 TFLOPS") - print(" TF32 TensorCore: ~80 TFLOPS (Sparse: ~156 TFLOPS)") - print(" FP16 TensorCore: ~160 TFLOPS (not yet optimized)") - print() - print("Note: FP16/BF16 use simple kernels. 
TensorCore optimization in Issue #60.") - print() - - -if __name__ == "__main__": - main() From 4da2707682950bd9cd64bc8b91c28ef3be7aea4e Mon Sep 17 00:00:00 2001 From: m96-chan Date: Wed, 31 Dec 2025 00:01:37 +0900 Subject: [PATCH 4/5] docs: update benchmark docs and add skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add .claude/skills/benchmark/README.md - Update CLAUDE.md to use 'python -m pygpukit.benchmark' - Update PR checklist benchmark command 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .claude/skills/benchmark/README.md | 57 ++++++++++++++++++++++++++++++ CLAUDE.md | 37 +++++++++++-------- 2 files changed, 80 insertions(+), 14 deletions(-) create mode 100644 .claude/skills/benchmark/README.md diff --git a/.claude/skills/benchmark/README.md b/.claude/skills/benchmark/README.md new file mode 100644 index 0000000..089c817 --- /dev/null +++ b/.claude/skills/benchmark/README.md @@ -0,0 +1,57 @@ +# Benchmark Skill + +Run unified benchmark suite for GEMM, GEMV, and attention kernels. + +## Commands + +```bash +# Quick benchmark (default: GEMM + GEMV) +python -m pygpukit.benchmark --quick + +# Full benchmark with all sizes +python -m pygpukit.benchmark + +# Save results to JSON +python -m pygpukit.benchmark --quick --save results.json + +# Compare with baseline +python -m pygpukit.benchmark --compare baseline.json + +# Fail on regression (for CI) +python -m pygpukit.benchmark --compare baseline.json --fail-on-regression + +# Specific benchmarks +python -m pygpukit.benchmark --gemm --sizes 4096,8192 +python -m pygpukit.benchmark --gemv --dtypes bf16,fp8 +python -m pygpukit.benchmark --attention --seq-lens 512,1024,2048 + +# All benchmarks including FP8 (SM120+) +python -m pygpukit.benchmark --all --fp8 + +# Markdown output for README +python -m pygpukit.benchmark --quick --markdown +``` + +## Output + +- Time in microseconds (us) +- TFLOPS for compute benchmarks +- Correctness verification +- JSON export for regression tracking + +## Usage in Code + +```python +from pygpukit.benchmark import BenchmarkSuite + +suite = BenchmarkSuite(quick=True) +suite.add_gemm(sizes=[(4096, 4096, 4096)]) +suite.add_gemv(dtypes=["bf16", "fp8"]) +report = suite.run() +report.save("baseline.json") + +# Compare +comparison = suite.compare("baseline.json") +if comparison.has_regression(threshold=0.05): + print("Regression detected!") +``` diff --git a/CLAUDE.md b/CLAUDE.md index 66e6722..9a1ae15 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -633,7 +633,7 @@ python -m mypy src/ --ignore-missing-imports --disable-error-code=union-attr --d python -m pytest tests/ -v # 4. 
Benchmark runs (optional but recommended) -python benchmark.py --quick +python -m pygpukit.benchmark --quick ``` **DO NOT create PR until all checks pass locally.** @@ -679,27 +679,36 @@ If performance or correctness degrades: ### Benchmarking -**Always use `benchmark.py` for performance measurement.** +**Use unified benchmark suite: `python -m pygpukit.benchmark`** ```bash -# Full benchmark (all dtypes, all sizes) -python benchmark.py +# Quick benchmark (GEMM + GEMV) +python -m pygpukit.benchmark --quick -# Quick mode (fewer warmup/iterations) -python benchmark.py --quick +# Full benchmark +python -m pygpukit.benchmark -# Specific sizes -python benchmark.py --sizes 4096 8192 +# Save results and compare with baseline +python -m pygpukit.benchmark --quick --save baseline.json +python -m pygpukit.benchmark --compare baseline.json --fail-on-regression -# TF32 kernel version selection -python benchmark.py --tf32-version v1 # WMMA API -python benchmark.py --tf32-version v2 # PTX mma.sync (default) +# Specific benchmarks +python -m pygpukit.benchmark --gemm --sizes 4096,8192 +python -m pygpukit.benchmark --gemv --dtypes bf16,fp8 +python -m pygpukit.benchmark --attention --seq-lens 512,1024 + +# All benchmarks including FP8 (SM120+) +python -m pygpukit.benchmark --all --fp8 + +# Markdown output for README +python -m pygpukit.benchmark --quick --markdown ``` **Output includes:** -- Kernel-only timing (no D2H copy overhead) -- Correctness verification (relative error) -- README.md-ready table format +- Time in microseconds (us) +- TFLOPS for compute benchmarks +- Correctness verification +- JSON export for regression tracking **Environment Variables:** - `PYGPUKIT_ALLOW_TF32=1` - Enable TF32 TensorCore From 1cb43424d42b26101157210dd9318b212e2bae61 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Wed, 31 Dec 2025 00:03:46 +0900 Subject: [PATCH 5/5] fix(benchmark): fix lint errors - unused imports and ambiguous var names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/benchmark/attention.py | 36 ++++++++++++++--------------- src/pygpukit/benchmark/cli.py | 1 - src/pygpukit/benchmark/gemm.py | 2 -- src/pygpukit/benchmark/gemv.py | 1 - 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/pygpukit/benchmark/attention.py b/src/pygpukit/benchmark/attention.py index 8741d10..a79f95c 100644 --- a/src/pygpukit/benchmark/attention.py +++ b/src/pygpukit/benchmark/attention.py @@ -2,8 +2,6 @@ from __future__ import annotations -import numpy as np - from .base import Benchmark from .results import BenchmarkResult @@ -62,11 +60,11 @@ def _benchmark_sdpa( # (Q@K^T and attn@V, each 2*seq*seq*dim) flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads - # Create Q, K, V - Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - K = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - V = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + # Create Q, K, V, Out + q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + k = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + v = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + out = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") # Check if native SDPA available if not hasattr(native, 
"sdpa_causal_bf16"): @@ -74,10 +72,10 @@ def _benchmark_sdpa( def run_fn() -> None: native.sdpa_causal_bf16( - Q._get_native(), - K._get_native(), - V._get_native(), - O._get_native(), + q._get_native(), + k._get_native(), + v._get_native(), + out._get_native(), ) return self._measure(name, run_fn, params, flops=flops) @@ -139,20 +137,20 @@ def _benchmark_gqa( # GQA FLOPs (KV heads broadcasted) flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads - Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - K = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") - V = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") - O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + k = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + v = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + out = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") if not hasattr(native, "sdpa_causal_gqa_bf16"): return None def run_fn() -> None: native.sdpa_causal_gqa_bf16( - Q._get_native(), - K._get_native(), - V._get_native(), - O._get_native(), + q._get_native(), + k._get_native(), + v._get_native(), + out._get_native(), self.num_heads // self.num_kv_heads, ) diff --git a/src/pygpukit/benchmark/cli.py b/src/pygpukit/benchmark/cli.py index 5c0390d..b3380bb 100644 --- a/src/pygpukit/benchmark/cli.py +++ b/src/pygpukit/benchmark/cli.py @@ -4,7 +4,6 @@ import argparse import sys -from pathlib import Path from . import BenchmarkReport, BenchmarkSuite diff --git a/src/pygpukit/benchmark/gemm.py b/src/pygpukit/benchmark/gemm.py index 2e0cd91..3c15b11 100644 --- a/src/pygpukit/benchmark/gemm.py +++ b/src/pygpukit/benchmark/gemm.py @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Literal - import numpy as np from .base import Benchmark diff --git a/src/pygpukit/benchmark/gemv.py b/src/pygpukit/benchmark/gemv.py index 695ee9d..d8b0585 100644 --- a/src/pygpukit/benchmark/gemv.py +++ b/src/pygpukit/benchmark/gemv.py @@ -7,7 +7,6 @@ from .base import Benchmark from .results import BenchmarkResult - # LLM-relevant GEMV configurations LLM_CONFIGS = [ # (K, N, label)