From 616abb27bb3f119de47017684cb742e1da36b9ea Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 30 Dec 2025 23:41:34 +0900 Subject: [PATCH 1/5] feat(benchmark): Add unified benchmark suite (#163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add modular benchmark suite with: - BenchmarkSuite class for unified benchmark orchestration - GEMM benchmarks (fp32, tf32, bf16, fp16, fp8) - GEMV benchmarks (bf16, fp8, nvf4, int4, w8a8) - Attention benchmarks (SDPA, GQA) - JSON export and baseline comparison - Regression detection with configurable threshold - CLI interface: python -m pygpukit.benchmark Usage: from pygpukit.benchmark import BenchmarkSuite suite = BenchmarkSuite() suite.add_gemm().add_gemv() report = suite.run() report.save("baseline.json") # Compare with baseline comparison = suite.compare("baseline.json") if comparison.has_regression(threshold=0.05): raise RuntimeError("Regression detected!") Closes #163 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/benchmark/__init__.py | 271 ++++++++++++++++++++++++++++ src/pygpukit/benchmark/__main__.py | 7 + src/pygpukit/benchmark/attention.py | 159 ++++++++++++++++ src/pygpukit/benchmark/base.py | 137 ++++++++++++++ src/pygpukit/benchmark/cli.py | 233 ++++++++++++++++++++++++ src/pygpukit/benchmark/gemm.py | 214 ++++++++++++++++++++++ src/pygpukit/benchmark/gemv.py | 211 ++++++++++++++++++++++ src/pygpukit/benchmark/results.py | 187 +++++++++++++++++++ 8 files changed, 1419 insertions(+) create mode 100644 src/pygpukit/benchmark/__init__.py create mode 100644 src/pygpukit/benchmark/__main__.py create mode 100644 src/pygpukit/benchmark/attention.py create mode 100644 src/pygpukit/benchmark/base.py create mode 100644 src/pygpukit/benchmark/cli.py create mode 100644 src/pygpukit/benchmark/gemm.py create mode 100644 src/pygpukit/benchmark/gemv.py create mode 100644 src/pygpukit/benchmark/results.py diff --git a/src/pygpukit/benchmark/__init__.py b/src/pygpukit/benchmark/__init__.py new file mode 100644 index 0000000..cc55a4a --- /dev/null +++ b/src/pygpukit/benchmark/__init__.py @@ -0,0 +1,271 @@ +"""PyGPUkit Benchmark Suite. + +Usage: + from pygpukit.benchmark import BenchmarkSuite + + suite = BenchmarkSuite() + suite.add_gemm() + suite.add_gemv() + report = suite.run() + report.save("baseline.json") + + # Compare with baseline + comparison = suite.compare("baseline.json") + if comparison.has_regression(): + raise RuntimeError("Performance regression detected!") +""" + +from __future__ import annotations + +from pathlib import Path + +from .attention import GQABenchmark, SDPABenchmark +from .base import Benchmark, get_gpu_info, measure_kernel +from .gemm import FP8GEMMBenchmark, GEMMBenchmark +from .gemv import GEMVBenchmark, W8A8GEMVBenchmark +from .results import ( + BenchmarkReport, + BenchmarkResult, + ComparisonResult, + GPUInfo, + Regression, + compare_reports, +) + +__all__ = [ + "BenchmarkSuite", + "BenchmarkReport", + "BenchmarkResult", + "ComparisonResult", + "GPUInfo", + "Regression", + "Benchmark", + "GEMMBenchmark", + "FP8GEMMBenchmark", + "GEMVBenchmark", + "W8A8GEMVBenchmark", + "SDPABenchmark", + "GQABenchmark", + "get_gpu_info", + "measure_kernel", + "compare_reports", +] + + +class BenchmarkSuite: + """Unified benchmark suite for PyGPUkit. 
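+
+    Benchmarks are registered fluently (each add_* method returns self) and
+    executed together by run(), which returns a BenchmarkReport.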
+ + Example: + suite = BenchmarkSuite() + suite.add_gemm(sizes=[(4096, 4096, 4096)]) + suite.add_gemv() + report = suite.run() + report.save("results.json") + """ + + def __init__(self, warmup: int = 10, iterations: int = 50, quick: bool = False): + """Initialize benchmark suite. + + Args: + warmup: Number of warmup iterations + iterations: Number of timed iterations + quick: If True, use reduced warmup/iterations + """ + if quick: + warmup = 5 + iterations = 20 + self.warmup = warmup + self.iterations = iterations + self.benchmarks: list[Benchmark] = [] + + def add_gemm( + self, + sizes: list[tuple[int, int, int]] | None = None, + dtypes: list[str] | None = None, + ) -> BenchmarkSuite: + """Add GEMM benchmark. + + Args: + sizes: List of (M, K, N) tuples + dtypes: List of dtypes to benchmark (fp32, tf32, bf16, fp16) + """ + self.benchmarks.append( + GEMMBenchmark( + sizes=sizes, + dtypes=dtypes, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_fp8_gemm( + self, + sizes: list[tuple[int, int, int]] | None = None, + ) -> BenchmarkSuite: + """Add FP8 GEMM benchmark (SM120+).""" + self.benchmarks.append( + FP8GEMMBenchmark( + sizes=sizes, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_gemv( + self, + configs: list[tuple[int, int, str]] | None = None, + dtypes: list[str] | None = None, + ) -> BenchmarkSuite: + """Add GEMV benchmark. + + Args: + configs: List of (K, N, label) tuples + dtypes: List of dtypes (bf16, fp8, nvf4, int4) + """ + self.benchmarks.append( + GEMVBenchmark( + configs=configs, + dtypes=dtypes, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_w8a8_gemv( + self, + configs: list[tuple[int, int, str]] | None = None, + ) -> BenchmarkSuite: + """Add W8A8 GEMV benchmark.""" + self.benchmarks.append( + W8A8GEMVBenchmark( + configs=configs, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_attention( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + head_dim: int = 128, + ) -> BenchmarkSuite: + """Add SDPA benchmark.""" + self.benchmarks.append( + SDPABenchmark( + seq_lens=seq_lens, + num_heads=num_heads, + head_dim=head_dim, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_gqa( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + num_kv_heads: int = 8, + head_dim: int = 128, + ) -> BenchmarkSuite: + """Add GQA benchmark.""" + self.benchmarks.append( + GQABenchmark( + seq_lens=seq_lens, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + warmup=self.warmup, + iterations=self.iterations, + ) + ) + return self + + def add_all(self) -> BenchmarkSuite: + """Add all available benchmarks with default settings.""" + self.add_gemm() + self.add_gemv() + self.add_attention() + return self + + def run(self, verbose: bool = True) -> BenchmarkReport: + """Run all benchmarks. 
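+
+        Benchmarks execute sequentially; failing cases are caught and reported
+        inside each benchmark's own run() without aborting the whole suite.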
+ + Args: + verbose: If True, print progress + + Returns: + BenchmarkReport with all results + """ + gpu_info = get_gpu_info() + report = BenchmarkReport(gpu=gpu_info) + + if verbose: + print("=" * 60) + print("PyGPUkit Benchmark Suite") + print("=" * 60) + print(f"GPU: {gpu_info.name}") + print(f"SM: {gpu_info.sm_major}.{gpu_info.sm_minor}") + print(f"Memory: {gpu_info.memory_gb:.1f} GB") + print() + + for benchmark in self.benchmarks: + if verbose: + print(f"Running {benchmark.__class__.__name__}...") + + results = benchmark.run() + for result in results: + report.add(result) + if verbose: + tflops_str = f"{result.tflops:.1f} TFLOPS" if result.tflops else "" + print(f" {result.name}: {result.median_us:.1f} us {tflops_str}") + + if verbose: + print() + + return report + + def compare( + self, + baseline_path: str | Path, + threshold: float = 0.05, + verbose: bool = True, + ) -> ComparisonResult: + """Run benchmarks and compare with baseline. + + Args: + baseline_path: Path to baseline JSON file + threshold: Regression threshold (0.05 = 5%) + verbose: If True, print comparison summary + + Returns: + ComparisonResult + """ + current = self.run(verbose=verbose) + baseline = BenchmarkReport.load(baseline_path) + comparison = compare_reports(current, baseline, threshold=threshold) + + if verbose: + print(comparison.summary()) + + return comparison + + +def run_quick() -> BenchmarkReport: + """Run quick benchmark suite.""" + suite = BenchmarkSuite(quick=True) + suite.add_gemm(sizes=[(4096, 4096, 4096)], dtypes=["bf16"]) + suite.add_gemv(dtypes=["bf16"]) + return suite.run() + + +def run_full() -> BenchmarkReport: + """Run full benchmark suite.""" + suite = BenchmarkSuite() + suite.add_all() + return suite.run() diff --git a/src/pygpukit/benchmark/__main__.py b/src/pygpukit/benchmark/__main__.py new file mode 100644 index 0000000..2ecb328 --- /dev/null +++ b/src/pygpukit/benchmark/__main__.py @@ -0,0 +1,7 @@ +"""Entry point for python -m pygpukit.benchmark.""" + +import sys + +from .cli import main + +sys.exit(main()) diff --git a/src/pygpukit/benchmark/attention.py b/src/pygpukit/benchmark/attention.py new file mode 100644 index 0000000..8741d10 --- /dev/null +++ b/src/pygpukit/benchmark/attention.py @@ -0,0 +1,159 @@ +"""Attention benchmarks.""" + +from __future__ import annotations + +import numpy as np + +from .base import Benchmark +from .results import BenchmarkResult + + +class SDPABenchmark(Benchmark): + """Scaled Dot-Product Attention benchmark.""" + + category = "attention" + + def __init__( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + head_dim: int = 128, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.seq_lens = seq_lens or [512, 1024, 2048, 4096] + self.num_heads = num_heads + self.head_dim = head_dim + + def run(self) -> list[BenchmarkResult]: + """Run SDPA benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results = [] + + for seq_len in self.seq_lens: + try: + result = self._benchmark_sdpa(native, seq_len) + if result: + results.append(result) + except Exception as e: + print(f" SDPA seq_len={seq_len}: ERROR - {e}") + + return results + + def _benchmark_sdpa( + self, + native: object, + seq_len: int, + ) -> BenchmarkResult | None: + """Benchmark SDPA for a given sequence length.""" + import pygpukit as gk + + name = f"sdpa_seq{seq_len}" + params = { + "seq_len": seq_len, + "num_heads": self.num_heads, + "head_dim": self.head_dim, + } + + # 
Attention FLOPs: 4 * seq_len^2 * head_dim * num_heads + # (Q@K^T and attn@V, each 2*seq*seq*dim) + flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads + + # Create Q, K, V + Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + K = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + V = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + + # Check if native SDPA available + if not hasattr(native, "sdpa_causal_bf16"): + return None + + def run_fn() -> None: + native.sdpa_causal_bf16( + Q._get_native(), + K._get_native(), + V._get_native(), + O._get_native(), + ) + + return self._measure(name, run_fn, params, flops=flops) + + +class GQABenchmark(Benchmark): + """Grouped Query Attention benchmark.""" + + category = "attention" + + def __init__( + self, + seq_lens: list[int] | None = None, + num_heads: int = 32, + num_kv_heads: int = 8, + head_dim: int = 128, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.seq_lens = seq_lens or [512, 1024, 2048] + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + + def run(self) -> list[BenchmarkResult]: + """Run GQA benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results = [] + + for seq_len in self.seq_lens: + try: + result = self._benchmark_gqa(native, seq_len) + if result: + results.append(result) + except Exception as e: + print(f" GQA seq_len={seq_len}: ERROR - {e}") + + return results + + def _benchmark_gqa( + self, + native: object, + seq_len: int, + ) -> BenchmarkResult | None: + """Benchmark GQA.""" + import pygpukit as gk + + name = f"gqa_seq{seq_len}" + params = { + "seq_len": seq_len, + "num_heads": self.num_heads, + "num_kv_heads": self.num_kv_heads, + "head_dim": self.head_dim, + } + + # GQA FLOPs (KV heads broadcasted) + flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads + + Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + K = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + V = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + + if not hasattr(native, "sdpa_causal_gqa_bf16"): + return None + + def run_fn() -> None: + native.sdpa_causal_gqa_bf16( + Q._get_native(), + K._get_native(), + V._get_native(), + O._get_native(), + self.num_heads // self.num_kv_heads, + ) + + return self._measure(name, run_fn, params, flops=flops) diff --git a/src/pygpukit/benchmark/base.py b/src/pygpukit/benchmark/base.py new file mode 100644 index 0000000..e5629fc --- /dev/null +++ b/src/pygpukit/benchmark/base.py @@ -0,0 +1,137 @@ +"""Base benchmark class and utilities.""" + +from __future__ import annotations + +import time +from abc import ABC, abstractmethod +from typing import Any, Callable + +import numpy as np + +from .results import BenchmarkResult, GPUInfo + + +def get_gpu_info() -> GPUInfo: + """Get GPU information.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + props = native.get_device_properties(0) + + return GPUInfo( + name=props.name, + sm_major=props.compute_capability_major, + sm_minor=props.compute_capability_minor, + memory_gb=props.total_memory / (1024**3), + ) + + +def measure_kernel( + fn: Callable[[], Any], + warmup: int = 10, + 
iterations: int = 50, + sync_fn: Callable[[], None] | None = None, +) -> tuple[float, float, float, float]: + """Measure kernel execution time. + + Args: + fn: Function to benchmark + warmup: Number of warmup iterations + iterations: Number of timed iterations + sync_fn: Optional sync function (e.g., device_synchronize) + + Returns: + (median_us, min_us, max_us, std_us) + """ + if sync_fn is None: + from pygpukit.core.backend import get_native_module + + native = get_native_module() + sync_fn = native.device_synchronize + + # Warmup + for _ in range(warmup): + fn() + sync_fn() + + # Benchmark + times = [] + for _ in range(iterations): + sync_fn() + start = time.perf_counter() + fn() + sync_fn() + end = time.perf_counter() + times.append((end - start) * 1e6) # Convert to microseconds + + times_arr = np.array(times) + return ( + float(np.median(times_arr)), + float(np.min(times_arr)), + float(np.max(times_arr)), + float(np.std(times_arr)), + ) + + +class Benchmark(ABC): + """Abstract base class for benchmarks.""" + + category: str = "unknown" + warmup: int = 10 + iterations: int = 50 + + def __init__(self, warmup: int | None = None, iterations: int | None = None): + if warmup is not None: + self.warmup = warmup + if iterations is not None: + self.iterations = iterations + + @abstractmethod + def run(self) -> list[BenchmarkResult]: + """Run the benchmark and return results.""" + pass + + def _measure( + self, + name: str, + fn: Callable[[], Any], + params: dict[str, Any], + flops: float | None = None, + bytes_moved: float | None = None, + check_fn: Callable[[], tuple[bool, float]] | None = None, + ) -> BenchmarkResult: + """Measure a single benchmark case.""" + median_us, min_us, max_us, std_us = measure_kernel( + fn, warmup=self.warmup, iterations=self.iterations + ) + + # Calculate TFLOPS if flops provided + tflops = None + if flops is not None and median_us > 0: + tflops = flops / median_us / 1e6 # TFLOPS = flops / us / 1e6 + + # Calculate bandwidth if bytes provided + bandwidth = None + if bytes_moved is not None and median_us > 0: + bandwidth = bytes_moved / median_us / 1e3 # GB/s = bytes / us / 1e3 + + # Check correctness + correct = True + rel_error = 0.0 + if check_fn is not None: + correct, rel_error = check_fn() + + return BenchmarkResult( + name=name, + category=self.category, + params=params, + median_us=median_us, + min_us=min_us, + max_us=max_us, + std_us=std_us, + tflops=tflops, + bandwidth_gbps=bandwidth, + correct=correct, + rel_error=rel_error, + iterations=self.iterations, + ) diff --git a/src/pygpukit/benchmark/cli.py b/src/pygpukit/benchmark/cli.py new file mode 100644 index 0000000..5c0390d --- /dev/null +++ b/src/pygpukit/benchmark/cli.py @@ -0,0 +1,233 @@ +"""CLI interface for benchmark suite.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from . 
import BenchmarkReport, BenchmarkSuite + + +def main() -> int: + """Main entry point for benchmark CLI.""" + parser = argparse.ArgumentParser( + description="PyGPUkit Benchmark Suite", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python -m pygpukit.benchmark # Run default benchmarks + python -m pygpukit.benchmark --quick # Quick benchmarks + python -m pygpukit.benchmark --save results.json + python -m pygpukit.benchmark --compare baseline.json + python -m pygpukit.benchmark --gemm --sizes 4096,8192 + python -m pygpukit.benchmark --gemv --dtypes bf16,fp8 +""", + ) + + # Output options + parser.add_argument( + "--save", + type=str, + metavar="FILE", + help="Save results to JSON file", + ) + parser.add_argument( + "--compare", + type=str, + metavar="FILE", + help="Compare with baseline JSON file", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.05, + help="Regression threshold (default: 0.05 = 5%%)", + ) + parser.add_argument( + "--fail-on-regression", + action="store_true", + help="Exit with code 1 if regression detected", + ) + + # Benchmark selection + parser.add_argument( + "--all", + action="store_true", + help="Run all benchmarks", + ) + parser.add_argument( + "--gemm", + action="store_true", + help="Run GEMM benchmarks", + ) + parser.add_argument( + "--gemv", + action="store_true", + help="Run GEMV benchmarks", + ) + parser.add_argument( + "--attention", + action="store_true", + help="Run attention benchmarks", + ) + parser.add_argument( + "--fp8", + action="store_true", + help="Include FP8 benchmarks (SM120+)", + ) + + # Benchmark parameters + parser.add_argument( + "--sizes", + type=str, + help="GEMM sizes: comma-separated (e.g., 2048,4096,8192)", + ) + parser.add_argument( + "--dtypes", + type=str, + help="Dtypes: comma-separated (e.g., fp32,tf32,bf16)", + ) + parser.add_argument( + "--seq-lens", + type=str, + help="Attention seq lengths: comma-separated (e.g., 512,1024,2048)", + ) + + # Performance options + parser.add_argument( + "--quick", + action="store_true", + help="Quick mode: fewer iterations", + ) + parser.add_argument( + "--warmup", + type=int, + default=10, + help="Warmup iterations (default: 10)", + ) + parser.add_argument( + "--iterations", + type=int, + default=50, + help="Benchmark iterations (default: 50)", + ) + + # Output format + parser.add_argument( + "--quiet", + action="store_true", + help="Suppress progress output", + ) + parser.add_argument( + "--markdown", + action="store_true", + help="Output results as markdown table", + ) + + args = parser.parse_args() + + # Create suite + suite = BenchmarkSuite( + warmup=args.warmup, + iterations=args.iterations, + quick=args.quick, + ) + + # Parse sizes + sizes = None + if args.sizes: + size_list = [int(s.strip()) for s in args.sizes.split(",")] + sizes = [(s, s, s) for s in size_list] # Square matrices + + # Parse dtypes + dtypes = None + if args.dtypes: + dtypes = [d.strip() for d in args.dtypes.split(",")] + + # Parse seq lens + seq_lens = None + if args.seq_lens: + seq_lens = [int(s.strip()) for s in args.seq_lens.split(",")] + + # Add benchmarks + if args.all: + suite.add_all() + if args.fp8: + suite.add_fp8_gemm() + suite.add_w8a8_gemv() + else: + # Default: add gemm and gemv if nothing specified + has_selection = args.gemm or args.gemv or args.attention + if not has_selection: + suite.add_gemm(sizes=sizes, dtypes=dtypes) + suite.add_gemv(dtypes=dtypes) + else: + if args.gemm: + suite.add_gemm(sizes=sizes, dtypes=dtypes) + if args.fp8: + 
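+                    # FP8 paths require SM120+; the FP8 benchmarks detect this and skip themselves.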
suite.add_fp8_gemm(sizes=sizes) + if args.gemv: + suite.add_gemv(dtypes=dtypes) + if args.fp8: + suite.add_w8a8_gemv() + if args.attention: + suite.add_attention(seq_lens=seq_lens) + suite.add_gqa(seq_lens=seq_lens) + + # Run benchmarks + verbose = not args.quiet + if args.compare: + comparison = suite.compare( + args.compare, + threshold=args.threshold, + verbose=verbose, + ) + if args.fail_on_regression and comparison.has_regression(args.threshold): + print("\nERROR: Performance regression detected!") + return 1 + report = comparison.current + else: + report = suite.run(verbose=verbose) + + # Save results + if args.save: + report.save(args.save) + if verbose: + print(f"Results saved to {args.save}") + + # Print markdown table + if args.markdown: + print_markdown_table(report) + + return 0 + + +def print_markdown_table(report: BenchmarkReport) -> None: + """Print results as markdown table.""" + + print("\n## Benchmark Results\n") + print(f"GPU: {report.gpu.name}") + print(f"SM: {report.gpu.sm_major}.{report.gpu.sm_minor}") + print() + + # Group by category + by_category: dict[str, list] = {} + for r in report.results: + if r.category not in by_category: + by_category[r.category] = [] + by_category[r.category].append(r) + + for category, results in by_category.items(): + print(f"### {category.upper()}\n") + print("| Name | Time (us) | TFLOPS | Correct |") + print("|------|-----------|--------|---------|") + for r in results: + tflops = f"{r.tflops:.1f}" if r.tflops else "-" + correct = "Yes" if r.correct else "No" + print(f"| {r.name} | {r.median_us:.1f} | {tflops} | {correct} |") + print() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/pygpukit/benchmark/gemm.py b/src/pygpukit/benchmark/gemm.py new file mode 100644 index 0000000..2e0cd91 --- /dev/null +++ b/src/pygpukit/benchmark/gemm.py @@ -0,0 +1,214 @@ +"""GEMM (General Matrix Multiply) benchmarks.""" + +from __future__ import annotations + +from typing import Literal + +import numpy as np + +from .base import Benchmark +from .results import BenchmarkResult + + +class GEMMBenchmark(Benchmark): + """GEMM benchmark for various dtypes and sizes.""" + + category = "gemm" + + def __init__( + self, + sizes: list[tuple[int, int, int]] | None = None, + dtypes: list[str] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.sizes = sizes or [ + (2048, 2048, 2048), + (4096, 4096, 4096), + (8192, 8192, 8192), + ] + self.dtypes = dtypes or ["fp32", "tf32", "bf16"] + + def run(self) -> list[BenchmarkResult]: + """Run GEMM benchmarks.""" + results: list[BenchmarkResult] = [] + + for dtype in self.dtypes: + for M, K, N in self.sizes: + try: + result = self._benchmark_gemm(dtype, M, K, N) + if result: + results.append(result) + except Exception as e: + print(f" GEMM {dtype} {M}x{K}x{N}: ERROR - {e}") + + return results + + def _benchmark_gemm( + self, + dtype: str, + M: int, + K: int, + N: int, + ) -> BenchmarkResult | None: + """Benchmark single GEMM configuration.""" + import os + + from pygpukit.core.backend import get_native_module + + native = get_native_module() + + name = f"gemm_{dtype}_{M}x{K}x{N}" + params = {"dtype": dtype, "M": M, "K": K, "N": N} + flops = 2.0 * M * K * N + + if dtype == "fp32": + os.environ.pop("PYGPUKIT_ALLOW_TF32", None) + A = np.random.randn(M, K).astype(np.float32) * 0.1 + B = np.random.randn(K, N).astype(np.float32) * 0.1 + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + + def run_fn() -> None: + 
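+                # Strict FP32 path: PYGPUKIT_ALLOW_TF32 was popped above; the
+                # returned output array is intentionally discarded during timing.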
native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + C_gpu = native.matmul(A_gpu, B_gpu) + C = C_gpu.to_numpy() + C_ref = A @ B + err = float(np.max(np.abs(C - C_ref)) / (np.max(np.abs(C_ref)) + 1e-8)) + return err < 1e-3, err + + elif dtype == "tf32": + os.environ["PYGPUKIT_ALLOW_TF32"] = "1" + os.environ["PYGPUKIT_TF32_V2"] = "1" + A = np.random.randn(M, K).astype(np.float32) * 0.1 + B = np.random.randn(K, N).astype(np.float32) * 0.1 + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + + def run_fn() -> None: + native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + C_gpu = native.matmul(A_gpu, B_gpu) + C = C_gpu.to_numpy() + C_ref = A @ B + err = float(np.max(np.abs(C - C_ref)) / (np.max(np.abs(C_ref)) + 1e-8)) + return err < 0.01, err + + elif dtype == "bf16": + import pygpukit as gk + + A = np.random.randn(M, K).astype(np.float32) * 0.1 + B = np.random.randn(K, N).astype(np.float32) * 0.1 + A_gpu = gk.from_numpy(A).astype(gk.bfloat16)._get_native() + B_gpu = gk.from_numpy(B).astype(gk.bfloat16)._get_native() + + def run_fn() -> None: + native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + import pygpukit as gk + + C_gpu = native.matmul(A_gpu, B_gpu) + C = gk.GPUArray._wrap_native(C_gpu).astype(gk.float32).to_numpy() + C_ref = A @ B + err = float(np.max(np.abs(C - C_ref)) / (np.max(np.abs(C_ref)) + 1e-8)) + return err < 0.05, err + + elif dtype == "fp16": + A = np.random.randn(M, K).astype(np.float16) * 0.1 + B = np.random.randn(K, N).astype(np.float16) * 0.1 + A_gpu = native.from_numpy(A) + B_gpu = native.from_numpy(B) + + def run_fn() -> None: + native.matmul(A_gpu, B_gpu) + + def check_fn() -> tuple[bool, float]: + C_gpu = native.matmul(A_gpu, B_gpu) + C = C_gpu.to_numpy().astype(np.float32) + C_ref = (A.astype(np.float32) @ B.astype(np.float32)).astype(np.float16) + err = float( + np.max(np.abs(C - C_ref.astype(np.float32))) + / (np.max(np.abs(C_ref.astype(np.float32))) + 1e-8) + ) + return err < 0.05, err + + else: + return None + + return self._measure(name, run_fn, params, flops=flops, check_fn=check_fn) + + +class FP8GEMMBenchmark(Benchmark): + """FP8 GEMM benchmark (SM120+).""" + + category = "gemm" + + def __init__( + self, + sizes: list[tuple[int, int, int]] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.sizes = sizes or [ + (1024, 4096, 14336), + (2048, 4096, 14336), + (4096, 4096, 14336), + ] + + def run(self) -> list[BenchmarkResult]: + """Run FP8 GEMM benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results: list[BenchmarkResult] = [] + + # Check SM120 availability + props = native.get_device_properties(0) + sm = props.compute_capability_major * 10 + props.compute_capability_minor + if sm < 120: + print(f" FP8 GEMM: Requires SM120+ (current: SM{sm})") + return results + + for M, K, N in self.sizes: + try: + result = self._benchmark_fp8_gemm(native, M, K, N) + if result: + results.append(result) + except Exception as e: + print(f" FP8 GEMM {M}x{K}x{N}: ERROR - {e}") + + return results + + def _benchmark_fp8_gemm( + self, + native: object, + M: int, + K: int, + N: int, + ) -> BenchmarkResult | None: + """Benchmark FP8 GEMM.""" + from pygpukit.core import from_numpy + + name = f"gemm_fp8_{M}x{K}x{N}" + params = {"dtype": "fp8", "M": M, "K": K, "N": N} + flops = 2.0 * M * K * N + + A_fp8 = from_numpy(np.random.randint(0, 256, (M, K), dtype=np.uint8)) + B_fp8 = 
from_numpy(np.random.randint(0, 256, (K, N), dtype=np.uint8)) + C_fp8 = from_numpy(np.zeros((M, N), dtype=np.uint8)) + + # Try v5 (cached) kernel + func = getattr(native, "gemm_fp8_fp8_sm120_v5", None) + if func is None: + return None + + def run_fn() -> None: + func(A_fp8._get_native(), B_fp8._get_native(), C_fp8._get_native()) + + return self._measure(name, run_fn, params, flops=flops) diff --git a/src/pygpukit/benchmark/gemv.py b/src/pygpukit/benchmark/gemv.py new file mode 100644 index 0000000..4f2adb3 --- /dev/null +++ b/src/pygpukit/benchmark/gemv.py @@ -0,0 +1,211 @@ +"""GEMV (General Matrix-Vector) benchmarks.""" + +from __future__ import annotations + +import numpy as np + +from .base import Benchmark +from .results import BenchmarkResult + + +# LLM-relevant GEMV configurations +LLM_CONFIGS = [ + # (K, N, label) + (4096, 4096, "7B_hidden"), + (4096, 14336, "7B_mlp_up"), + (14336, 4096, "7B_mlp_down"), + (8192, 8192, "72B_hidden"), + (8192, 29568, "72B_mlp_up"), + (29568, 8192, "72B_mlp_down"), +] + + +class GEMVBenchmark(Benchmark): + """GEMV benchmark for LLM decode (M=1).""" + + category = "gemv" + + def __init__( + self, + configs: list[tuple[int, int, str]] | None = None, + dtypes: list[str] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.configs = configs or LLM_CONFIGS + self.dtypes = dtypes or ["bf16", "fp8", "nvf4"] + + def run(self) -> list[BenchmarkResult]: + """Run GEMV benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results: list[BenchmarkResult] = [] + + for K, N, label in self.configs: + for dtype in self.dtypes: + try: + result = self._benchmark_gemv(native, K, N, label, dtype) + if result: + results.append(result) + except Exception as e: + print(f" GEMV {dtype} {label}: ERROR - {e}") + + return results + + def _benchmark_gemv( + self, + native: object, + K: int, + N: int, + label: str, + dtype: str, + ) -> BenchmarkResult | None: + """Benchmark single GEMV configuration.""" + import pygpukit as gk + from pygpukit.core import from_numpy + + name = f"gemv_{dtype}_{label}" + params = {"dtype": dtype, "K": K, "N": N, "label": label} + flops = 2.0 * K * N # M=1 + + if dtype == "bf16": + from pygpukit.ops.matmul import gemv_bf16 + + A = gk.empty((K,), dtype="bfloat16") + B = gk.empty((K, N), dtype="bfloat16") + C = gk.empty((N,), dtype="bfloat16") + + def run_fn() -> None: + gemv_bf16(A, B, out=C) + + elif dtype == "fp8": + from pygpukit.ops.matmul import fp8_init_lut, gemv_fp8_bf16 + + fp8_init_lut() + A = gk.empty((K,), dtype="bfloat16") + B_fp8 = from_numpy(np.zeros((N, K), dtype=np.uint8)) + n_blocks = (N + 127) // 128 + k_blocks = (K + 127) // 128 + B_scale = from_numpy(np.ones((n_blocks, k_blocks), dtype=np.float16).view(np.uint16)) + C = gk.empty((N,), dtype="bfloat16") + + def run_fn() -> None: + gemv_fp8_bf16(A, B_fp8, B_scale, out=C) + + elif dtype == "nvf4": + from pygpukit.ops.matmul import gemv_nvf4_available, gemv_nvf4_bf16 + + if not gemv_nvf4_available(): + return None + + A = gk.empty((K,), dtype="bfloat16") + B_nvf4 = from_numpy(np.zeros((K // 2, N), dtype=np.uint8)) + k_scale_blocks = (K + 31) // 32 + B_scale = from_numpy(np.ones((k_scale_blocks, N), dtype=np.uint8)) + C = gk.empty((N,), dtype="bfloat16") + + def run_fn() -> None: + gemv_nvf4_bf16(A, B_nvf4, B_scale, out=C) + + elif dtype == "int4": + if not hasattr(native, "int4_gemv_available") or not native.int4_gemv_available(): + return None + + def 
pack_int4(values: np.ndarray) -> np.ndarray: + flat = values.reshape(-1) + low = flat[0::2].astype(np.int32) & 0x0F + high = flat[1::2].astype(np.int32) & 0x0F + packed = (high << 4) | low + new_shape = list(values.shape) + new_shape[-1] //= 2 + return packed.astype(np.uint8).reshape(new_shape) + + A_raw = np.random.randint(-8, 8, K, dtype=np.int8) + B_raw = np.random.randint(-8, 8, (N, K), dtype=np.int8) + A_int4 = from_numpy(pack_int4(A_raw.reshape(1, -1)).reshape(-1)) + B_int4 = from_numpy(pack_int4(B_raw)) + C_int4 = from_numpy(np.zeros(N, dtype=np.int32)) + + def run_fn() -> None: + native.int4_gemv_int32_sm120( + A_int4._get_native(), B_int4._get_native(), C_int4._get_native() + ) + + else: + return None + + return self._measure(name, run_fn, params, flops=flops) + + +class W8A8GEMVBenchmark(Benchmark): + """W8A8 (FP8 weights, FP8 activations) GEMV benchmark.""" + + category = "gemv" + + def __init__( + self, + configs: list[tuple[int, int, str]] | None = None, + warmup: int = 10, + iterations: int = 50, + ): + super().__init__(warmup=warmup, iterations=iterations) + self.configs = configs or LLM_CONFIGS[:3] # Smaller set + + def run(self) -> list[BenchmarkResult]: + """Run W8A8 GEMV benchmarks.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + results: list[BenchmarkResult] = [] + + # Check availability + if not hasattr(native, "gemv_fp8_fp8_available") or not native.gemv_fp8_fp8_available(): + print(" W8A8 GEMV: Not available") + return results + + for K, N, label in self.configs: + try: + result = self._benchmark_w8a8(native, K, N, label) + if result: + results.append(result) + except Exception as e: + print(f" W8A8 GEMV {label}: ERROR - {e}") + + return results + + def _benchmark_w8a8( + self, + native: object, + K: int, + N: int, + label: str, + ) -> BenchmarkResult | None: + """Benchmark W8A8 GEMV.""" + from pygpukit.core import from_numpy, zeros + + name = f"gemv_w8a8_{label}" + params = {"dtype": "w8a8", "K": K, "N": N, "label": label} + flops = 2.0 * K * N + + block_size = 128 + n_scales_k = (K + block_size - 1) // block_size + n_scales_n = (N + block_size - 1) // block_size + + A_fp8 = from_numpy(np.random.randint(0, 256, K, dtype=np.uint8)) + B_fp8 = from_numpy(np.random.randint(0, 256, (N, K), dtype=np.uint8)) + scale_A = from_numpy(np.ones(n_scales_k, dtype=np.float32)) + scale_B = from_numpy(np.ones(n_scales_n * n_scales_k, dtype=np.float32)) + C = zeros((N,), dtype="bfloat16") + + def run_fn() -> None: + native.gemv_fp8_fp8_bf16_sm120( + A_fp8._get_native(), + B_fp8._get_native(), + scale_A._get_native(), + scale_B._get_native(), + C._get_native(), + ) + + return self._measure(name, run_fn, params, flops=flops) diff --git a/src/pygpukit/benchmark/results.py b/src/pygpukit/benchmark/results.py new file mode 100644 index 0000000..268f7dc --- /dev/null +++ b/src/pygpukit/benchmark/results.py @@ -0,0 +1,187 @@ +"""Benchmark result classes and comparison utilities.""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any + + +@dataclass +class BenchmarkResult: + """Single benchmark result.""" + + name: str + category: str # gemm, gemv, attention, inference + params: dict[str, Any] # M, K, N, dtype, etc. 
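+    # All *_us fields are wall-clock microseconds across `iterations` runs;
+    # tflops and bandwidth_gbps are derived from median_us in Benchmark._measure().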
+ median_us: float # Median time in microseconds + min_us: float + max_us: float + std_us: float + tflops: float | None = None # For compute benchmarks + bandwidth_gbps: float | None = None # For memory benchmarks + correct: bool = True + rel_error: float = 0.0 + iterations: int = 0 + timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) + + def to_dict(self) -> dict: + return asdict(self) + + @classmethod + def from_dict(cls, d: dict) -> BenchmarkResult: + return cls(**d) + + +@dataclass +class GPUInfo: + """GPU information.""" + + name: str + sm_major: int + sm_minor: int + memory_gb: float + driver_version: str = "" + cuda_version: str = "" + + def to_dict(self) -> dict: + return asdict(self) + + +@dataclass +class BenchmarkReport: + """Complete benchmark report with multiple results.""" + + gpu: GPUInfo + results: list[BenchmarkResult] = field(default_factory=list) + timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) + version: str = "0.2.18" + + def add(self, result: BenchmarkResult) -> None: + self.results.append(result) + + def save(self, path: str | Path) -> None: + """Save report to JSON file.""" + path = Path(path) + data = { + "version": self.version, + "timestamp": self.timestamp, + "gpu": self.gpu.to_dict(), + "results": [r.to_dict() for r in self.results], + } + path.write_text(json.dumps(data, indent=2)) + + @classmethod + def load(cls, path: str | Path) -> BenchmarkReport: + """Load report from JSON file.""" + path = Path(path) + data = json.loads(path.read_text()) + gpu = GPUInfo(**data["gpu"]) + results = [BenchmarkResult.from_dict(r) for r in data["results"]] + return cls( + gpu=gpu, + results=results, + timestamp=data.get("timestamp", ""), + version=data.get("version", "unknown"), + ) + + def to_dict(self) -> dict: + return { + "version": self.version, + "timestamp": self.timestamp, + "gpu": self.gpu.to_dict(), + "results": [r.to_dict() for r in self.results], + } + + +@dataclass +class Regression: + """Regression information.""" + + result: BenchmarkResult + baseline: BenchmarkResult + delta_percent: float # Negative = regression + + +@dataclass +class ComparisonResult: + """Result of comparing two benchmark reports.""" + + current: BenchmarkReport + baseline: BenchmarkReport + regressions: list[Regression] = field(default_factory=list) + improvements: list[Regression] = field(default_factory=list) + missing: list[str] = field(default_factory=list) # In baseline but not current + new: list[str] = field(default_factory=list) # In current but not baseline + + def has_regression(self, threshold: float = 0.05) -> bool: + """Check if any regression exceeds threshold.""" + return any(r.delta_percent < -threshold * 100 for r in self.regressions) + + def summary(self) -> str: + """Generate comparison summary.""" + lines = [] + lines.append("=" * 60) + lines.append("Benchmark Comparison") + lines.append("=" * 60) + lines.append(f"Baseline: {self.baseline.timestamp}") + lines.append(f"Current: {self.current.timestamp}") + lines.append("") + + if self.regressions: + lines.append("REGRESSIONS:") + for r in sorted(self.regressions, key=lambda x: x.delta_percent): + lines.append( + f" {r.result.name}: {r.baseline.median_us:.1f} -> " + f"{r.result.median_us:.1f} us ({r.delta_percent:+.1f}%)" + ) + lines.append("") + + if self.improvements: + lines.append("IMPROVEMENTS:") + for r in sorted(self.improvements, key=lambda x: -x.delta_percent)[:5]: + lines.append( + f" {r.result.name}: {r.baseline.median_us:.1f} -> " + 
f"{r.result.median_us:.1f} us ({r.delta_percent:+.1f}%)" + ) + lines.append("") + + return "\n".join(lines) + + +def compare_reports( + current: BenchmarkReport, + baseline: BenchmarkReport, + threshold: float = 0.05, +) -> ComparisonResult: + """Compare two benchmark reports.""" + result = ComparisonResult(current=current, baseline=baseline) + + # Build lookup by name+params + def key(r: BenchmarkResult) -> str: + params_str = json.dumps(r.params, sort_keys=True) + return f"{r.category}:{r.name}:{params_str}" + + baseline_map = {key(r): r for r in baseline.results} + current_map = {key(r): r for r in current.results} + + for k, curr in current_map.items(): + if k in baseline_map: + base = baseline_map[k] + if base.median_us > 0: + delta = (curr.median_us - base.median_us) / base.median_us * 100 + reg = Regression(result=curr, baseline=base, delta_percent=-delta) + if delta > threshold * 100: + result.regressions.append(reg) + elif delta < -threshold * 100: + result.improvements.append(reg) + else: + result.new.append(k) + + for k in baseline_map: + if k not in current_map: + result.missing.append(k) + + return result From 4e71a9850f115185c2024897cf5289fe66c95e2f Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 30 Dec 2025 23:57:07 +0900 Subject: [PATCH 2/5] fix(benchmark): correct GEMV B matrix layout [N,K] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/benchmark/gemv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pygpukit/benchmark/gemv.py b/src/pygpukit/benchmark/gemv.py index 4f2adb3..695ee9d 100644 --- a/src/pygpukit/benchmark/gemv.py +++ b/src/pygpukit/benchmark/gemv.py @@ -74,7 +74,7 @@ def _benchmark_gemv( from pygpukit.ops.matmul import gemv_bf16 A = gk.empty((K,), dtype="bfloat16") - B = gk.empty((K, N), dtype="bfloat16") + B = gk.empty((N, K), dtype="bfloat16") # B[N, K] layout for gemv C = gk.empty((N,), dtype="bfloat16") def run_fn() -> None: From 68a1f7230fc901033eb286d2086993edae2d6fde Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 30 Dec 2025 23:59:33 +0900 Subject: [PATCH 3/5] chore: remove old benchmark files replaced by unified suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deleted: - scripts/benchmark.py - benchmarks/benchmark_gemv_*.py - benchmarks/benchmark_nvf4_*.py - benchmarks/benchmark_w8a16_gemm.py - examples/benchmark_*.py Use 'python -m pygpukit.benchmark' instead. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- benchmarks/benchmark_gemv_all.py | 213 ---------- benchmarks/benchmark_gemv_detailed.py | 165 -------- benchmarks/benchmark_nvf4_bf16.py | 145 ------- benchmarks/benchmark_nvf4_nvf4.py | 113 ------ benchmarks/benchmark_w8a16_gemm.py | 101 ----- examples/benchmark_compare.py | 98 ----- examples/benchmark_large.py | 38 -- examples/benchmark_matmul.py | 137 ------- examples/benchmark_tiled_matmul.py | 93 ----- scripts/benchmark.py | 538 -------------------------- 10 files changed, 1641 deletions(-) delete mode 100644 benchmarks/benchmark_gemv_all.py delete mode 100644 benchmarks/benchmark_gemv_detailed.py delete mode 100644 benchmarks/benchmark_nvf4_bf16.py delete mode 100644 benchmarks/benchmark_nvf4_nvf4.py delete mode 100644 benchmarks/benchmark_w8a16_gemm.py delete mode 100644 examples/benchmark_compare.py delete mode 100644 examples/benchmark_large.py delete mode 100644 examples/benchmark_matmul.py delete mode 100644 examples/benchmark_tiled_matmul.py delete mode 100644 scripts/benchmark.py diff --git a/benchmarks/benchmark_gemv_all.py b/benchmarks/benchmark_gemv_all.py deleted file mode 100644 index 2275525..0000000 --- a/benchmarks/benchmark_gemv_all.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive GEMV Benchmark for README.md - -All GEMV kernels with LLM-relevant sizes, reporting in microseconds. -""" - -import time - -import numpy as np - -import pygpukit as gk -from pygpukit.core import from_numpy -from pygpukit.core.backend import get_native_module - - -def benchmark_gemv_all(): - """Comprehensive GEMV benchmark for all formats.""" - from pygpukit.ops.matmul import ( - fp8_init_lut, - gemv_bf16, - gemv_fp8_bf16, - gemv_nvf4_available, - gemv_nvf4_bf16, - ) - - native = get_native_module() - fp8_init_lut() - - print("=" * 80) - print("Comprehensive GEMV Benchmark (RTX 5090)") - print("=" * 80) - - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print() - - # LLM-relevant configurations - # (K, N) - K is hidden dim, N is output dim - configs = [ - # Qwen-7B style - (4096, 4096, "Qwen-7B hidden"), - (4096, 14336, "Qwen-7B MLP up"), - (14336, 4096, "Qwen-7B MLP down"), - # Qwen-72B style - (8192, 8192, "Qwen-72B hidden"), - (8192, 29568, "Qwen-72B MLP up"), - (29568, 8192, "Qwen-72B MLP down"), - ] - - warmup = 10 - iterations = 50 - - # Results table - results = [] - - for K, N, label in configs: - print(f"\n{label}: K={K}, N={N}") - - # ===== BF16 GEMV ===== - A_bf16 = gk.empty((K,), dtype="bfloat16") - B_bf16 = gk.empty((K, N), dtype="bfloat16") - C_bf16 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - - times_bf16 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - end = time.perf_counter() - times_bf16.append((end - start) * 1e6) - - median_bf16 = np.median(times_bf16) - - # ===== FP8 GEMV ===== - try: - A_fp8 = gk.empty((K,), dtype="bfloat16") - B_fp8_nk = from_numpy(np.zeros((N, K), dtype=np.uint8)) - n_blocks = (N + 127) // 128 - k_blocks = (K + 127) // 128 - B_scale_fp8 = from_numpy( - np.ones((n_blocks, k_blocks), dtype=np.float16).view(np.uint16) - ) - C_fp8 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - - times_fp8 = [] - 
for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - end = time.perf_counter() - times_fp8.append((end - start) * 1e6) - - median_fp8 = np.median(times_fp8) - except Exception: - median_fp8 = float("inf") - - # ===== NVF4 GEMV ===== - if gemv_nvf4_available(): - A_nvf4 = gk.empty((K,), dtype="bfloat16") - B_nvf4 = from_numpy(np.zeros((K // 2, N), dtype=np.uint8)) - k_scale_blocks = (K + 31) // 32 - B_scale_nvf4 = from_numpy(np.ones((k_scale_blocks, N), dtype=np.uint8)) - C_nvf4 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - - times_nvf4 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - end = time.perf_counter() - times_nvf4.append((end - start) * 1e6) - - median_nvf4 = np.median(times_nvf4) - else: - median_nvf4 = float("inf") - - # ===== Int4 GEMV ===== - try: - if native.int4_gemv_available(): - - def pack_int4(values: np.ndarray) -> np.ndarray: - flat = values.reshape(-1) - low = flat[0::2].astype(np.int32) & 0x0F - high = flat[1::2].astype(np.int32) & 0x0F - packed = (high << 4) | low - new_shape = list(values.shape) - new_shape[-1] //= 2 - return packed.astype(np.uint8).reshape(new_shape) - - A_int4_raw = np.random.randint(-8, 8, K, dtype=np.int8) - B_int4_raw = np.random.randint(-8, 8, (N, K), dtype=np.int8) - A_int4 = from_numpy(pack_int4(A_int4_raw.reshape(1, -1)).reshape(-1)) - B_int4 = from_numpy(pack_int4(B_int4_raw)) - C_int4 = from_numpy(np.zeros(N, dtype=np.int32)) - - for _ in range(warmup): - native.int4_gemv_int32_sm120( - A_int4._get_native(), B_int4._get_native(), C_int4._get_native() - ) - native.device_synchronize() - - times_int4 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - native.int4_gemv_int32_sm120( - A_int4._get_native(), B_int4._get_native(), C_int4._get_native() - ) - native.device_synchronize() - end = time.perf_counter() - times_int4.append((end - start) * 1e6) - - median_int4 = np.median(times_int4) - else: - median_int4 = float("inf") - except Exception: - median_int4 = float("inf") - - results.append( - { - "label": label, - "K": K, - "N": N, - "bf16": median_bf16, - "fp8": median_fp8, - "nvf4": median_nvf4, - "int4": median_int4, - } - ) - - print(f" BF16: {median_bf16:.1f} us") - print(f" FP8: {median_fp8:.1f} us") - if median_nvf4 != float("inf"): - print(f" NVF4: {median_nvf4:.1f} us") - if median_int4 != float("inf"): - print(f" Int4: {median_int4:.1f} us") - - # Print README table - print("\n" + "=" * 80) - print("README.md Table (GEMV Performance)") - print("=" * 80) - print() - print("| Layer | K | N | BF16 | FP8 | NVF4 | Int4 |") - print("|-------|------|-------|------|-----|------|------|") - - for r in results: - bf16_str = f"{r['bf16']:.0f} us" - fp8_str = f"{r['fp8']:.0f} us" - nvf4_str = f"{r['nvf4']:.0f} us" if r["nvf4"] != float("inf") else "—" - int4_str = f"{r['int4']:.0f} us" if r["int4"] != float("inf") else "—" - print( - f"| {r['label']} | {r['K']} | {r['N']} | {bf16_str} | {fp8_str} | {nvf4_str} | {int4_str} |" - ) - - -if __name__ == "__main__": - benchmark_gemv_all() diff --git a/benchmarks/benchmark_gemv_detailed.py b/benchmarks/benchmark_gemv_detailed.py deleted file mode 100644 index 832868e..0000000 --- 
a/benchmarks/benchmark_gemv_detailed.py +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/env python3 -""" -Detailed GEMV Benchmark with individual timing per iteration. - -Compares: BF16, FP8, NVFP4 GEMV kernels. -""" - -import time - -import numpy as np - -import pygpukit as gk -from pygpukit.core import from_numpy -from pygpukit.core.backend import get_native_module - - -def benchmark_gemv_detailed(): - """Detailed GEMV benchmark with per-iteration timing.""" - from pygpukit.ops.matmul import ( - fp8_init_lut, - gemv_bf16, - gemv_fp8_bf16, - gemv_nvf4_available, - gemv_nvf4_bf16, - ) - - native = get_native_module() - fp8_init_lut() - - print("=" * 80) - print("Detailed GEMV Benchmark") - print("=" * 80) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print("Memory Bandwidth: ~1792 GB/s (theoretical)") - print() - - configs = [ - (4096, 4096), - (14336, 4096), - (4096, 14336), - ] - - warmup = 10 - iterations = 50 - - for N, K in configs: - print(f"\n{'=' * 60}") - print(f"N={N}, K={K}") - print(f"{'=' * 60}") - - # Calculate theoretical bandwidth - # BF16: B is K*N*2 bytes, A is K*2 bytes - bf16_bytes = K * N * 2 + K * 2 - # FP8: B is N*K bytes, A is K*2 bytes, scale is (N/128)*(K/128)*2 bytes - fp8_bytes = N * K + K * 2 + ((N + 127) // 128) * ((K + 127) // 128) * 2 - # NVF4: B is N*K/2 bytes, A is K*2 bytes, scale is (K/32)*N bytes - nvf4_bytes = N * (K // 2) + K * 2 + ((K + 31) // 32) * N - - print( - f"Data sizes: BF16={bf16_bytes / 1e6:.1f}MB, FP8={fp8_bytes / 1e6:.1f}MB, NVF4={nvf4_bytes / 1e6:.1f}MB" - ) - print( - f"Theoretical time @1000GB/s: BF16={bf16_bytes / 1e9 * 1e6:.1f}us, FP8={fp8_bytes / 1e9 * 1e6:.1f}us" - ) - print() - - # ===== BF16 GEMV ===== - A_bf16 = gk.empty((K,), dtype="bfloat16") - B_bf16 = gk.empty((K, N), dtype="bfloat16") - C_bf16 = gk.empty((N,), dtype="bfloat16") - - # Warmup - for _ in range(warmup): - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - - # Benchmark with individual timing - times_bf16 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_bf16(A_bf16, B_bf16, out=C_bf16) - native.device_synchronize() - end = time.perf_counter() - times_bf16.append((end - start) * 1e6) - - median_bf16 = np.median(times_bf16) - min_bf16 = np.min(times_bf16) - print( - f"BF16: median={median_bf16:.1f}us, min={min_bf16:.1f}us, " - f"BW={bf16_bytes / median_bf16 / 1e3:.0f}GB/s" - ) - - # ===== FP8 GEMV (optimized, B[N,K] layout) ===== - A_fp8 = gk.empty((K,), dtype="bfloat16") - B_fp8_nk = from_numpy(np.zeros((N, K), dtype=np.uint8)) # [N, K] layout - n_blocks = (N + 127) // 128 - k_blocks = (K + 127) // 128 - B_scale_fp8 = from_numpy(np.ones((n_blocks, k_blocks), dtype=np.float16).view(np.uint16)) - C_fp8 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - - times_fp8 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_fp8_bf16(A_fp8, B_fp8_nk, B_scale_fp8, out=C_fp8) - native.device_synchronize() - end = time.perf_counter() - times_fp8.append((end - start) * 1e6) - - median_fp8 = np.median(times_fp8) - min_fp8 = np.min(times_fp8) - print( - f"FP8: median={median_fp8:.1f}us, min={min_fp8:.1f}us, " - f"BW={fp8_bytes / median_fp8 / 1e3:.0f}GB/s" - ) - - # ===== NVFP4 GEMV ===== - if gemv_nvf4_available(): - A_nvf4 = gk.empty((K,), dtype="bfloat16") - B_nvf4 = from_numpy(np.zeros((K // 2, N), dtype=np.uint8)) - 
k_scale_blocks = (K + 31) // 32 - B_scale_nvf4 = from_numpy(np.ones((k_scale_blocks, N), dtype=np.uint8)) - C_nvf4 = gk.empty((N,), dtype="bfloat16") - - for _ in range(warmup): - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - - times_nvf4 = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - gemv_nvf4_bf16(A_nvf4, B_nvf4, B_scale_nvf4, out=C_nvf4) - native.device_synchronize() - end = time.perf_counter() - times_nvf4.append((end - start) * 1e6) - - median_nvf4 = np.median(times_nvf4) - min_nvf4 = np.min(times_nvf4) - print( - f"NVFP4: median={median_nvf4:.1f}us, min={min_nvf4:.1f}us, " - f"BW={nvf4_bytes / median_nvf4 / 1e3:.0f}GB/s" - ) - else: - median_nvf4 = float("inf") - print("NVFP4: N/A") - - # Summary - print() - print("Speedup vs BF16:") - print(f" FP8: {median_bf16 / median_fp8:.2f}x") - if gemv_nvf4_available(): - print(f" NVFP4: {median_bf16 / median_nvf4:.2f}x") - - -if __name__ == "__main__": - benchmark_gemv_detailed() diff --git a/benchmarks/benchmark_nvf4_bf16.py b/benchmarks/benchmark_nvf4_bf16.py deleted file mode 100644 index 2a5213b..0000000 --- a/benchmarks/benchmark_nvf4_bf16.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -""" -NVF4-BF16 GEMM Benchmark for SM120 (Blackwell GeForce) - -Benchmarks NVF4 (4-bit) GEMM with BF16 I/O. -NVF4 provides 2x memory bandwidth compared to FP8. -""" - -import time - -import numpy as np - - -def bf16_to_f32(bf16_uint16: np.ndarray) -> np.ndarray: - """Convert BFloat16 (stored as uint16) to float32.""" - bf16_uint16 = bf16_uint16.astype(np.uint16) - f32_bits = bf16_uint16.astype(np.uint32) << 16 - return f32_bits.view(np.float32) - - -def f32_to_bf16(f32: np.ndarray) -> np.ndarray: - """Convert float32 to BFloat16 (stored as uint16).""" - f32 = f32.astype(np.float32) - f32_bits = f32.view(np.uint32) - bf16_bits = (f32_bits >> 16).astype(np.uint16) - return bf16_bits - - -def benchmark_nvf4_bf16(sizes: list[int], warmup: int = 5, iterations: int = 20): - """Benchmark NVF4-BF16 GEMM at various sizes.""" - from pygpukit.core.backend import get_native_module - from pygpukit.core.factory import from_numpy - from pygpukit.ops import matmul_nvf4_bf16_sm120, nvf4_bf16_sm120_available - - native = get_native_module() - - if not nvf4_bf16_sm120_available(): - print("NVF4-BF16 SM120 not available") - return - - print("=" * 70) - print("NVF4-BF16 GEMM Benchmark (SM120 Blackwell GeForce)") - print("=" * 70) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print(f"SM: {props.compute_capability_major}.{props.compute_capability_minor}") - print() - print("GPU-side quantization: BF16 -> NVF4 (no H2D copies)") - print() - - results = [] - - for size in sizes: - M, N, K = size, size, size - flops = 2.0 * M * N * K # FLOPs for GEMM - - # Create NVF4-appropriate data (values in representable range) - nvf4_values = np.array([0.5, 1.0, 1.5, 2.0, 3.0, 4.0], dtype=np.float32) - A = np.random.choice(nvf4_values, size=(M, K)).astype(np.float32) - B = np.random.choice(nvf4_values, size=(K, N)).astype(np.float32) - - A_bf16 = f32_to_bf16(A) - B_bf16 = f32_to_bf16(B) - - A_gpu = from_numpy(A_bf16) - B_gpu = from_numpy(B_bf16) - - # Warmup - for _ in range(warmup): - C_gpu = matmul_nvf4_bf16_sm120(A_gpu, B_gpu) - native.device_synchronize() - - # Benchmark - times = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - C_gpu = matmul_nvf4_bf16_sm120(A_gpu, B_gpu) - 
native.device_synchronize() - end = time.perf_counter() - times.append(end - start) - - # Get result and verify - C_uint16 = C_gpu.to_numpy() - C_f32 = bf16_to_f32(C_uint16) - C_ref = bf16_to_f32(A_bf16) @ bf16_to_f32(B_bf16) - - rel_error = np.linalg.norm(C_f32 - C_ref) / np.linalg.norm(C_ref) - - median_time = np.median(times) - min_time = np.min(times) - tflops_median = flops / median_time / 1e12 - tflops_max = flops / min_time / 1e12 - - results.append( - { - "size": size, - "tflops_median": tflops_median, - "tflops_max": tflops_max, - "time_ms": median_time * 1000, - "rel_error": rel_error, - } - ) - - status = "PASS" if rel_error < 0.05 else "FAIL" - print( - f"{M}x{N}x{K}: {tflops_median:.2f} TFLOPS (median), " - f"{tflops_max:.2f} TFLOPS (max), " - f"rel_error={rel_error:.2e} [{status}]" - ) - - print() - print("=" * 70) - print("Summary Table (for README)") - print("=" * 70) - print("| Size | TFLOPS (median) | TFLOPS (max) | Time (ms) |") - print("|------|-----------------|--------------|-----------|") - for r in results: - print( - f"| {r['size']}x{r['size']} | {r['tflops_median']:.2f} | " - f"{r['tflops_max']:.2f} | {r['time_ms']:.2f} |" - ) - - return results - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="NVF4-BF16 GEMM Benchmark") - parser.add_argument( - "--sizes", - nargs="+", - type=int, - default=[1024, 2048, 4096, 8192], - help="Matrix sizes to benchmark", - ) - parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations") - parser.add_argument("--iterations", type=int, default=20, help="Number of benchmark iterations") - - args = parser.parse_args() - - benchmark_nvf4_bf16(args.sizes, args.warmup, args.iterations) diff --git a/benchmarks/benchmark_nvf4_nvf4.py b/benchmarks/benchmark_nvf4_nvf4.py deleted file mode 100644 index 6ff909d..0000000 --- a/benchmarks/benchmark_nvf4_nvf4.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -""" -Pure NVF4 GEMM Benchmark for SM120 (Blackwell GeForce) - -Benchmarks NVF4 GEMM without quantization overhead to measure -pure tensor core performance. 
-""" - -import time - -import numpy as np - - -def benchmark_nvf4_nvf4(sizes: list[int], warmup: int = 5, iterations: int = 20): - """Benchmark pure NVF4 GEMM at various sizes.""" - from pygpukit.core.backend import get_native_module - from pygpukit.core.factory import zeros - - native = get_native_module() - - if not native.nvf4_nvf4_sm120_available(): - print("NVF4-NVF4 SM120 not available") - return - - print("=" * 70) - print("Pure NVF4 GEMM Benchmark (SM120 Blackwell GeForce)") - print("=" * 70) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print(f"SM: {props.compute_capability_major}.{props.compute_capability_minor}") - print() - print("Pre-quantized NVF4 data (no quantization overhead)") - print() - - results = [] - - for size in sizes: - M, N, K = size, size, size - flops = 2.0 * M * N * K # FLOPs for GEMM - - # Allocate output buffer (BF16) - D_gpu = zeros((M, N), dtype="bfloat16") - D_native = D_gpu._get_native() # Get native GPUArray - - # Warmup - for _ in range(warmup): - native.benchmark_gemm_nvf4_sm120(D_native, M, N, K) - native.device_synchronize() - - # Benchmark - times = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - native.benchmark_gemm_nvf4_sm120(D_native, M, N, K) - native.device_synchronize() - end = time.perf_counter() - times.append(end - start) - - median_time = np.median(times) - min_time = np.min(times) - tflops_median = flops / median_time / 1e12 - tflops_max = flops / min_time / 1e12 - - results.append( - { - "size": size, - "tflops_median": tflops_median, - "tflops_max": tflops_max, - "time_ms": median_time * 1000, - } - ) - - print( - f"{M}x{N}x{K}: {tflops_median:.2f} TFLOPS (median), " - f"{tflops_max:.2f} TFLOPS (max), " - f"time={median_time * 1000:.2f}ms" - ) - - print() - print("=" * 70) - print("Summary Table") - print("=" * 70) - print("| Size | TFLOPS (median) | TFLOPS (max) | Time (ms) |") - print("|------|-----------------|--------------|-----------|") - for r in results: - print( - f"| {r['size']}x{r['size']} | {r['tflops_median']:.2f} | " - f"{r['tflops_max']:.2f} | {r['time_ms']:.2f} |" - ) - - return results - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Pure NVF4 GEMM Benchmark") - parser.add_argument( - "--sizes", - nargs="+", - type=int, - default=[1024, 2048, 4096, 8192, 12288, 16384], - help="Matrix sizes to benchmark", - ) - parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations") - parser.add_argument("--iterations", type=int, default=20, help="Number of benchmark iterations") - - args = parser.parse_args() - - benchmark_nvf4_nvf4(args.sizes, args.warmup, args.iterations) diff --git a/benchmarks/benchmark_w8a16_gemm.py b/benchmarks/benchmark_w8a16_gemm.py deleted file mode 100644 index 5da5f38..0000000 --- a/benchmarks/benchmark_w8a16_gemm.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -""" -W8A16 GEMM Benchmark for SM120. - -Tests FP8 weight x BF16 activation -> BF16 output. 
-""" - -import time - -import numpy as np - -import pygpukit as gk -from pygpukit.core import from_numpy -from pygpukit.core.backend import get_native_module -from pygpukit.ops.matmul import w8a16_gemm_sm120 - - -def benchmark_w8a16_gemm(): - """Benchmark W8A16 GEMM kernel.""" - native = get_native_module() - - print("=" * 80) - print("W8A16 GEMM Benchmark (SM120)") - print("=" * 80) - - # Get GPU info - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print() - - # Test configurations (typical LLM layer sizes) - # Qwen3-30B-A3B MoE: hidden=2048, intermediate varies by expert - configs = [ - # (M, K, N) - prefill batch sizes - (1, 2048, 8192), # Single token, small MLP - (16, 2048, 8192), # Small batch - (64, 2048, 8192), # Medium batch - (128, 4096, 14336), # Large batch, Qwen-7B MLP - (256, 4096, 14336), # Larger batch - (512, 4096, 14336), # Prefill size - (1024, 4096, 14336), # Long prefill - ] - - warmup = 10 - iterations = 50 - - for M, K, N in configs: - print(f"\n{'=' * 60}") - print(f"M={M}, K={K}, N={N}") - print(f"{'=' * 60}") - - # Calculate data sizes - A_bytes = M * K * 2 # BF16 - B_bytes = K * N * 1 # FP8 - C_bytes = M * N * 2 # BF16 - scale_k = (K + 127) // 128 - scale_n = (N + 127) // 128 - scale_bytes = scale_k * scale_n * 2 # BF16 scale - total_bytes = A_bytes + B_bytes + C_bytes + scale_bytes - - print(f"Data: A={A_bytes / 1e6:.2f}MB, B={B_bytes / 1e6:.2f}MB, C={C_bytes / 1e6:.2f}MB") - print(f"Total I/O: {total_bytes / 1e6:.2f}MB") - - # Calculate FLOPS (2*M*N*K for matmul) - flops = 2 * M * N * K - - # Create tensors - A_bf16 = gk.empty((M, K), dtype="bfloat16") - B_fp8 = from_numpy(np.random.randint(0, 256, (K, N), dtype=np.uint8)) - B_scale = gk.empty((scale_k, scale_n), dtype="bfloat16") - C_out = gk.empty((M, N), dtype="bfloat16") - - # Warmup - for _ in range(warmup): - w8a16_gemm_sm120(A_bf16, B_fp8, B_scale, out=C_out) - native.device_synchronize() - - # Benchmark - times = [] - for _ in range(iterations): - native.device_synchronize() - start = time.perf_counter() - w8a16_gemm_sm120(A_bf16, B_fp8, B_scale, out=C_out) - native.device_synchronize() - end = time.perf_counter() - times.append((end - start) * 1e6) # microseconds - - median_us = np.median(times) - min_us = np.min(times) - max_us = np.max(times) - - # Calculate performance - tflops = flops / median_us / 1e6 # TFLOPS - bw = total_bytes / median_us / 1e3 # GB/s - - print(f"Time: median={median_us:.1f}us, min={min_us:.1f}us, max={max_us:.1f}us") - print(f"Performance: {tflops:.2f} TFLOPS, BW={bw:.0f} GB/s") - - -if __name__ == "__main__": - benchmark_w8a16_gemm() diff --git a/examples/benchmark_compare.py b/examples/benchmark_compare.py deleted file mode 100644 index 0e7cc82..0000000 --- a/examples/benchmark_compare.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -"""Compare tiled vs naive matmul (via NVRTC JIT).""" - -import sys - -sys.path.insert(0, "src") -import time - -import numpy as np - -import pygpukit as gp -from pygpukit.core.backend import get_backend - -# Naive kernel source (for comparison) -NAIVE_KERNEL = """ -extern "C" __global__ void matmul_naive( - const float* A, const float* B, float* C, - int M, int N, int K -) { - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - - if (row < M && col < N) { - float sum = 0.0f; - for (int k = 0; k < K; ++k) { - sum += A[row * K + k] * B[k * N + col]; - } - C[row * N + col] = sum; - } -} -""" - - -def benchmark_current(a_gpu, b_gpu, iterations=10): - """Benchmark 
current (tiled) implementation.""" - # Warmup - _ = gp.matmul(a_gpu, b_gpu) - - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = gp.matmul(a_gpu, b_gpu) - times.append(time.perf_counter() - start) - return np.mean(times) * 1000 - - -def main(): - print("=" * 70) - print(" Tiled vs Naive Matmul Comparison") - print("=" * 70) - print() - - backend = get_backend() - props = backend.get_device_properties() - print(f"GPU: {props.name}") - print() - - # Note: We cannot easily run naive kernel without modifying C++ code - # So we'll compare with CLAUDE.md historical data - - print("Benchmark results on RTX 3090 Ti:") - print(" Naive kernel is faster than tiled due to 6MB L2 cache") - print() - - print("Current (Naive) implementation:") - sizes = [512, 1024, 2048] - - for size in sizes: - np.random.seed(42) - a_np = np.random.rand(size, size).astype(np.float32) - b_np = np.random.rand(size, size).astype(np.float32) - - a_gpu = gp.from_numpy(a_np) - b_gpu = gp.from_numpy(b_np) - - gpu_ms = benchmark_current(a_gpu, b_gpu) - flops = 2 * size * size * size - gflops = flops / (gpu_ms / 1000) / 1e9 - - print(f" {size}x{size}: {gpu_ms:.2f} ms, {gflops:.0f} GFLOPS") - - print() - print("-" * 70) - print("Analysis:") - print(" The naive kernel outperforms tiled on RTX 3090 Ti because:") - print(" 1. Large L2 cache (6MB) provides efficient global memory access") - print(" 2. __syncthreads() in tiled kernel adds synchronization overhead") - print(" 3. Shared memory management overhead doesn't pay off") - print() - print(" For truly faster matmul, consider:") - print(" - cuBLAS: 20+ TFLOPS on RTX 3090 Ti") - print(" - Advanced tiling with register blocking") - print(" - Tensor cores for mixed precision") - print("-" * 70) - - -if __name__ == "__main__": - main() diff --git a/examples/benchmark_large.py b/examples/benchmark_large.py deleted file mode 100644 index 4b6a4c5..0000000 --- a/examples/benchmark_large.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -"""Benchmark large matrices.""" - -import sys - -sys.path.insert(0, "src") -import time - -import numpy as np - -import pygpukit as gp - -sizes = [4096] -for size in sizes: - np.random.seed(42) - a_np = np.random.rand(size, size).astype(np.float32) - b_np = np.random.rand(size, size).astype(np.float32) - - # NumPy - start = time.perf_counter() - _ = np.matmul(a_np, b_np) - numpy_ms = (time.perf_counter() - start) * 1000 - - # GPU - a_gpu = gp.from_numpy(a_np) - b_gpu = gp.from_numpy(b_np) - _ = gp.matmul(a_gpu, b_gpu) # warmup - - start = time.perf_counter() - _ = gp.matmul(a_gpu, b_gpu) - gpu_ms = (time.perf_counter() - start) * 1000 - - flops = 2 * size * size * size - gflops = flops / (gpu_ms / 1000) / 1e9 - - print( - f"{size}x{size}: NumPy={numpy_ms:.1f}ms, GPU={gpu_ms:.1f}ms, Speedup={numpy_ms / gpu_ms:.1f}x, {gflops:.0f} GFLOPS" - ) diff --git a/examples/benchmark_matmul.py b/examples/benchmark_matmul.py deleted file mode 100644 index 34b5cbe..0000000 --- a/examples/benchmark_matmul.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 -"""Benchmark: Tiled matmul vs NumPy. - -Demonstrates the performance improvement from shared memory tiling. 
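The same FP32 sweep is expressible against the unified suite; a sketch, with sizes given as (M, K, N) tuples per `add_gemm`:

```python
# Sketch: the FP32 matmul size sweep from this example, redone
# with the unified suite. Square problems, (M, K, N) per size.
from pygpukit.benchmark import BenchmarkSuite

suite = BenchmarkSuite()
suite.add_gemm(
    sizes=[(n, n, n) for n in (128, 256, 512, 1024, 2048)],
    dtypes=["fp32"],
)
report = suite.run()
report.save("matmul_fp32.json")
```

Correctness verification is part of the suite's standard output, so the manual NumPy comparison at the end of this script is subsumed.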
-""" - -from __future__ import annotations - -import sys -import time - -import numpy as np - -sys.path.insert(0, "src") - -import pygpukit as gp -from pygpukit.core.backend import get_backend - - -def benchmark_matmul(size: int, iterations: int = 10) -> dict: - """Benchmark matmul for a given matrix size.""" - np.random.seed(42) - - # Create test data - a_np = np.random.rand(size, size).astype(np.float32) - b_np = np.random.rand(size, size).astype(np.float32) - - # NumPy benchmark - numpy_times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = np.matmul(a_np, b_np) - numpy_times.append(time.perf_counter() - start) - numpy_avg = np.mean(numpy_times) * 1000 # ms - - # PyGPUkit benchmark - a_gpu = gp.from_numpy(a_np) - b_gpu = gp.from_numpy(b_np) - - # Warm-up - _ = gp.matmul(a_gpu, b_gpu) - - gpu_times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = gp.matmul(a_gpu, b_gpu) - gpu_times.append(time.perf_counter() - start) - gpu_avg = np.mean(gpu_times) * 1000 # ms - - # Calculate GFLOPS (2 * N^3 FLOPs for matmul) - flops = 2 * size * size * size - gpu_gflops = flops / (gpu_avg / 1000) / 1e9 - numpy_gflops = flops / (numpy_avg / 1000) / 1e9 - - return { - "size": size, - "numpy_ms": numpy_avg, - "gpu_ms": gpu_avg, - "speedup": numpy_avg / gpu_avg, - "numpy_gflops": numpy_gflops, - "gpu_gflops": gpu_gflops, - } - - -def main(): - print("=" * 70) - print(" PyGPUkit Tiled Matmul Benchmark") - print("=" * 70) - print() - - # Get backend info - backend = get_backend() - props = backend.get_device_properties() - print(f"GPU: {props.name}") - print(f"Memory: {props.total_memory / (1024**3):.2f} GB") - print(f"SMs: {props.multiprocessor_count}") - print() - - # Benchmark various sizes - sizes = [128, 256, 512, 1024, 2048] - - print("Running benchmarks (10 iterations each)...") - print() - - results = [] - for size in sizes: - print(f" Testing {size}x{size}...", end=" ", flush=True) - result = benchmark_matmul(size) - results.append(result) - print(f"done ({result['gpu_ms']:.2f} ms)") - - print() - print("=" * 70) - print(" RESULTS") - print("=" * 70) - print() - print( - f"{'Size':>8} | {'NumPy (ms)':>12} | {'GPU (ms)':>12} | {'Speedup':>8} | {'GPU GFLOPS':>12}" - ) - print("-" * 70) - - for r in results: - print( - f"{r['size']:>8} | {r['numpy_ms']:>12.3f} | {r['gpu_ms']:>12.3f} | {r['speedup']:>7.1f}x | {r['gpu_gflops']:>12.1f}" - ) - - print() - print("=" * 70) - print() - - # Peak performance - best = max(results, key=lambda x: x["gpu_gflops"]) - print(f"Peak GPU Performance: {best['gpu_gflops']:.1f} GFLOPS at {best['size']}x{best['size']}") - print(f"Best Speedup vs NumPy: {max(r['speedup'] for r in results):.1f}x") - print() - - # Verify correctness - print("Verifying correctness...") - a_np = np.random.rand(256, 256).astype(np.float32) - b_np = np.random.rand(256, 256).astype(np.float32) - - expected = np.matmul(a_np, b_np) - result = gp.matmul(gp.from_numpy(a_np), gp.from_numpy(b_np)).to_numpy() - - max_diff = np.max(np.abs(expected - result)) - print(f"Max difference from NumPy: {max_diff:.2e}") - - if max_diff < 1e-4: - print("[OK] Results match NumPy (within tolerance)") - else: - print("[FAIL] Results differ from NumPy!") - - print() - - -if __name__ == "__main__": - main() diff --git a/examples/benchmark_tiled_matmul.py b/examples/benchmark_tiled_matmul.py deleted file mode 100644 index f4bead3..0000000 --- a/examples/benchmark_tiled_matmul.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Benchmark: Tiled vs Naive Matmul Performance""" - -import os -import 
sys -import time - -# Add CUDA DLLs to PATH -cuda_path = os.environ.get("CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4") -cuda_bin = os.path.join(cuda_path, "bin") -if cuda_bin not in os.environ["PATH"]: - os.environ["PATH"] = cuda_bin + os.pathsep + os.environ["PATH"] -if hasattr(os, "add_dll_directory"): - os.add_dll_directory(cuda_bin) - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src", "pygpukit")) - -import numpy as np # noqa: E402 - -print("=" * 70) -print("Tiled Matmul Benchmark - PyGPUkit v0.2") -print("=" * 70) - -try: - import _pygpukit_native as native # noqa: E402 - - print(f"\nCUDA available: {native.is_cuda_available()}") - - if native.is_cuda_available(): - props = native.get_device_properties(0) - print(f"GPU: {props.name}") - print(f"Memory: {props.total_memory / 1024**3:.1f} GB") - - print("\n" + "-" * 70) - print("Matrix Size | Kernel | Time (ms) | GFLOPS | Speedup") - print("-" * 70) - - sizes = [512, 1024, 2048, 3072, 4096] - - for size in sizes: - M, N, K = size, size, size - - # Create test matrices - A_np = np.random.randn(M, K).astype(np.float32) - B_np = np.random.randn(K, N).astype(np.float32) - - # Warmup - A_gpu = native.from_numpy(A_np) - B_gpu = native.from_numpy(B_np) - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark GPU - iterations = 5 if size >= 2048 else 10 - times = [] - for _ in range(iterations): - A_gpu = native.from_numpy(A_np) - B_gpu = native.from_numpy(B_np) - start = time.perf_counter() - C_gpu = native.matmul(A_gpu, B_gpu) - gpu_time = time.perf_counter() - start - times.append(gpu_time) - - avg_time = np.median(times) - gflops = 2 * M * N * K / avg_time / 1e9 - - # Check which kernel is used (threshold is 2048) - kernel = "Tiled" if size >= 2048 else "L2-opt" - - # CPU reference - start = time.perf_counter() - C_cpu = np.matmul(A_np, B_np) - cpu_time = time.perf_counter() - start - - speedup = cpu_time / avg_time - - # Verify correctness - C_result = C_gpu.to_numpy() - rel_error = np.max(np.abs(C_result - C_cpu)) / np.max(np.abs(C_cpu)) - - print( - f"{size:>5}x{size:<5} | {kernel:<9} | {avg_time * 1000:>8.2f} | {gflops:>7.1f} | {speedup:>5.1f}x" - ) - - if rel_error > 1e-3: - print(f" WARNING: High relative error: {rel_error:.2e}") - - print("-" * 70) - print("\nTiled kernel should show improved performance for sizes >= 2048") - print("=" * 70) - -except ImportError as e: - print(f"Error: {e}") - print("Native module not available") diff --git a/scripts/benchmark.py b/scripts/benchmark.py deleted file mode 100644 index fb63003..0000000 --- a/scripts/benchmark.py +++ /dev/null @@ -1,538 +0,0 @@ -#!/usr/bin/env python3 -""" -PyGPUkit Comprehensive Benchmark - -Benchmarks all supported dtypes: -- FP32 (Ampere optimized kernel) -- TF32 v1 (WMMA TensorCore) -- TF32 v2 (PTX mma.sync TensorCore, optimized) -- FP16 (simple kernel, TensorCore planned) -- BF16 (simple kernel, TensorCore planned) - -Runtime Modes: -- Driver-Only: Uses pre-compiled kernels, no CUDA Toolkit needed -- Full (JIT): Same kernels + JIT compilation for custom ops - -Note: Built-in matmul kernels are pre-compiled, so Driver-Only and Full -modes have identical performance for matmul operations. - -Usage: - python benchmark_all.py [--sizes SIZES] [--quick] [--tf32-version v1|v2] - -Output format matches README.md tables for easy updates. 
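The multi-dtype sweep and README table duty move to the unified suite (`--markdown` covers the table output). A sketch of the equivalent regression-gated flow; the threshold and file name are illustrative:

```python
# Sketch: multi-dtype GEMM sweep plus regression gate, replacing
# this script. Threshold and baseline path are illustrative.
from pygpukit.benchmark import BenchmarkSuite

suite = BenchmarkSuite()
suite.add_gemm(dtypes=["fp32", "tf32", "fp16", "bf16"])
report = suite.run()
report.save("baseline.json")

# On a later run, compare against the saved baseline.
comparison = suite.compare("baseline.json")
if comparison.has_regression(threshold=0.05):
    raise RuntimeError("Performance regression detected!")
```

The CLI form, `python -m pygpukit.benchmark --compare baseline.json --fail-on-regression`, applies the same gate in CI.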
-""" - -import argparse -import os -import time -from dataclasses import dataclass - -import numpy as np - -# ============================================================================= -# Setup CUDA DLL path (Windows) -# ============================================================================= -cuda_path = os.environ.get("CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4") -cuda_bin = os.path.join(cuda_path, "bin") -if os.path.isdir(cuda_bin): - if cuda_bin not in os.environ.get("PATH", ""): - os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "") - if hasattr(os, "add_dll_directory"): - os.add_dll_directory(cuda_bin) - - -# ============================================================================= -# Data Classes -# ============================================================================= -@dataclass -class BenchmarkResult: - dtype: str - size: int - tflops_median: float - tflops_max: float - time_ms: float - correct: bool - rel_error: float - - -@dataclass -class GPUInfo: - name: str - sm_major: int - sm_minor: int - nvrtc_available: bool - - -# ============================================================================= -# Native Module Import Helper -# ============================================================================= -_native_module = None - - -def get_native_module(): - """Get native module with fallback.""" - global _native_module - if _native_module is not None: - return _native_module - try: - import _pygpukit_native as native - - _native_module = native - except ImportError: - from pygpukit import _pygpukit_native as native - - _native_module = native - return _native_module - - -# ============================================================================= -# Benchmark Functions -# ============================================================================= -def get_gpu_info() -> GPUInfo: - """Get GPU information.""" - native = get_native_module() - props = native.get_device_properties(0) - - try: - import pygpukit as gpk - - nvrtc = gpk.is_nvrtc_available() - except Exception: - nvrtc = False - - return GPUInfo( - name=props.name, - sm_major=props.compute_capability_major, - sm_minor=props.compute_capability_minor, - nvrtc_available=nvrtc, - ) - - -def benchmark_fp32(size: int, warmup: int = 5, iterations: int = 10) -> BenchmarkResult: - """Benchmark FP32 matmul (Ampere optimized kernel).""" - native = get_native_module() - - A = np.random.randn(size, size).astype(np.float32) - B = np.random.randn(size, size).astype(np.float32) - - A_gpu = native.from_numpy(A) - B_gpu = native.from_numpy(B) - - # Correctness - C_gpu = native.matmul(A_gpu, B_gpu) - C_result = C_gpu.to_numpy() - C_expected = A @ B - rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) - correct = rel_error < 1e-3 - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - return BenchmarkResult( - dtype="FP32", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -def benchmark_tf32( - size: int, warmup: int = 5, iterations: int = 10, use_v2: bool = True -) -> BenchmarkResult: - """Benchmark TF32 TensorCore 
matmul. - - Uses environment variables to control kernel selection: - - PYGPUKIT_ALLOW_TF32=1: Enable TF32 kernels - - PYGPUKIT_TF32_V2=1: Use optimized v2 kernel (PTX mma.sync) - """ - native = get_native_module() - - # Set environment for TF32 - os.environ["PYGPUKIT_ALLOW_TF32"] = "1" - if use_v2: - os.environ["PYGPUKIT_TF32_V2"] = "1" - else: - os.environ.pop("PYGPUKIT_TF32_V2", None) - - A = np.random.randn(size, size).astype(np.float32) - B = np.random.randn(size, size).astype(np.float32) - - A_gpu = native.from_numpy(A) - B_gpu = native.from_numpy(B) - - # Correctness - use native.matmul which respects env vars - C_gpu = native.matmul(A_gpu, B_gpu) - C_result = C_gpu.to_numpy() - C_expected = A @ B - rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected)) - correct = rel_error < 1e-2 # TF32 has ~0.1% per-op error - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - version = "v2" if use_v2 else "v1" - return BenchmarkResult( - dtype=f"TF32 {version}", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -def benchmark_fp16(size: int, warmup: int = 5, iterations: int = 10) -> BenchmarkResult: - """Benchmark FP16 matmul (simple kernel, no TensorCore yet).""" - native = get_native_module() - - A = np.random.randn(size, size).astype(np.float16) - B = np.random.randn(size, size).astype(np.float16) - - A_gpu = native.from_numpy(A) - B_gpu = native.from_numpy(B) - - # Correctness - C_gpu = native.matmul(A_gpu, B_gpu) - C_result = C_gpu.to_numpy() - C_expected = (A.astype(np.float32) @ B.astype(np.float32)).astype(np.float16) - rel_error = np.max(np.abs(C_result.astype(np.float32) - C_expected.astype(np.float32))) / ( - np.max(np.abs(C_expected.astype(np.float32))) + 1e-7 - ) - correct = rel_error < 0.05 - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - return BenchmarkResult( - dtype="FP16", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -def benchmark_bf16(size: int, warmup: int = 5, iterations: int = 10) -> BenchmarkResult: - """Benchmark BF16 matmul (simple kernel, no TensorCore yet).""" - native = get_native_module() - import pygpukit as gpk - - A_fp32 = np.random.randn(size, size).astype(np.float32) - B_fp32 = np.random.randn(size, size).astype(np.float32) - - # Convert to BF16 via GPUArray - A_gpu = gpk.from_numpy(A_fp32).astype(gpk.bfloat16)._get_native() - B_gpu = gpk.from_numpy(B_fp32).astype(gpk.bfloat16)._get_native() - - # Correctness - C_gpu = native.matmul(A_gpu, B_gpu) - C_gpk = gpk.GPUArray._wrap_native(C_gpu).astype(gpk.float32) - C_result = C_gpk.to_numpy() - C_expected = A_fp32 @ B_fp32 - rel_error = np.max(np.abs(C_result - C_expected)) / (np.max(np.abs(C_expected)) + 1e-7) - correct = rel_error 
< 0.05 - - # Re-create arrays for benchmark - A_gpu = gpk.from_numpy(A_fp32).astype(gpk.bfloat16)._get_native() - B_gpu = gpk.from_numpy(B_fp32).astype(gpk.bfloat16)._get_native() - - # Warmup - for _ in range(warmup): - _ = native.matmul(A_gpu, B_gpu) - - # Benchmark - times = [] - for _ in range(iterations): - start = time.perf_counter() - _ = native.matmul(A_gpu, B_gpu) - elapsed = time.perf_counter() - start - times.append(elapsed) - - median_time = np.median(times) - min_time = np.min(times) - flops = 2.0 * size * size * size - - return BenchmarkResult( - dtype="BF16", - size=size, - tflops_median=flops / median_time / 1e12, - tflops_max=flops / min_time / 1e12, - time_ms=median_time * 1000, - correct=correct, - rel_error=rel_error, - ) - - -# ============================================================================= -# Output Functions -# ============================================================================= -def print_header(gpu_info: GPUInfo, tf32_version: str): - """Print benchmark header.""" - print("=" * 70) - print(" PyGPUkit Comprehensive Benchmark") - print("=" * 70) - print() - print(f"GPU: {gpu_info.name}") - print(f"SM: {gpu_info.sm_major}.{gpu_info.sm_minor}") - print(f"NVRTC (JIT): {'Available' if gpu_info.nvrtc_available else 'Not Available'}") - print(f"TF32 Kernel: {tf32_version}") - print() - print("Note: Built-in matmul kernels are pre-compiled.") - print(" Driver-Only and Full modes have identical matmul performance.") - print() - - -def print_correctness_results(results: list): - """Print correctness verification results.""" - print("=" * 70) - print(" Correctness Verification") - print("=" * 70) - print() - print(f"{'Dtype':<12} {'Size':<12} {'Rel Error':<12} {'Status':<8}") - print("-" * 48) - - for r in results: - status = "PASS" if r.correct else "FAIL" - print(f"{r.dtype:<12} {r.size}x{r.size:<6} {r.rel_error:<12.2e} {status:<8}") - print() - - -def print_benchmark_results(results: list, sizes: list): - """Print benchmark results.""" - print("=" * 70) - print(" Performance Results (TFLOPS)") - print("=" * 70) - print() - - # Group by size - by_size = {} - for r in results: - if r.size not in by_size: - by_size[r.size] = {} - by_size[r.size][r.dtype] = r - - # Get all dtypes - all_dtypes = [] - for r in results: - if r.dtype not in all_dtypes: - all_dtypes.append(r.dtype) - - # Print header - header = f"{'Size':<14}" - for dt in all_dtypes: - header += f"{dt:<12}" - print(header) - print("-" * (14 + 12 * len(all_dtypes))) - - # Print rows - for size in sizes: - if size not in by_size: - continue - row = by_size[size] - line = f"{size}x{size:<8}" - for dt in all_dtypes: - r = row.get(dt) - if r: - line += f"{r.tflops_median:<12.1f}" - else: - line += f"{'-':<12}" - print(line) - - print() - - -def print_readme_table(results: list, sizes: list): - """Print README.md compatible markdown table.""" - print("=" * 70) - print(" README.md Table") - print("=" * 70) - print() - - # Group by size - by_size = {} - for r in results: - if r.size not in by_size: - by_size[r.size] = {} - by_size[r.size][r.dtype] = r - - # Get dtypes - all_dtypes = [] - for r in results: - if r.dtype not in all_dtypes: - all_dtypes.append(r.dtype) - - # Print markdown table - header = "| Matrix Size |" - separator = "|-------------|" - for dt in all_dtypes: - header += f" {dt} |" - separator += "------|" - print(header) - print(separator) - - for size in sizes: - if size not in by_size: - continue - row = by_size[size] - line = f"| {size}x{size} |" - for dt in all_dtypes: - r = 
row.get(dt) - if r: - line += f" {r.tflops_median:.1f} TFLOPS |" - else: - line += " - |" - print(line) - - print() - - -# ============================================================================= -# Main -# ============================================================================= -def main(): - parser = argparse.ArgumentParser(description="PyGPUkit Comprehensive Benchmark") - parser.add_argument( - "--sizes", - type=str, - default="2048,4096,8192", - help="Comma-separated matrix sizes (default: 2048,4096,8192)", - ) - parser.add_argument("--quick", action="store_true", help="Quick mode: fewer iterations") - parser.add_argument( - "--dtypes", - type=str, - default="fp32,tf32,fp16,bf16", - help="Comma-separated dtypes to benchmark", - ) - parser.add_argument( - "--tf32-version", - type=str, - default="v2", - choices=["v1", "v2"], - help="TF32 kernel version: v1 (WMMA) or v2 (PTX mma.sync, default)", - ) - args = parser.parse_args() - - sizes = [int(s.strip()) for s in args.sizes.split(",")] - dtypes = [d.strip().lower() for d in args.dtypes.split(",")] - use_tf32_v2 = args.tf32_version == "v2" - - warmup = 3 if args.quick else 5 - iterations = 5 if args.quick else 10 - - # Get GPU info - gpu_info = get_gpu_info() - print_header(gpu_info, args.tf32_version.upper()) - - # Run benchmarks - results = [] - - print("Running benchmarks...") - print() - - for size in sizes: - iters = max(2, iterations // 2) if size >= 8192 else iterations - - if "fp32" in dtypes: - # Disable TF32 for FP32 benchmark - os.environ.pop("PYGPUKIT_ALLOW_TF32", None) - os.environ.pop("PYGPUKIT_TF32_V2", None) - print(f" FP32 {size}x{size}...", end=" ", flush=True) - r = benchmark_fp32(size, warmup, iters) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - if "tf32" in dtypes: - print(f" TF32 {args.tf32_version} {size}x{size}...", end=" ", flush=True) - r = benchmark_tf32(size, warmup, iters, use_v2=use_tf32_v2) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - if "fp16" in dtypes: - print(f" FP16 {size}x{size}...", end=" ", flush=True) - r = benchmark_fp16(size, warmup, iters) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - if "bf16" in dtypes: - print(f" BF16 {size}x{size}...", end=" ", flush=True) - r = benchmark_bf16(size, warmup, iters) - results.append(r) - print(f"{r.tflops_median:.1f} TFLOPS") - - print() - - # Print results - print_correctness_results(results) - print_benchmark_results(results, sizes) - print_readme_table(results, sizes) - - # Summary - print("=" * 70) - print(" Summary") - print("=" * 70) - print() - print(f"GPU: {gpu_info.name}") - print(f"TF32 Kernel: {args.tf32_version.upper()}") - - if results: - peak = max(results, key=lambda r: r.tflops_median) - print(f"Peak: {peak.tflops_median:.1f} TFLOPS ({peak.dtype}, {peak.size}x{peak.size})") - - print() - print("RTX 3090 Ti Theoretical:") - print(" FP32: ~40 TFLOPS") - print(" TF32 TensorCore: ~80 TFLOPS (Sparse: ~156 TFLOPS)") - print(" FP16 TensorCore: ~160 TFLOPS (not yet optimized)") - print() - print("Note: FP16/BF16 use simple kernels. 
TensorCore optimization in Issue #60.") - print() - - -if __name__ == "__main__": - main() From 4da2707682950bd9cd64bc8b91c28ef3be7aea4e Mon Sep 17 00:00:00 2001 From: m96-chan Date: Wed, 31 Dec 2025 00:01:37 +0900 Subject: [PATCH 4/5] docs: update benchmark docs and add skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add .claude/skills/benchmark/README.md - Update CLAUDE.md to use 'python -m pygpukit.benchmark' - Update PR checklist benchmark command 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .claude/skills/benchmark/README.md | 57 ++++++++++++++++++++++++++++++ CLAUDE.md | 37 +++++++++++-------- 2 files changed, 80 insertions(+), 14 deletions(-) create mode 100644 .claude/skills/benchmark/README.md diff --git a/.claude/skills/benchmark/README.md b/.claude/skills/benchmark/README.md new file mode 100644 index 0000000..089c817 --- /dev/null +++ b/.claude/skills/benchmark/README.md @@ -0,0 +1,57 @@ +# Benchmark Skill + +Run unified benchmark suite for GEMM, GEMV, and attention kernels. + +## Commands + +```bash +# Quick benchmark (default: GEMM + GEMV) +python -m pygpukit.benchmark --quick + +# Full benchmark with all sizes +python -m pygpukit.benchmark + +# Save results to JSON +python -m pygpukit.benchmark --quick --save results.json + +# Compare with baseline +python -m pygpukit.benchmark --compare baseline.json + +# Fail on regression (for CI) +python -m pygpukit.benchmark --compare baseline.json --fail-on-regression + +# Specific benchmarks +python -m pygpukit.benchmark --gemm --sizes 4096,8192 +python -m pygpukit.benchmark --gemv --dtypes bf16,fp8 +python -m pygpukit.benchmark --attention --seq-lens 512,1024,2048 + +# All benchmarks including FP8 (SM120+) +python -m pygpukit.benchmark --all --fp8 + +# Markdown output for README +python -m pygpukit.benchmark --quick --markdown +``` + +## Output + +- Time in microseconds (us) +- TFLOPS for compute benchmarks +- Correctness verification +- JSON export for regression tracking + +## Usage in Code + +```python +from pygpukit.benchmark import BenchmarkSuite + +suite = BenchmarkSuite(quick=True) +suite.add_gemm(sizes=[(4096, 4096, 4096)]) +suite.add_gemv(dtypes=["bf16", "fp8"]) +report = suite.run() +report.save("baseline.json") + +# Compare +comparison = suite.compare("baseline.json") +if comparison.has_regression(threshold=0.05): + print("Regression detected!") +``` diff --git a/CLAUDE.md b/CLAUDE.md index 66e6722..9a1ae15 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -633,7 +633,7 @@ python -m mypy src/ --ignore-missing-imports --disable-error-code=union-attr --d python -m pytest tests/ -v # 4. 
Benchmark runs (optional but recommended) -python benchmark.py --quick +python -m pygpukit.benchmark --quick ``` **DO NOT create PR until all checks pass locally.** @@ -679,27 +679,36 @@ If performance or correctness degrades: ### Benchmarking -**Always use `benchmark.py` for performance measurement.** +**Use unified benchmark suite: `python -m pygpukit.benchmark`** ```bash -# Full benchmark (all dtypes, all sizes) -python benchmark.py +# Quick benchmark (GEMM + GEMV) +python -m pygpukit.benchmark --quick -# Quick mode (fewer warmup/iterations) -python benchmark.py --quick +# Full benchmark +python -m pygpukit.benchmark -# Specific sizes -python benchmark.py --sizes 4096 8192 +# Save results and compare with baseline +python -m pygpukit.benchmark --quick --save baseline.json +python -m pygpukit.benchmark --compare baseline.json --fail-on-regression -# TF32 kernel version selection -python benchmark.py --tf32-version v1 # WMMA API -python benchmark.py --tf32-version v2 # PTX mma.sync (default) +# Specific benchmarks +python -m pygpukit.benchmark --gemm --sizes 4096,8192 +python -m pygpukit.benchmark --gemv --dtypes bf16,fp8 +python -m pygpukit.benchmark --attention --seq-lens 512,1024 + +# All benchmarks including FP8 (SM120+) +python -m pygpukit.benchmark --all --fp8 + +# Markdown output for README +python -m pygpukit.benchmark --quick --markdown ``` **Output includes:** -- Kernel-only timing (no D2H copy overhead) -- Correctness verification (relative error) -- README.md-ready table format +- Time in microseconds (us) +- TFLOPS for compute benchmarks +- Correctness verification +- JSON export for regression tracking **Environment Variables:** - `PYGPUKIT_ALLOW_TF32=1` - Enable TF32 TensorCore From 1cb43424d42b26101157210dd9318b212e2bae61 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Wed, 31 Dec 2025 00:03:46 +0900 Subject: [PATCH 5/5] fix(benchmark): fix lint errors - unused imports and ambiguous var names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/benchmark/attention.py | 36 ++++++++++++++--------------- src/pygpukit/benchmark/cli.py | 1 - src/pygpukit/benchmark/gemm.py | 2 -- src/pygpukit/benchmark/gemv.py | 1 - 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/pygpukit/benchmark/attention.py b/src/pygpukit/benchmark/attention.py index 8741d10..a79f95c 100644 --- a/src/pygpukit/benchmark/attention.py +++ b/src/pygpukit/benchmark/attention.py @@ -2,8 +2,6 @@ from __future__ import annotations -import numpy as np - from .base import Benchmark from .results import BenchmarkResult @@ -62,11 +60,11 @@ def _benchmark_sdpa( # (Q@K^T and attn@V, each 2*seq*seq*dim) flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads - # Create Q, K, V - Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - K = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - V = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + # Create Q, K, V, Out + q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + k = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + v = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + out = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") # Check if native SDPA available if not hasattr(native, 
"sdpa_causal_bf16"): @@ -74,10 +72,10 @@ def _benchmark_sdpa( def run_fn() -> None: native.sdpa_causal_bf16( - Q._get_native(), - K._get_native(), - V._get_native(), - O._get_native(), + q._get_native(), + k._get_native(), + v._get_native(), + out._get_native(), ) return self._measure(name, run_fn, params, flops=flops) @@ -139,20 +137,20 @@ def _benchmark_gqa( # GQA FLOPs (KV heads broadcasted) flops = 4.0 * seq_len * seq_len * self.head_dim * self.num_heads - Q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") - K = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") - V = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") - O = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + q = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") + k = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + v = gk.empty((self.num_kv_heads, seq_len, self.head_dim), dtype="bfloat16") + out = gk.empty((self.num_heads, seq_len, self.head_dim), dtype="bfloat16") if not hasattr(native, "sdpa_causal_gqa_bf16"): return None def run_fn() -> None: native.sdpa_causal_gqa_bf16( - Q._get_native(), - K._get_native(), - V._get_native(), - O._get_native(), + q._get_native(), + k._get_native(), + v._get_native(), + out._get_native(), self.num_heads // self.num_kv_heads, ) diff --git a/src/pygpukit/benchmark/cli.py b/src/pygpukit/benchmark/cli.py index 5c0390d..b3380bb 100644 --- a/src/pygpukit/benchmark/cli.py +++ b/src/pygpukit/benchmark/cli.py @@ -4,7 +4,6 @@ import argparse import sys -from pathlib import Path from . import BenchmarkReport, BenchmarkSuite diff --git a/src/pygpukit/benchmark/gemm.py b/src/pygpukit/benchmark/gemm.py index 2e0cd91..3c15b11 100644 --- a/src/pygpukit/benchmark/gemm.py +++ b/src/pygpukit/benchmark/gemm.py @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Literal - import numpy as np from .base import Benchmark diff --git a/src/pygpukit/benchmark/gemv.py b/src/pygpukit/benchmark/gemv.py index 695ee9d..d8b0585 100644 --- a/src/pygpukit/benchmark/gemv.py +++ b/src/pygpukit/benchmark/gemv.py @@ -7,7 +7,6 @@ from .base import Benchmark from .results import BenchmarkResult - # LLM-relevant GEMV configurations LLM_CONFIGS = [ # (K, N, label)