diff --git a/native/bindings/ops_bindings.cpp b/native/bindings/ops_bindings.cpp
index d46b536..539a41f 100644
--- a/native/bindings/ops_bindings.cpp
+++ b/native/bindings/ops_bindings.cpp
@@ -31,4 +31,13 @@ void init_ops_bindings(py::module_& m) {
     m.def("matmul_", py::overload_cast<const GPUArray&, const GPUArray&, GPUArray&>(&ops::matmul),
           py::arg("a"), py::arg("b"), py::arg("out"),
           "Matrix multiplication with output array");
+
+    // TF32 variants
+    m.def("matmul_tf32", py::overload_cast<const GPUArray&, const GPUArray&, bool>(&ops::matmul),
+          py::arg("a"), py::arg("b"), py::arg("use_tf32"),
+          "Matrix multiplication with explicit TF32 control");
+
+    m.def("matmul_tf32_", py::overload_cast<const GPUArray&, const GPUArray&, GPUArray&, bool>(&ops::matmul),
+          py::arg("a"), py::arg("b"), py::arg("out"), py::arg("use_tf32"),
+          "Matrix multiplication with explicit TF32 control and output array");
 }
diff --git a/native/ops/basic.cu b/native/ops/basic.cu
index 78001f1..fdd7980 100644
--- a/native/ops/basic.cu
+++ b/native/ops/basic.cu
@@ -939,5 +939,149 @@ GPUArray matmul(const GPUArray& a, const GPUArray& b) {
     return c;
 }
 
+// Internal helper: matmul with explicit TF32 control
+static void matmul_impl(const GPUArray& a, const GPUArray& b, GPUArray& c, bool use_tf32_explicit) {
+    validate_matmul_shapes(a, b, "matmul");
+    validate_same_dtype(a, b, "matmul");
+
+    size_t M = a.shape()[0];
+    size_t K = a.shape()[1];
+    size_t N = b.shape()[1];
+
+    if (c.shape()[0] != M || c.shape()[1] != N) {
+        throw std::runtime_error("matmul output shape mismatch");
+    }
+
+    // Check GPU compute capability for TF32 support
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    int sm_version = prop.major * 10 + prop.minor;
+
+    // TF32 only works with float32 and SM >= 80
+    bool tf32_enabled = use_tf32_explicit &&
+                        (a.dtype() == DataType::Float32) &&
+                        (sm_version >= 80);
+
+    if (use_tf32_explicit && !tf32_enabled) {
+        if (a.dtype() != DataType::Float32) {
+            throw std::runtime_error("TF32 matmul requires float32 dtype");
+        }
+        if (sm_version < 80) {
+            throw std::runtime_error("TF32 matmul requires SM >= 80 (Ampere or newer)");
+        }
+    }
+
+    // Use the TF32 kernel for explicit requests on large matrices,
+    // or for the verified 16x8 / 16x16 single-tile shapes
+    bool use_tf32 = tf32_enabled &&
+                    ((M >= OPTIMIZED_MATMUL_THRESHOLD &&
+                      N >= OPTIMIZED_MATMUL_THRESHOLD &&
+                      K >= OPTIMIZED_MATMUL_THRESHOLD) ||
+                     (M == 16 && (N == 8 || N == 16)));
+
+    bool use_optimized = !use_tf32 &&
+                         (a.dtype() == DataType::Float32) &&
+                         (M >= OPTIMIZED_MATMUL_THRESHOLD ||
+                          N >= OPTIMIZED_MATMUL_THRESHOLD ||
+                          K >= OPTIMIZED_MATMUL_THRESHOLD);
+
+    bool use_tiled = !use_optimized && !use_tf32 &&
+                     (M >= TILED_MATMUL_THRESHOLD ||
+                      N >= TILED_MATMUL_THRESHOLD ||
+                      K >= TILED_MATMUL_THRESHOLD);
+
+    if (use_tf32) {
+        // TF32 TensorCore kernels
+        if (M == 16 && (N == 8 || N == 16)) {
+            tf32::launch_single_tile_verified(
+                static_cast<const float*>(a.data()),
+                static_cast<const float*>(b.data()),
+                static_cast<float*>(c.data()),
+                M, N, K);
+        } else {
+            tf32::launch_sgemm_tf32(
+                static_cast<const float*>(a.data()),
+                static_cast<const float*>(b.data()),
+                static_cast<float*>(c.data()),
+                M, N, K);
+        }
+    } else if (use_optimized) {
+        ampere::launch_sgemm_ampere(
+            static_cast<const float*>(a.data()),
+            static_cast<const float*>(b.data()),
+            static_cast<float*>(c.data()),
+            M, N, K);
+    } else if (use_tiled) {
+        dim3 block_size(TILE_N / THREAD_N, TILE_M / THREAD_M);
+        dim3 grid_size(
+            (N + TILE_N - 1) / TILE_N,
+            (M + TILE_M - 1) / TILE_M
+        );
+
+        switch (a.dtype()) {
+            case DataType::Float32:
+                matmul_f32_tiled_kernel<<<grid_size, block_size>>>(
+                    static_cast<const float*>(a.data()),
+                    static_cast<const float*>(b.data()),
+                    static_cast<float*>(c.data()),
+                    M, N, K);
+                break;
+            case DataType::Float64:
+                matmul_f64_tiled_kernel<<<grid_size, block_size>>>(
+                    static_cast<const double*>(a.data()),
+                    static_cast<const double*>(b.data()),
+                    static_cast<double*>(c.data()),
+                    M, N, K);
+                break;
+            default:
+                throw std::runtime_error("matmul only supports float32 and float64");
+        }
+    } else {
+        dim3 block_size(BLOCK_SIZE, BLOCK_SIZE);
+        dim3 grid_size(
+            (N + BLOCK_SIZE - 1) / BLOCK_SIZE,
+            (M + BLOCK_SIZE - 1) / BLOCK_SIZE
+        );
+
+        switch (a.dtype()) {
+            case DataType::Float32:
+                matmul_f32_l2opt_kernel<<<grid_size, block_size>>>(
+                    static_cast<const float*>(a.data()),
+                    static_cast<const float*>(b.data()),
+                    static_cast<float*>(c.data()),
+                    M, N, K);
+                break;
+            case DataType::Float64:
+                matmul_f64_l2opt_kernel<<<grid_size, block_size>>>(
+                    static_cast<const double*>(a.data()),
+                    static_cast<const double*>(b.data()),
+                    static_cast<double*>(c.data()),
+                    M, N, K);
+                break;
+            default:
+                throw std::runtime_error("matmul only supports float32 and float64");
+        }
+    }
+
+    sync_and_check("matmul kernel failed");
+}
+
+void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c, bool use_tf32) {
+    matmul_impl(a, b, c, use_tf32);
+}
+
+GPUArray matmul(const GPUArray& a, const GPUArray& b, bool use_tf32) {
+    validate_matmul_shapes(a, b, "matmul");
+    validate_same_dtype(a, b, "matmul");
+
+    size_t M = a.shape()[0];
+    size_t N = b.shape()[1];
+
+    GPUArray c({M, N}, a.dtype());
+    matmul_impl(a, b, c, use_tf32);
+    return c;
+}
+
 } // namespace ops
 } // namespace pygpukit
diff --git a/native/ops/basic.cuh b/native/ops/basic.cuh
index 14de780..9af5c3c 100644
--- a/native/ops/basic.cuh
+++ b/native/ops/basic.cuh
@@ -16,10 +16,17 @@ void mul(const GPUArray& a, const GPUArray& b, GPUArray& c);
 // a: (M, K), b: (K, N), c: (M, N)
 void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c);
 
+// Matrix multiplication with explicit TF32 control
+// use_tf32: force TF32 TensorCore path (requires SM >= 80 and float32)
+void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c, bool use_tf32);
+
 // Convenience functions that return new arrays
 GPUArray add(const GPUArray& a, const GPUArray& b);
 GPUArray mul(const GPUArray& a, const GPUArray& b);
 GPUArray matmul(const GPUArray& a, const GPUArray& b);
 
+// Matmul with explicit TF32 control
+GPUArray matmul(const GPUArray& a, const GPUArray& b, bool use_tf32);
+
 } // namespace ops
 } // namespace pygpukit
diff --git a/rust/pygpukit-core/src/device.rs b/rust/pygpukit-core/src/device.rs
new file mode 100644
index 0000000..3661ffe
--- /dev/null
+++ b/rust/pygpukit-core/src/device.rs
@@ -0,0 +1,284 @@
+//! Device capabilities and kernel type definitions
+//!
+//! Provides GPU device capability detection and kernel type enumeration
+//! for selecting appropriate kernel implementations.
+
+/// GPU kernel type enumeration
+///
+/// Represents different kernel implementations available for GPU operations.
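+///
+/// A minimal doc-test sketch (illustrative; it only exercises the helper
+/// methods defined below):
+///
+/// ```
+/// use pygpukit_core::device::KernelType;
+///
+/// let k = KernelType::Tf32Mma;
+/// assert!(k.uses_tensor_cores());
+/// assert_eq!(k.min_sm_version(), 80); // Ampere
+/// ```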
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum KernelType {
+    /// FP32 FMA (Fused Multiply-Add) kernel
+    /// Standard FP32 precision, used for maximum accuracy
+    Fp32Fma,
+
+    /// TF32 MMA (Matrix Multiply-Accumulate) kernel using Tensor Cores
+    /// Uses TF32 precision (10-bit mantissa, 8-bit exponent) for faster computation
+    /// Only available on Ampere (SM80+) and newer GPUs
+    Tf32Mma,
+
+    /// FP16 MMA kernel using Tensor Cores
+    /// Uses FP16 precision for maximum throughput
+    Fp16Mma,
+
+    /// BF16 MMA kernel using Tensor Cores
+    /// Uses BF16 precision (8-bit exponent, 7-bit mantissa)
+    Bf16Mma,
+
+    /// L2-optimized naive kernel
+    /// Simple kernel optimized for L2 cache locality
+    L2Naive,
+
+    /// Tiled shared memory kernel
+    /// Uses shared memory tiling for memory bandwidth optimization
+    TiledSmem,
+}
+
+impl KernelType {
+    /// Check if this kernel type uses Tensor Cores
+    pub fn uses_tensor_cores(&self) -> bool {
+        matches!(self, KernelType::Tf32Mma | KernelType::Fp16Mma | KernelType::Bf16Mma)
+    }
+
+    /// Get the minimum SM version required for this kernel type
+    pub fn min_sm_version(&self) -> u32 {
+        match self {
+            KernelType::Fp32Fma => 60,   // Pascal
+            KernelType::L2Naive => 60,   // Pascal
+            KernelType::TiledSmem => 60, // Pascal
+            KernelType::Tf32Mma => 80,   // Ampere
+            KernelType::Fp16Mma => 70,   // Volta (but better on Ampere)
+            KernelType::Bf16Mma => 80,   // Ampere
+        }
+    }
+
+    /// Get human-readable name
+    pub fn name(&self) -> &'static str {
+        match self {
+            KernelType::Fp32Fma => "FP32 FMA",
+            KernelType::Tf32Mma => "TF32 MMA (TensorCore)",
+            KernelType::Fp16Mma => "FP16 MMA (TensorCore)",
+            KernelType::Bf16Mma => "BF16 MMA (TensorCore)",
+            KernelType::L2Naive => "L2 Naive",
+            KernelType::TiledSmem => "Tiled Shared Memory",
+        }
+    }
+}
+
+impl std::fmt::Display for KernelType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.name())
+    }
+}
+
+/// GPU Device Capabilities
+///
+/// Contains information about GPU hardware capabilities
+/// used for kernel selection and optimization decisions.
+#[derive(Debug, Clone, Default)]
+pub struct DeviceCapabilities {
+    /// Device index
+    pub device_id: u32,
+
+    /// Device name (e.g., "NVIDIA GeForce RTX 3090 Ti")
+    pub name: String,
+
+    /// SM (Streaming Multiprocessor) version
+    /// Computed as major * 10 + minor (e.g., SM 8.6 = 86)
+    pub sm_version: u32,
+
+    /// Compute capability major version
+    pub compute_major: u32,
+
+    /// Compute capability minor version
+    pub compute_minor: u32,
+
+    /// Whether TF32 Tensor Cores are available (SM >= 80)
+    pub tensorcore: bool,
+
+    /// Whether FP16 Tensor Cores are available (SM >= 70)
+    pub tensorcore_fp16: bool,
+
+    /// Whether BF16 Tensor Cores are available (SM >= 80)
+    pub tensorcore_bf16: bool,
+
+    /// Total global memory in bytes
+    pub total_memory: u64,
+
+    /// L2 cache size in bytes
+    pub l2_cache_size: u32,
+
+    /// Shared memory per block in bytes
+    pub shared_mem_per_block: u32,
+
+    /// Maximum threads per block
+    pub max_threads_per_block: u32,
+
+    /// Number of SMs
+    pub sm_count: u32,
+
+    /// Warp size
+    pub warp_size: u32,
+
+    /// Whether async copy (cp.async) is supported (SM >= 80)
+    pub async_copy: bool,
+}
+
+impl DeviceCapabilities {
+    /// Create capabilities for a specific SM version
+    ///
+    /// This is useful for testing or when actual device info is not available.
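+    ///
+    /// Illustrative sketch (mirrors the unit tests below):
+    ///
+    /// ```
+    /// use pygpukit_core::device::DeviceCapabilities;
+    ///
+    /// let caps = DeviceCapabilities::from_sm_version(86); // e.g. an RTX 30xx part
+    /// assert!(caps.tensorcore && caps.tensorcore_fp16 && caps.async_copy);
+    /// ```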
+    pub fn from_sm_version(sm_version: u32) -> Self {
+        let compute_major = sm_version / 10;
+        let compute_minor = sm_version % 10;
+
+        Self {
+            device_id: 0,
+            name: format!("SM {}.{}", compute_major, compute_minor),
+            sm_version,
+            compute_major,
+            compute_minor,
+            tensorcore: sm_version >= 80,
+            tensorcore_fp16: sm_version >= 70,
+            tensorcore_bf16: sm_version >= 80,
+            total_memory: 0,
+            l2_cache_size: 0,
+            shared_mem_per_block: 49152, // 48KB default
+            max_threads_per_block: 1024,
+            sm_count: 0,
+            warp_size: 32,
+            async_copy: sm_version >= 80,
+        }
+    }
+
+    /// Create an Ampere (RTX 30xx / A100) device
+    pub fn ampere() -> Self {
+        Self::from_sm_version(86)
+    }
+
+    /// Create an Ada (RTX 40xx) device
+    pub fn ada() -> Self {
+        Self::from_sm_version(89)
+    }
+
+    /// Create a Hopper (H100) device
+    pub fn hopper() -> Self {
+        Self::from_sm_version(90)
+    }
+
+    /// Check if a kernel type is supported
+    pub fn supports_kernel(&self, kernel_type: KernelType) -> bool {
+        self.sm_version >= kernel_type.min_sm_version()
+    }
+
+    /// Get the best kernel type for matmul
+    ///
+    /// # Arguments
+    /// * `use_tf32` - Whether TF32 is allowed
+    /// * `dtype_is_fp32` - Whether the data type is FP32
+    /// * `large_matrix` - Whether the matrix is large enough for optimized kernels
+    pub fn best_matmul_kernel(&self, use_tf32: bool, dtype_is_fp32: bool, large_matrix: bool) -> KernelType {
+        if use_tf32 && dtype_is_fp32 && self.tensorcore && large_matrix {
+            KernelType::Tf32Mma
+        } else if dtype_is_fp32 && large_matrix && self.sm_version >= 80 {
+            // Ampere-optimized FP32 FMA
+            KernelType::Fp32Fma
+        } else if large_matrix {
+            KernelType::TiledSmem
+        } else {
+            KernelType::L2Naive
+        }
+    }
+
+    /// Check if this is an Ampere or newer GPU
+    pub fn is_ampere_or_newer(&self) -> bool {
+        self.sm_version >= 80
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_kernel_type_tensor_cores() {
+        assert!(!KernelType::Fp32Fma.uses_tensor_cores());
+        assert!(KernelType::Tf32Mma.uses_tensor_cores());
+        assert!(KernelType::Fp16Mma.uses_tensor_cores());
+        assert!(KernelType::Bf16Mma.uses_tensor_cores());
+        assert!(!KernelType::L2Naive.uses_tensor_cores());
+    }
+
+    #[test]
+    fn test_kernel_type_min_sm() {
+        assert_eq!(KernelType::Fp32Fma.min_sm_version(), 60);
+        assert_eq!(KernelType::Tf32Mma.min_sm_version(), 80);
+        assert_eq!(KernelType::Fp16Mma.min_sm_version(), 70);
+        assert_eq!(KernelType::Bf16Mma.min_sm_version(), 80);
+    }
+
+    #[test]
+    fn test_device_capabilities_from_sm() {
+        let caps = DeviceCapabilities::from_sm_version(86);
+        assert_eq!(caps.sm_version, 86);
+        assert_eq!(caps.compute_major, 8);
+        assert_eq!(caps.compute_minor, 6);
+        assert!(caps.tensorcore);
+        assert!(caps.tensorcore_fp16);
+        assert!(caps.tensorcore_bf16);
+        assert!(caps.async_copy);
+    }
+
+    #[test]
+    fn test_device_capabilities_old_gpu() {
+        let caps = DeviceCapabilities::from_sm_version(75);
+        assert_eq!(caps.sm_version, 75);
+        assert!(!caps.tensorcore);      // TF32 requires SM80
+        assert!(caps.tensorcore_fp16);  // FP16 tensor cores on Turing
+        assert!(!caps.tensorcore_bf16); // BF16 requires SM80
+        assert!(!caps.async_copy);
+    }
+
+    #[test]
+    fn test_supports_kernel() {
+        let ampere = DeviceCapabilities::ampere();
+        assert!(ampere.supports_kernel(KernelType::Fp32Fma));
+        assert!(ampere.supports_kernel(KernelType::Tf32Mma));
+        assert!(ampere.supports_kernel(KernelType::Fp16Mma));
+
+        let turing = DeviceCapabilities::from_sm_version(75);
+        assert!(turing.supports_kernel(KernelType::Fp32Fma));
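+        // Turing (SM 7.5) has FP16 Tensor Cores but predates Ampere's TF32 support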
+        assert!(!turing.supports_kernel(KernelType::Tf32Mma));
+        assert!(turing.supports_kernel(KernelType::Fp16Mma));
+    }
+
+    #[test]
+    fn test_best_matmul_kernel_tf32() {
+        let ampere = DeviceCapabilities::ampere();
+
+        // TF32 enabled, FP32 dtype, large matrix
+        let kernel = ampere.best_matmul_kernel(true, true, true);
+        assert_eq!(kernel, KernelType::Tf32Mma);
+
+        // TF32 disabled
+        let kernel = ampere.best_matmul_kernel(false, true, true);
+        assert_eq!(kernel, KernelType::Fp32Fma);
+    }
+
+    #[test]
+    fn test_best_matmul_kernel_small_matrix() {
+        let ampere = DeviceCapabilities::ampere();
+
+        // Small matrix should use L2 naive
+        let kernel = ampere.best_matmul_kernel(true, true, false);
+        assert_eq!(kernel, KernelType::L2Naive);
+    }
+
+    #[test]
+    fn test_is_ampere_or_newer() {
+        assert!(DeviceCapabilities::ampere().is_ampere_or_newer());
+        assert!(DeviceCapabilities::ada().is_ampere_or_newer());
+        assert!(DeviceCapabilities::hopper().is_ampere_or_newer());
+        assert!(!DeviceCapabilities::from_sm_version(75).is_ampere_or_newer());
+    }
+}
diff --git a/rust/pygpukit-core/src/lib.rs b/rust/pygpukit-core/src/lib.rs
index 169609c..cf47f0d 100644
--- a/rust/pygpukit-core/src/lib.rs
+++ b/rust/pygpukit-core/src/lib.rs
@@ -6,11 +6,13 @@
 //! - Async memory transfer engine with separate streams
 //! - Kernel dispatch controller with stream management
 //! - Kernel pacing engine with bandwidth control
+//! - Device capabilities and kernel type selection
 
 pub mod memory;
 pub mod scheduler;
 pub mod transfer;
 pub mod dispatch;
+pub mod device;
 
 pub use memory::{MemoryBlock, MemoryPool, PoolStats, MemoryError};
 pub use scheduler::{
@@ -29,3 +31,4 @@ pub use dispatch::{
     SliceScheduler, SliceConfig, SlicedKernel, KernelSlice, SliceInfo, SliceStats,
     KernelCache, CacheConfig, CachedKernel, CompileOptions, CacheStats,
 };
+pub use device::{KernelType, DeviceCapabilities};
diff --git a/rust/pygpukit-python/src/device.rs b/rust/pygpukit-python/src/device.rs
new file mode 100644
index 0000000..6c11b78
--- /dev/null
+++ b/rust/pygpukit-python/src/device.rs
@@ -0,0 +1,279 @@
+//! PyO3 bindings for device capabilities and kernel types
+
+use pyo3::prelude::*;
+use pygpukit_core::device::{DeviceCapabilities, KernelType};
+
+/// Python-exposed kernel type enum
+#[pyclass(name = "KernelType")]
+#[derive(Clone)]
+pub struct PyKernelType {
+    inner: KernelType,
+}
+
+#[pymethods]
+impl PyKernelType {
+    /// FP32 FMA kernel
+    #[classattr]
+    fn FP32_FMA() -> Self {
+        Self { inner: KernelType::Fp32Fma }
+    }
+
+    /// TF32 MMA (TensorCore) kernel
+    #[classattr]
+    fn TF32_MMA() -> Self {
+        Self { inner: KernelType::Tf32Mma }
+    }
+
+    /// FP16 MMA (TensorCore) kernel
+    #[classattr]
+    fn FP16_MMA() -> Self {
+        Self { inner: KernelType::Fp16Mma }
+    }
+
+    /// BF16 MMA (TensorCore) kernel
+    #[classattr]
+    fn BF16_MMA() -> Self {
+        Self { inner: KernelType::Bf16Mma }
+    }
+
+    /// L2-optimized naive kernel
+    #[classattr]
+    fn L2_NAIVE() -> Self {
+        Self { inner: KernelType::L2Naive }
+    }
+
+    /// Tiled shared memory kernel
+    #[classattr]
+    fn TILED_SMEM() -> Self {
+        Self { inner: KernelType::TiledSmem }
+    }
+
+    /// Check if this kernel type uses Tensor Cores
+    fn uses_tensor_cores(&self) -> bool {
+        self.inner.uses_tensor_cores()
+    }
+
+    /// Get the minimum SM version required
+    fn min_sm_version(&self) -> u32 {
+        self.inner.min_sm_version()
+    }
+
+    /// Get human-readable name
+    fn name(&self) -> &'static str {
+        self.inner.name()
+    }
+
+    fn __repr__(&self) -> String {
+        format!("KernelType.{}", match self.inner {
+            KernelType::Fp32Fma => "FP32_FMA",
+            KernelType::Tf32Mma => "TF32_MMA",
+            KernelType::Fp16Mma => "FP16_MMA",
+            KernelType::Bf16Mma => "BF16_MMA",
+            KernelType::L2Naive => "L2_NAIVE",
+            KernelType::TiledSmem => "TILED_SMEM",
+        })
+    }
+
+    fn __str__(&self) -> String {
+        self.inner.name().to_string()
+    }
+
+    fn __eq__(&self, other: &PyKernelType) -> bool {
+        self.inner == other.inner
+    }
+}
+
+impl From<KernelType> for PyKernelType {
+    fn from(inner: KernelType) -> Self {
+        Self { inner }
+    }
+}
+
+impl From<&PyKernelType> for KernelType {
+    fn from(py_type: &PyKernelType) -> Self {
+        py_type.inner
+    }
+}
+
+/// Python-exposed device capabilities
+#[pyclass(name = "DeviceCapabilities")]
+#[derive(Clone)]
+pub struct PyDeviceCapabilities {
+    inner: DeviceCapabilities,
+}
+
+#[pymethods]
+impl PyDeviceCapabilities {
+    /// Create capabilities from SM version
+    #[new]
+    #[pyo3(signature = (sm_version=86))]
+    fn new(sm_version: u32) -> Self {
+        Self {
+            inner: DeviceCapabilities::from_sm_version(sm_version),
+        }
+    }
+
+    /// Create Ampere device capabilities
+    #[staticmethod]
+    fn ampere() -> Self {
+        Self {
+            inner: DeviceCapabilities::ampere(),
+        }
+    }
+
+    /// Create Ada device capabilities
+    #[staticmethod]
+    fn ada() -> Self {
+        Self {
+            inner: DeviceCapabilities::ada(),
+        }
+    }
+
+    /// Create Hopper device capabilities
+    #[staticmethod]
+    fn hopper() -> Self {
+        Self {
+            inner: DeviceCapabilities::hopper(),
+        }
+    }
+
+    /// Device ID
+    #[getter]
+    fn device_id(&self) -> u32 {
+        self.inner.device_id
+    }
+
+    /// Device name
+    #[getter]
+    fn name(&self) -> &str {
+        &self.inner.name
+    }
+
+    /// SM version (e.g., 86 for SM 8.6)
+    #[getter]
+    fn sm_version(&self) -> u32 {
+        self.inner.sm_version
+    }
+
+    /// Compute capability (alias for sm_version)
+    #[getter]
+    fn compute_capability(&self) -> u32 {
+        self.inner.sm_version
+    }
+
+    /// Compute major version
+    #[getter]
+    fn compute_major(&self) -> u32 {
+        self.inner.compute_major
+    }
+
+    /// Compute minor version
+    #[getter]
+    fn compute_minor(&self) -> u32 {
+        self.inner.compute_minor
+    }
+
+    /// Whether TF32 Tensor Cores are available (SM >= 80)
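+    /// Exposed to Python as a read-only property (e.g. `caps.tensorcore`)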
+    #[getter]
+    fn tensorcore(&self) -> bool {
+        self.inner.tensorcore
+    }
+
+    /// Whether FP16 Tensor Cores are available (SM >= 70)
+    #[getter]
+    fn tensorcore_fp16(&self) -> bool {
+        self.inner.tensorcore_fp16
+    }
+
+    /// Whether BF16 Tensor Cores are available (SM >= 80)
+    #[getter]
+    fn tensorcore_bf16(&self) -> bool {
+        self.inner.tensorcore_bf16
+    }
+
+    /// Total global memory in bytes
+    #[getter]
+    fn total_memory(&self) -> u64 {
+        self.inner.total_memory
+    }
+
+    /// L2 cache size in bytes
+    #[getter]
+    fn l2_cache_size(&self) -> u32 {
+        self.inner.l2_cache_size
+    }
+
+    /// Shared memory per block in bytes
+    #[getter]
+    fn shared_mem_per_block(&self) -> u32 {
+        self.inner.shared_mem_per_block
+    }
+
+    /// Maximum threads per block
+    #[getter]
+    fn max_threads_per_block(&self) -> u32 {
+        self.inner.max_threads_per_block
+    }
+
+    /// Number of SMs
+    #[getter]
+    fn sm_count(&self) -> u32 {
+        self.inner.sm_count
+    }
+
+    /// Warp size
+    #[getter]
+    fn warp_size(&self) -> u32 {
+        self.inner.warp_size
+    }
+
+    /// Whether async copy (cp.async) is supported
+    #[getter]
+    fn async_copy(&self) -> bool {
+        self.inner.async_copy
+    }
+
+    /// Check if a kernel type is supported
+    fn supports_kernel(&self, kernel_type: &PyKernelType) -> bool {
+        self.inner.supports_kernel(kernel_type.inner)
+    }
+
+    /// Get the best matmul kernel type
+    #[pyo3(signature = (use_tf32=false, dtype_is_fp32=true, large_matrix=true))]
+    fn best_matmul_kernel(&self, use_tf32: bool, dtype_is_fp32: bool, large_matrix: bool) -> PyKernelType {
+        self.inner.best_matmul_kernel(use_tf32, dtype_is_fp32, large_matrix).into()
+    }
+
+    /// Check if this is Ampere or newer
+    fn is_ampere_or_newer(&self) -> bool {
+        self.inner.is_ampere_or_newer()
+    }
+
+    fn __repr__(&self) -> String {
+        format!(
+            "DeviceCapabilities(sm_version={}, tensorcore={}, name='{}')",
+            self.inner.sm_version,
+            self.inner.tensorcore,
+            self.inner.name
+        )
+    }
+}
+
+impl From<DeviceCapabilities> for PyDeviceCapabilities {
+    fn from(inner: DeviceCapabilities) -> Self {
+        Self { inner }
+    }
+}
+
+impl From<&PyDeviceCapabilities> for DeviceCapabilities {
+    fn from(py_caps: &PyDeviceCapabilities) -> Self {
+        py_caps.inner.clone()
+    }
+}
+
+/// Register device module
+pub fn register(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PyKernelType>()?;
+    m.add_class::<PyDeviceCapabilities>()?;
+    Ok(())
+}
diff --git a/rust/pygpukit-python/src/lib.rs b/rust/pygpukit-python/src/lib.rs
index 8ea56e2..5ff3cc0 100644
--- a/rust/pygpukit-python/src/lib.rs
+++ b/rust/pygpukit-python/src/lib.rs
@@ -9,6 +9,7 @@
 mod memory;
 mod scheduler;
 mod transfer;
 mod dispatch;
+mod device;
 
 /// PyGPUkit Rust module
 #[pymodule]
@@ -33,6 +34,11 @@ fn _pygpukit_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
     dispatch::register(&dispatch_module)?;
     m.add_submodule(&dispatch_module)?;
 
+    // Device submodule
+    let device_module = PyModule::new(m.py(), "device")?;
+    device::register(&device_module)?;
+    m.add_submodule(&device_module)?;
+
     // Also export at top level for convenience
     m.add_class::()?;
     m.add_class::()?;
@@ -91,6 +97,9 @@ fn _pygpukit_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
+    // Device capabilities
+    m.add_class::<device::PyDeviceCapabilities>()?;
+    m.add_class::<device::PyKernelType>()?;
 
     Ok(())
 }
diff --git a/src/pygpukit/__init__.py b/src/pygpukit/__init__.py
index 287a819..f6f1bc8 100644
--- a/src/pygpukit/__init__.py
+++ b/src/pygpukit/__init__.py
@@ -3,13 +3,25 @@
 __version__ = "0.2.0"
 
 from pygpukit.core.array import GPUArray
-from pygpukit.core.device import DeviceInfo, get_device_info, is_cuda_available
+from pygpukit.core.device import (
+    DeviceInfo,
+    get_device_capabilities,
+    get_device_info,
+    is_cuda_available,
+)
 from pygpukit.core.dtypes import DataType, float32, float64, int32, int64
 from pygpukit.core.factory import empty, from_numpy, ones, zeros
 from pygpukit.core.stream import Stream, StreamManager, default_stream
 from pygpukit.jit.compiler import JITKernel, jit
 from pygpukit.ops.basic import add, matmul, mul
 
+# Try to import Rust types
+try:
+    from pygpukit._pygpukit_rust import DeviceCapabilities, KernelType
+except ImportError:
+    DeviceCapabilities = None
+    KernelType = None
+
 __all__ = [
     # Version
     "__version__",
@@ -17,7 +29,10 @@
     "GPUArray",
     # Device
     "DeviceInfo",
+    "DeviceCapabilities",
+    "KernelType",
     "get_device_info",
+    "get_device_capabilities",
     "is_cuda_available",
     # Data types
     "DataType",
diff --git a/src/pygpukit/core/device.py b/src/pygpukit/core/device.py
index bb4edf9..00d3d2a 100644
--- a/src/pygpukit/core/device.py
+++ b/src/pygpukit/core/device.py
@@ -60,3 +60,64 @@ def get_device_info(device_id: int = 0) -> DeviceInfo:
         max_threads_per_block=props.max_threads_per_block,
         warp_size=props.warp_size,
     )
+
+
+def get_device_capabilities(device_id: int = 0):
+    """Get device capabilities from the Rust backend.
+
+    Returns a DeviceCapabilities object with:
+    - sm_version: SM version (e.g., 86 for SM 8.6)
+    - tensorcore: Whether TF32 TensorCores are available
+    - tensorcore_fp16: Whether FP16 TensorCores are available
+    - tensorcore_bf16: Whether BF16 TensorCores are available
+    - async_copy: Whether cp.async is supported
+
+    Args:
+        device_id: Device index (default 0).
+
+    Returns:
+        DeviceCapabilities from the Rust backend, or a pure-Python
+        fallback with the same fields if the Rust module is unavailable.
+    """
+    try:
+        from pygpukit._pygpukit_rust import DeviceCapabilities
+    except ImportError:
+        # Rust module not available - create from device info
+        info = get_device_info(device_id)
+        if info.compute_capability:
+            sm_version = info.compute_capability[0] * 10 + info.compute_capability[1]
+        else:
+            sm_version = 0
+
+        # Create a simple capabilities object
+        @dataclass
+        class _DeviceCapabilities:
+            device_id: int
+            name: str
+            sm_version: int
+            compute_capability: int
+            tensorcore: bool
+            tensorcore_fp16: bool
+            tensorcore_bf16: bool
+            async_copy: bool
+
+        return _DeviceCapabilities(
+            device_id=device_id,
+            name=info.name,
+            sm_version=sm_version,
+            compute_capability=sm_version,
+            tensorcore=sm_version >= 80,
+            tensorcore_fp16=sm_version >= 70,
+            tensorcore_bf16=sm_version >= 80,
+            async_copy=sm_version >= 80,
+        )
+
+    # Get actual device info and create capabilities
+    info = get_device_info(device_id)
+    if info.compute_capability:
+        sm_version = info.compute_capability[0] * 10 + info.compute_capability[1]
+    else:
+        sm_version = 0
+
+    # Create Rust DeviceCapabilities with the actual SM version
+    caps = DeviceCapabilities(sm_version)
+    return caps
diff --git a/src/pygpukit/ops/basic.py b/src/pygpukit/ops/basic.py
index 469b23d..0ed444e 100644
--- a/src/pygpukit/ops/basic.py
+++ b/src/pygpukit/ops/basic.py
@@ -121,18 +121,23 @@ def _mul_native(a: GPUArray, b: GPUArray) -> GPUArray:
     return GPUArray._wrap_native(c_native)
 
 
-def matmul(a: GPUArray, b: GPUArray) -> GPUArray:
+def matmul(a: GPUArray, b: GPUArray, *, use_tf32: bool | None = None) -> GPUArray:
     """Matrix multiplication of two 2D arrays.
 
     Args:
         a: First input array (M x K).
         b: Second input array (K x N).
+        use_tf32: Whether to use TF32 TensorCore acceleration (Ampere+ only).
+            - None (default): Use the PYGPUKIT_ALLOW_TF32 environment variable.
+            - True: Force TF32 mode (requires SM >= 80 and float32).
+            - False: Force FP32 mode.
 
     Returns:
         A new GPUArray containing the matrix product (M x N).
 
     Raises:
         ValueError: If arrays are not 2D or dimensions don't match.
+        RuntimeError: If use_tf32=True but the GPU doesn't support it or dtype is not float32.
     """
     if a.ndim != 2:
         raise ValueError(f"matmul requires 2D arrays, got {a.ndim}D for first argument")
@@ -147,10 +152,16 @@ def matmul(a: GPUArray, b: GPUArray) -> GPUArray:
 
     _validate_same_dtype(a, b, "matmul")
 
+    # Check TF32 dtype requirement early (before backend dispatch)
+    if use_tf32 is True:
+        from pygpukit.core.dtypes import float32
+
+        if a.dtype != float32:
+            raise RuntimeError("TF32 matmul requires float32 dtype")
+
     backend = get_backend()
 
     if isinstance(backend, NativeBackend) and backend.is_available():
-        return _matmul_native(a, b)
+        return _matmul_native(a, b, use_tf32=use_tf32)
     else:
         return _matmul_cpu(a, b)
 
@@ -163,8 +174,15 @@ def _matmul_cpu(a: GPUArray, b: GPUArray) -> GPUArray:
     return from_numpy(result_np)
 
 
-def _matmul_native(a: GPUArray, b: GPUArray) -> GPUArray:
-    """Native C++ CUDA implementation of matmul (zero-copy)."""
+def _matmul_native(a: GPUArray, b: GPUArray, *, use_tf32: bool | None = None) -> GPUArray:
+    """Native C++ CUDA implementation of matmul (zero-copy).
+
+    Args:
+        a: First input array.
+        b: Second input array.
+        use_tf32: Whether to use TF32 TensorCore acceleration.
+            None means use the PYGPUKIT_ALLOW_TF32 environment variable.
+    """
     from pygpukit.core.backend import get_native_module
 
     native = get_native_module()
@@ -174,7 +192,12 @@ def _matmul_native(a: GPUArray, b: GPUArray) -> GPUArray:
     b_native = b._get_native()
 
     # Perform operation on GPU
-    c_native = native.matmul(a_native, b_native)
+    if use_tf32 is not None:
+        # Use explicit TF32 control
+        c_native = native.matmul_tf32(a_native, b_native, use_tf32)
+    else:
+        # Use environment variable for TF32 control
+        c_native = native.matmul(a_native, b_native)
 
     # Wrap result (zero-copy)
     return GPUArray._wrap_native(c_native)
diff --git a/tests/test_tf32_api.py b/tests/test_tf32_api.py
new file mode 100644
index 0000000..59773e7
--- /dev/null
+++ b/tests/test_tf32_api.py
@@ -0,0 +1,175 @@
+"""Tests for TF32 API integration (TDD).
+
+These tests verify the use_tf32 parameter for matmul and
+the Rust-side DeviceCapabilities.tensorcore support.
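+
+Illustrative usage of the API under test (assumes a CUDA device; the TF32
+path additionally requires float32 inputs and an Ampere-class GPU, SM >= 80):
+
+    a = gp.from_numpy(np.random.rand(256, 256).astype(np.float32))
+    b = gp.from_numpy(np.random.rand(256, 256).astype(np.float32))
+    c = gp.matmul(a, b, use_tf32=True)  # RuntimeError if unsupported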
+""" + +import numpy as np +import pytest + +import pygpukit as gp + + +class TestMatmulTF32API: + """Tests for matmul use_tf32 parameter.""" + + def test_matmul_use_tf32_false_default(self): + """Test that use_tf32=False is the default behavior.""" + np.random.seed(42) + a_np = np.random.rand(64, 64).astype(np.float32) + b_np = np.random.rand(64, 64).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + # Default behavior (no use_tf32 arg) should use FP32 + c = gp.matmul(a, b) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + # FP32 should be very accurate + np.testing.assert_array_almost_equal(result, expected, decimal=4) + + def test_matmul_use_tf32_explicit_false(self): + """Test matmul with explicit use_tf32=False.""" + np.random.seed(42) + a_np = np.random.rand(64, 64).astype(np.float32) + b_np = np.random.rand(64, 64).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + c = gp.matmul(a, b, use_tf32=False) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + np.testing.assert_array_almost_equal(result, expected, decimal=4) + + def test_matmul_use_tf32_true_correctness(self): + """Test matmul with use_tf32=True produces correct results within TF32 tolerance.""" + np.random.seed(42) + # Large enough to trigger TF32 kernel + a_np = np.random.rand(1024, 1024).astype(np.float32) + b_np = np.random.rand(1024, 1024).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + c = gp.matmul(a, b, use_tf32=True) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + + # TF32 has lower precision (~0.1% relative error per op) + # For 1024 accumulations, expect ~1-5% relative error + rel_error = np.abs(result - expected) / (np.abs(expected) + 1e-8) + max_rel_error = np.max(rel_error) + assert max_rel_error < 0.1, f"TF32 relative error too high: {max_rel_error}" + + def test_matmul_use_tf32_small_matrix_fallback(self): + """Test that small matrices with use_tf32=True still work (may fallback to FP32).""" + np.random.seed(42) + a_np = np.random.rand(16, 16).astype(np.float32) + b_np = np.random.rand(16, 16).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + # Small matrix - implementation may use FP32 fallback + c = gp.matmul(a, b, use_tf32=True) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + # Should still be reasonably accurate + np.testing.assert_array_almost_equal(result, expected, decimal=2) + + def test_matmul_use_tf32_float64_raises(self): + """Test that use_tf32=True with float64 raises an error.""" + a_np = np.random.rand(64, 64).astype(np.float64) + b_np = np.random.rand(64, 64).astype(np.float64) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + # TF32 only works with float32 - should raise RuntimeError + with pytest.raises(RuntimeError, match="float32"): + gp.matmul(a, b, use_tf32=True) + + def test_matmul_use_tf32_rectangular(self): + """Test TF32 matmul with rectangular matrices.""" + np.random.seed(42) + a_np = np.random.rand(512, 1024).astype(np.float32) + b_np = np.random.rand(1024, 768).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + c = gp.matmul(a, b, use_tf32=True) + + assert c.shape == (512, 768) + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + + rel_error = np.abs(result - expected) / (np.abs(expected) + 1e-8) + max_rel_error = np.max(rel_error) + assert max_rel_error < 0.1, f"TF32 relative error too high: {max_rel_error}" + + +class TestDeviceCapabilities: + """Tests for DeviceCapabilities 
from Rust.""" + + def test_device_capabilities_exists(self): + """Test that DeviceCapabilities class is available.""" + assert hasattr(gp, 'DeviceCapabilities') or hasattr(gp, 'get_device_capabilities') + + def test_device_capabilities_tensorcore_field(self): + """Test that DeviceCapabilities has tensorcore field.""" + # Get capabilities for current device + caps = gp.get_device_capabilities() + + assert hasattr(caps, 'tensorcore') + assert isinstance(caps.tensorcore, bool) + + def test_device_capabilities_sm_version(self): + """Test that DeviceCapabilities has SM version info.""" + caps = gp.get_device_capabilities() + + assert hasattr(caps, 'sm_version') or hasattr(caps, 'compute_capability') + + def test_tensorcore_requires_sm80(self): + """Test that tensorcore is True only for SM >= 80.""" + caps = gp.get_device_capabilities() + + sm_version = getattr(caps, 'sm_version', None) or getattr(caps, 'compute_capability', 0) + if sm_version >= 80: + # Ampere or newer should have tensor cores + assert caps.tensorcore is True + else: + # Older GPUs don't have TF32 tensor cores + assert caps.tensorcore is False + + +class TestKernelTypeRust: + """Tests for Rust kernel type enum.""" + + def test_kernel_type_exists(self): + """Test that KernelType enum is available from Rust.""" + # This should be exposed via pygpukit._pygpukit_rust + try: + from pygpukit._pygpukit_rust import KernelType + assert hasattr(KernelType, 'Tf32Mma') or hasattr(KernelType, 'TF32_MMA') + except ImportError: + # Rust module may not be built yet - skip + pytest.skip("Rust module not available") + + def test_kernel_type_fp32_exists(self): + """Test that FP32 kernel type exists.""" + try: + from pygpukit._pygpukit_rust import KernelType + assert hasattr(KernelType, 'Fp32Fma') or hasattr(KernelType, 'FP32_FMA') + except ImportError: + pytest.skip("Rust module not available") + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"])