diff --git a/native/bindings/ops_bindings.cpp b/native/bindings/ops_bindings.cpp
index d46b536..539a41f 100644
--- a/native/bindings/ops_bindings.cpp
+++ b/native/bindings/ops_bindings.cpp
@@ -31,4 +31,13 @@ void init_ops_bindings(py::module_& m) {
     m.def("matmul_", py::overload_cast<const GPUArray&, const GPUArray&, GPUArray&>(&ops::matmul),
           py::arg("a"), py::arg("b"), py::arg("out"),
           "Matrix multiplication with output array");
+
+    // TF32 variants
+    m.def("matmul_tf32", py::overload_cast<const GPUArray&, const GPUArray&, bool>(&ops::matmul),
+          py::arg("a"), py::arg("b"), py::arg("use_tf32"),
+          "Matrix multiplication with explicit TF32 control");
+
+    m.def("matmul_tf32_", py::overload_cast<const GPUArray&, const GPUArray&, GPUArray&, bool>(&ops::matmul),
+          py::arg("a"), py::arg("b"), py::arg("out"), py::arg("use_tf32"),
+          "Matrix multiplication with explicit TF32 control and output array");
 }
diff --git a/native/ops/basic.cu b/native/ops/basic.cu
index 78001f1..fdd7980 100644
--- a/native/ops/basic.cu
+++ b/native/ops/basic.cu
@@ -939,5 +939,149 @@ GPUArray matmul(const GPUArray& a, const GPUArray& b) {
     return c;
 }
 
+// Internal helper: matmul with explicit TF32 control
+static void matmul_impl(const GPUArray& a, const GPUArray& b, GPUArray& c, bool use_tf32_explicit) {
+    validate_matmul_shapes(a, b, "matmul");
+    validate_same_dtype(a, b, "matmul");
+
+    size_t M = a.shape()[0];
+    size_t K = a.shape()[1];
+    size_t N = b.shape()[1];
+
+    if (c.shape()[0] != M || c.shape()[1] != N) {
+        throw std::runtime_error("matmul output shape mismatch");
+    }
+
+    // Check GPU compute capability for TF32 support
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    int sm_version = prop.major * 10 + prop.minor;
+
+    // TF32 only works with float32 and SM >= 80
+    bool tf32_enabled = use_tf32_explicit &&
+                        (a.dtype() == DataType::Float32) &&
+                        (sm_version >= 80);
+
+    if (use_tf32_explicit && !tf32_enabled) {
+        if (a.dtype() != DataType::Float32) {
+            throw std::runtime_error("TF32 matmul requires float32 dtype");
+        }
+        if (sm_version < 80) {
+            throw std::runtime_error("TF32 matmul requires SM >= 80 (Ampere or newer)");
+        }
+    }
+
+    // Use the TF32 kernel for explicit requests on large matrices,
+    // or for the verified 16x8 / 16x16 single-tile shapes
+    bool use_tf32 = tf32_enabled &&
+                    ((M >= OPTIMIZED_MATMUL_THRESHOLD &&
+                      N >= OPTIMIZED_MATMUL_THRESHOLD &&
+                      K >= OPTIMIZED_MATMUL_THRESHOLD) ||
+                     (M == 16 && (N == 8 || N == 16)));
+
+    bool use_optimized = !use_tf32 &&
+                         (a.dtype() == DataType::Float32) &&
+                         (M >= OPTIMIZED_MATMUL_THRESHOLD ||
+                          N >= OPTIMIZED_MATMUL_THRESHOLD ||
+                          K >= OPTIMIZED_MATMUL_THRESHOLD);
+
+    bool use_tiled = !use_optimized && !use_tf32 &&
+                     (M >= TILED_MATMUL_THRESHOLD ||
+                      N >= TILED_MATMUL_THRESHOLD ||
+                      K >= TILED_MATMUL_THRESHOLD);
+
+    if (use_tf32) {
+        // TF32 TensorCore kernels
+        if (M == 16 && (N == 8 || N == 16)) {
+            tf32::launch_single_tile_verified(
+                static_cast<const float*>(a.data()),
+                static_cast<const float*>(b.data()),
+                static_cast<float*>(c.data()),
+                M, N, K);
+        } else {
+            tf32::launch_sgemm_tf32(
+                static_cast<const float*>(a.data()),
+                static_cast<const float*>(b.data()),
+                static_cast<float*>(c.data()),
+                M, N, K);
+        }
+    } else if (use_optimized) {
+        ampere::launch_sgemm_ampere(
+            static_cast<const float*>(a.data()),
+            static_cast<const float*>(b.data()),
+            static_cast<float*>(c.data()),
+            M, N, K);
+    } else if (use_tiled) {
+        dim3 block_size(TILE_N / THREAD_N, TILE_M / THREAD_M);
+        dim3 grid_size(
+            (N + TILE_N - 1) / TILE_N,
+            (M + TILE_M - 1) / TILE_M
+        );
+
+        switch (a.dtype()) {
+            case DataType::Float32:
+                matmul_f32_tiled_kernel<<<grid_size, block_size>>>(
+                    static_cast<const float*>(a.data()),
+                    static_cast<const float*>(b.data()),
+                    static_cast<float*>(c.data()),
+                    M, N, K);
+                break;
+            case DataType::Float64:
+                matmul_f64_tiled_kernel<<<grid_size, block_size>>>(
+                    static_cast<const double*>(a.data()),
+                    static_cast<const double*>(b.data()),
+                    static_cast<double*>(c.data()),
+                    M, N, K);
+                break;
+            default:
+                throw std::runtime_error("matmul only supports float32 and float64");
+        }
+    } else {
+        dim3 block_size(BLOCK_SIZE, BLOCK_SIZE);
+        dim3 grid_size(
+            (N + BLOCK_SIZE - 1) / BLOCK_SIZE,
+            (M + BLOCK_SIZE - 1) / BLOCK_SIZE
+        );
+
+        switch (a.dtype()) {
+            case DataType::Float32:
+                matmul_f32_l2opt_kernel<<<grid_size, block_size>>>(
+                    static_cast<const float*>(a.data()),
+                    static_cast<const float*>(b.data()),
+                    static_cast<float*>(c.data()),
+                    M, N, K);
+                break;
+            case DataType::Float64:
+                matmul_f64_l2opt_kernel<<<grid_size, block_size>>>(
+                    static_cast<const double*>(a.data()),
+                    static_cast<const double*>(b.data()),
+                    static_cast<double*>(c.data()),
+                    M, N, K);
+                break;
+            default:
+                throw std::runtime_error("matmul only supports float32 and float64");
+        }
+    }
+
+    sync_and_check("matmul kernel failed");
+}
+
+void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c, bool use_tf32) {
+    matmul_impl(a, b, c, use_tf32);
+}
+
+GPUArray matmul(const GPUArray& a, const GPUArray& b, bool use_tf32) {
+    validate_matmul_shapes(a, b, "matmul");
+    validate_same_dtype(a, b, "matmul");
+
+    size_t M = a.shape()[0];
+    size_t N = b.shape()[1];
+
+    GPUArray c({M, N}, a.dtype());
+    matmul_impl(a, b, c, use_tf32);
+    return c;
+}
+
 } // namespace ops
 } // namespace pygpukit
diff --git a/native/ops/basic.cuh b/native/ops/basic.cuh
index 14de780..9af5c3c 100644
--- a/native/ops/basic.cuh
+++ b/native/ops/basic.cuh
@@ -16,10 +16,17 @@ void mul(const GPUArray& a, const GPUArray& b, GPUArray& c);
 // a: (M, K), b: (K, N), c: (M, N)
 void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c);
 
+// Matrix multiplication with explicit TF32 control
+// use_tf32: force TF32 TensorCore path (requires SM >= 80 and float32)
+void matmul(const GPUArray& a, const GPUArray& b, GPUArray& c, bool use_tf32);
+
 // Convenience functions that return new arrays
 GPUArray add(const GPUArray& a, const GPUArray& b);
 GPUArray mul(const GPUArray& a, const GPUArray& b);
 GPUArray matmul(const GPUArray& a, const GPUArray& b);
 
+// Matmul with explicit TF32 control
+GPUArray matmul(const GPUArray& a, const GPUArray& b, bool use_tf32);
+
 } // namespace ops
 } // namespace pygpukit
diff --git a/rust/pygpukit-core/src/device.rs b/rust/pygpukit-core/src/device.rs
new file mode 100644
index 0000000..3661ffe
--- /dev/null
+++ b/rust/pygpukit-core/src/device.rs
@@ -0,0 +1,284 @@
+//! Device capabilities and kernel type definitions
+//!
+//! Provides GPU device capability detection and kernel type enumeration
+//! for selecting appropriate kernel implementations.
+
+/// GPU kernel type enumeration
+///
+/// Represents different kernel implementations available for GPU operations.
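+///
+/// A minimal doc-test sketch (illustrative; it only exercises the helper
+/// methods defined below):
+///
+/// ```
+/// use pygpukit_core::device::KernelType;
+///
+/// let k = KernelType::Tf32Mma;
+/// assert!(k.uses_tensor_cores());
+/// assert_eq!(k.min_sm_version(), 80); // Ampere
+/// ```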
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum KernelType {
+    /// FP32 FMA (Fused Multiply-Add) kernel
+    /// Standard FP32 precision, used for maximum accuracy
+    Fp32Fma,
+
+    /// TF32 MMA (Matrix Multiply-Accumulate) kernel using Tensor Cores
+    /// Uses TF32 precision (10-bit mantissa, 8-bit exponent) for faster computation
+    /// Only available on Ampere (SM80+) and newer GPUs
+    Tf32Mma,
+
+    /// FP16 MMA kernel using Tensor Cores
+    /// Uses FP16 precision for maximum throughput
+    Fp16Mma,
+
+    /// BF16 MMA kernel using Tensor Cores
+    /// Uses BF16 precision (8-bit exponent, 7-bit mantissa)
+    Bf16Mma,
+
+    /// L2-optimized naive kernel
+    /// Simple kernel optimized for L2 cache locality
+    L2Naive,
+
+    /// Tiled shared memory kernel
+    /// Uses shared memory tiling for memory bandwidth optimization
+    TiledSmem,
+}
+
+impl KernelType {
+    /// Check if this kernel type uses Tensor Cores
+    pub fn uses_tensor_cores(&self) -> bool {
+        matches!(self, KernelType::Tf32Mma | KernelType::Fp16Mma | KernelType::Bf16Mma)
+    }
+
+    /// Get the minimum SM version required for this kernel type
+    pub fn min_sm_version(&self) -> u32 {
+        match self {
+            KernelType::Fp32Fma => 60,   // Pascal
+            KernelType::L2Naive => 60,   // Pascal
+            KernelType::TiledSmem => 60, // Pascal
+            KernelType::Tf32Mma => 80,   // Ampere
+            KernelType::Fp16Mma => 70,   // Volta (but better on Ampere)
+            KernelType::Bf16Mma => 80,   // Ampere
+        }
+    }
+
+    /// Get human-readable name
+    pub fn name(&self) -> &'static str {
+        match self {
+            KernelType::Fp32Fma => "FP32 FMA",
+            KernelType::Tf32Mma => "TF32 MMA (TensorCore)",
+            KernelType::Fp16Mma => "FP16 MMA (TensorCore)",
+            KernelType::Bf16Mma => "BF16 MMA (TensorCore)",
+            KernelType::L2Naive => "L2 Naive",
+            KernelType::TiledSmem => "Tiled Shared Memory",
+        }
+    }
+}
+
+impl std::fmt::Display for KernelType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.name())
+    }
+}
+
+/// GPU Device Capabilities
+///
+/// Contains information about GPU hardware capabilities
+/// used for kernel selection and optimization decisions.
+#[derive(Debug, Clone, Default)]
+pub struct DeviceCapabilities {
+    /// Device index
+    pub device_id: u32,
+
+    /// Device name (e.g., "NVIDIA GeForce RTX 3090 Ti")
+    pub name: String,
+
+    /// SM (Streaming Multiprocessor) version
+    /// Computed as major * 10 + minor (e.g., SM 8.6 = 86)
+    pub sm_version: u32,
+
+    /// Compute capability major version
+    pub compute_major: u32,
+
+    /// Compute capability minor version
+    pub compute_minor: u32,
+
+    /// Whether TF32 Tensor Cores are available (SM >= 80)
+    pub tensorcore: bool,
+
+    /// Whether FP16 Tensor Cores are available (SM >= 70)
+    pub tensorcore_fp16: bool,
+
+    /// Whether BF16 Tensor Cores are available (SM >= 80)
+    pub tensorcore_bf16: bool,
+
+    /// Total global memory in bytes
+    pub total_memory: u64,
+
+    /// L2 cache size in bytes
+    pub l2_cache_size: u32,
+
+    /// Shared memory per block in bytes
+    pub shared_mem_per_block: u32,
+
+    /// Maximum threads per block
+    pub max_threads_per_block: u32,
+
+    /// Number of SMs
+    pub sm_count: u32,
+
+    /// Warp size
+    pub warp_size: u32,
+
+    /// Whether async copy (cp.async) is supported (SM >= 80)
+    pub async_copy: bool,
+}
+
+impl DeviceCapabilities {
+    /// Create capabilities for a specific SM version
+    ///
+    /// This is useful for testing or when actual device info is not available.
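+    ///
+    /// Illustrative sketch (mirrors the unit tests below):
+    ///
+    /// ```
+    /// use pygpukit_core::device::DeviceCapabilities;
+    ///
+    /// let caps = DeviceCapabilities::from_sm_version(86); // e.g. an RTX 30xx part
+    /// assert!(caps.tensorcore && caps.tensorcore_fp16 && caps.async_copy);
+    /// ```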
+    pub fn from_sm_version(sm_version: u32) -> Self {
+        let compute_major = sm_version / 10;
+        let compute_minor = sm_version % 10;
+
+        Self {
+            device_id: 0,
+            name: format!("SM {}.{}", compute_major, compute_minor),
+            sm_version,
+            compute_major,
+            compute_minor,
+            tensorcore: sm_version >= 80,
+            tensorcore_fp16: sm_version >= 70,
+            tensorcore_bf16: sm_version >= 80,
+            total_memory: 0,
+            l2_cache_size: 0,
+            shared_mem_per_block: 49152, // 48KB default
+            max_threads_per_block: 1024,
+            sm_count: 0,
+            warp_size: 32,
+            async_copy: sm_version >= 80,
+        }
+    }
+
+    /// Create an Ampere (RTX 30xx / A100) device
+    pub fn ampere() -> Self {
+        Self::from_sm_version(86)
+    }
+
+    /// Create an Ada (RTX 40xx) device
+    pub fn ada() -> Self {
+        Self::from_sm_version(89)
+    }
+
+    /// Create a Hopper (H100) device
+    pub fn hopper() -> Self {
+        Self::from_sm_version(90)
+    }
+
+    /// Check if a kernel type is supported
+    pub fn supports_kernel(&self, kernel_type: KernelType) -> bool {
+        self.sm_version >= kernel_type.min_sm_version()
+    }
+
+    /// Get the best kernel type for matmul
+    ///
+    /// # Arguments
+    /// * `use_tf32` - Whether TF32 is allowed
+    /// * `dtype_is_fp32` - Whether the data type is FP32
+    /// * `large_matrix` - Whether the matrix is large enough for optimized kernels
+    pub fn best_matmul_kernel(&self, use_tf32: bool, dtype_is_fp32: bool, large_matrix: bool) -> KernelType {
+        if use_tf32 && dtype_is_fp32 && self.tensorcore && large_matrix {
+            KernelType::Tf32Mma
+        } else if dtype_is_fp32 && large_matrix && self.sm_version >= 80 {
+            // Ampere-optimized FP32 FMA
+            KernelType::Fp32Fma
+        } else if large_matrix {
+            KernelType::TiledSmem
+        } else {
+            KernelType::L2Naive
+        }
+    }
+
+    /// Check if this is an Ampere or newer GPU
+    pub fn is_ampere_or_newer(&self) -> bool {
+        self.sm_version >= 80
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_kernel_type_tensor_cores() {
+        assert!(!KernelType::Fp32Fma.uses_tensor_cores());
+        assert!(KernelType::Tf32Mma.uses_tensor_cores());
+        assert!(KernelType::Fp16Mma.uses_tensor_cores());
+        assert!(KernelType::Bf16Mma.uses_tensor_cores());
+        assert!(!KernelType::L2Naive.uses_tensor_cores());
+    }
+
+    #[test]
+    fn test_kernel_type_min_sm() {
+        assert_eq!(KernelType::Fp32Fma.min_sm_version(), 60);
+        assert_eq!(KernelType::Tf32Mma.min_sm_version(), 80);
+        assert_eq!(KernelType::Fp16Mma.min_sm_version(), 70);
+        assert_eq!(KernelType::Bf16Mma.min_sm_version(), 80);
+    }
+
+    #[test]
+    fn test_device_capabilities_from_sm() {
+        let caps = DeviceCapabilities::from_sm_version(86);
+        assert_eq!(caps.sm_version, 86);
+        assert_eq!(caps.compute_major, 8);
+        assert_eq!(caps.compute_minor, 6);
+        assert!(caps.tensorcore);
+        assert!(caps.tensorcore_fp16);
+        assert!(caps.tensorcore_bf16);
+        assert!(caps.async_copy);
+    }
+
+    #[test]
+    fn test_device_capabilities_old_gpu() {
+        let caps = DeviceCapabilities::from_sm_version(75);
+        assert_eq!(caps.sm_version, 75);
+        assert!(!caps.tensorcore);      // TF32 requires SM80
+        assert!(caps.tensorcore_fp16);  // FP16 tensor cores on Turing
+        assert!(!caps.tensorcore_bf16); // BF16 requires SM80
+        assert!(!caps.async_copy);
+    }
+
+    #[test]
+    fn test_supports_kernel() {
+        let ampere = DeviceCapabilities::ampere();
+        assert!(ampere.supports_kernel(KernelType::Fp32Fma));
+        assert!(ampere.supports_kernel(KernelType::Tf32Mma));
+        assert!(ampere.supports_kernel(KernelType::Fp16Mma));
+
+        let turing = DeviceCapabilities::from_sm_version(75);
+        assert!(turing.supports_kernel(KernelType::Fp32Fma));
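+        // Turing (SM 7.5) has FP16 Tensor Cores but predates Ampere's TF32 support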
+        assert!(!turing.supports_kernel(KernelType::Tf32Mma));
+        assert!(turing.supports_kernel(KernelType::Fp16Mma));
+    }
+
+    #[test]
+    fn test_best_matmul_kernel_tf32() {
+        let ampere = DeviceCapabilities::ampere();
+
+        // TF32 enabled, FP32 dtype, large matrix
+        let kernel = ampere.best_matmul_kernel(true, true, true);
+        assert_eq!(kernel, KernelType::Tf32Mma);
+
+        // TF32 disabled
+        let kernel = ampere.best_matmul_kernel(false, true, true);
+        assert_eq!(kernel, KernelType::Fp32Fma);
+    }
+
+    #[test]
+    fn test_best_matmul_kernel_small_matrix() {
+        let ampere = DeviceCapabilities::ampere();
+
+        // Small matrix should use L2 naive
+        let kernel = ampere.best_matmul_kernel(true, true, false);
+        assert_eq!(kernel, KernelType::L2Naive);
+    }
+
+    #[test]
+    fn test_is_ampere_or_newer() {
+        assert!(DeviceCapabilities::ampere().is_ampere_or_newer());
+        assert!(DeviceCapabilities::ada().is_ampere_or_newer());
+        assert!(DeviceCapabilities::hopper().is_ampere_or_newer());
+        assert!(!DeviceCapabilities::from_sm_version(75).is_ampere_or_newer());
+    }
+}
diff --git a/rust/pygpukit-core/src/lib.rs b/rust/pygpukit-core/src/lib.rs
index 169609c..cf47f0d 100644
--- a/rust/pygpukit-core/src/lib.rs
+++ b/rust/pygpukit-core/src/lib.rs
@@ -6,11 +6,13 @@
 //! - Async memory transfer engine with separate streams
 //! - Kernel dispatch controller with stream management
 //! - Kernel pacing engine with bandwidth control
+//! - Device capabilities and kernel type selection
 
 pub mod memory;
 pub mod scheduler;
 pub mod transfer;
 pub mod dispatch;
+pub mod device;
 
 pub use memory::{MemoryBlock, MemoryPool, PoolStats, MemoryError};
 pub use scheduler::{
@@ -29,3 +31,4 @@ pub use dispatch::{
     SliceScheduler, SliceConfig, SlicedKernel, KernelSlice, SliceInfo, SliceStats,
     KernelCache, CacheConfig, CachedKernel, CompileOptions, CacheStats,
 };
+pub use device::{KernelType, DeviceCapabilities};
diff --git a/rust/pygpukit-python/src/device.rs b/rust/pygpukit-python/src/device.rs
new file mode 100644
index 0000000..6c11b78
--- /dev/null
+++ b/rust/pygpukit-python/src/device.rs
@@ -0,0 +1,279 @@
+//! PyO3 bindings for device capabilities and kernel types
+
+use pyo3::prelude::*;
+use pygpukit_core::device::{DeviceCapabilities, KernelType};
+
+/// Python-exposed kernel type enum
+#[pyclass(name = "KernelType")]
+#[derive(Clone)]
+pub struct PyKernelType {
+    inner: KernelType,
+}
+
+#[pymethods]
+impl PyKernelType {
+    /// FP32 FMA kernel
+    #[classattr]
+    fn FP32_FMA() -> Self {
+        Self { inner: KernelType::Fp32Fma }
+    }
+
+    /// TF32 MMA (TensorCore) kernel
+    #[classattr]
+    fn TF32_MMA() -> Self {
+        Self { inner: KernelType::Tf32Mma }
+    }
+
+    /// FP16 MMA (TensorCore) kernel
+    #[classattr]
+    fn FP16_MMA() -> Self {
+        Self { inner: KernelType::Fp16Mma }
+    }
+
+    /// BF16 MMA (TensorCore) kernel
+    #[classattr]
+    fn BF16_MMA() -> Self {
+        Self { inner: KernelType::Bf16Mma }
+    }
+
+    /// L2-optimized naive kernel
+    #[classattr]
+    fn L2_NAIVE() -> Self {
+        Self { inner: KernelType::L2Naive }
+    }
+
+    /// Tiled shared memory kernel
+    #[classattr]
+    fn TILED_SMEM() -> Self {
+        Self { inner: KernelType::TiledSmem }
+    }
+
+    /// Check if this kernel type uses Tensor Cores
+    fn uses_tensor_cores(&self) -> bool {
+        self.inner.uses_tensor_cores()
+    }
+
+    /// Get the minimum SM version required
+    fn min_sm_version(&self) -> u32 {
+        self.inner.min_sm_version()
+    }
+
+    /// Get human-readable name
+    fn name(&self) -> &'static str {
+        self.inner.name()
+    }
+
+    fn __repr__(&self) -> String {
+        format!("KernelType.{}", match self.inner {
+            KernelType::Fp32Fma => "FP32_FMA",
+            KernelType::Tf32Mma => "TF32_MMA",
+            KernelType::Fp16Mma => "FP16_MMA",
+            KernelType::Bf16Mma => "BF16_MMA",
+            KernelType::L2Naive => "L2_NAIVE",
+            KernelType::TiledSmem => "TILED_SMEM",
+        })
+    }
+
+    fn __str__(&self) -> String {
+        self.inner.name().to_string()
+    }
+
+    fn __eq__(&self, other: &PyKernelType) -> bool {
+        self.inner == other.inner
+    }
+}
+
+impl From<KernelType> for PyKernelType {
+    fn from(inner: KernelType) -> Self {
+        Self { inner }
+    }
+}
+
+impl From<&PyKernelType> for KernelType {
+    fn from(py_type: &PyKernelType) -> Self {
+        py_type.inner
+    }
+}
+
+/// Python-exposed device capabilities
+#[pyclass(name = "DeviceCapabilities")]
+#[derive(Clone)]
+pub struct PyDeviceCapabilities {
+    inner: DeviceCapabilities,
+}
+
+#[pymethods]
+impl PyDeviceCapabilities {
+    /// Create capabilities from SM version
+    #[new]
+    #[pyo3(signature = (sm_version=86))]
+    fn new(sm_version: u32) -> Self {
+        Self {
+            inner: DeviceCapabilities::from_sm_version(sm_version),
+        }
+    }
+
+    /// Create Ampere device capabilities
+    #[staticmethod]
+    fn ampere() -> Self {
+        Self {
+            inner: DeviceCapabilities::ampere(),
+        }
+    }
+
+    /// Create Ada device capabilities
+    #[staticmethod]
+    fn ada() -> Self {
+        Self {
+            inner: DeviceCapabilities::ada(),
+        }
+    }
+
+    /// Create Hopper device capabilities
+    #[staticmethod]
+    fn hopper() -> Self {
+        Self {
+            inner: DeviceCapabilities::hopper(),
+        }
+    }
+
+    /// Device ID
+    #[getter]
+    fn device_id(&self) -> u32 {
+        self.inner.device_id
+    }
+
+    /// Device name
+    #[getter]
+    fn name(&self) -> &str {
+        &self.inner.name
+    }
+
+    /// SM version (e.g., 86 for SM 8.6)
+    #[getter]
+    fn sm_version(&self) -> u32 {
+        self.inner.sm_version
+    }
+
+    /// Compute capability (alias for sm_version)
+    #[getter]
+    fn compute_capability(&self) -> u32 {
+        self.inner.sm_version
+    }
+
+    /// Compute major version
+    #[getter]
+    fn compute_major(&self) -> u32 {
+        self.inner.compute_major
+    }
+
+    /// Compute minor version
+    #[getter]
+    fn compute_minor(&self) -> u32 {
+        self.inner.compute_minor
+    }
+
+    /// Whether TF32 Tensor Cores are available (SM >= 80)
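+    /// Exposed to Python as a read-only property (e.g. `caps.tensorcore`)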
+    #[getter]
+    fn tensorcore(&self) -> bool {
+        self.inner.tensorcore
+    }
+
+    /// Whether FP16 Tensor Cores are available (SM >= 70)
+    #[getter]
+    fn tensorcore_fp16(&self) -> bool {
+        self.inner.tensorcore_fp16
+    }
+
+    /// Whether BF16 Tensor Cores are available (SM >= 80)
+    #[getter]
+    fn tensorcore_bf16(&self) -> bool {
+        self.inner.tensorcore_bf16
+    }
+
+    /// Total global memory in bytes
+    #[getter]
+    fn total_memory(&self) -> u64 {
+        self.inner.total_memory
+    }
+
+    /// L2 cache size in bytes
+    #[getter]
+    fn l2_cache_size(&self) -> u32 {
+        self.inner.l2_cache_size
+    }
+
+    /// Shared memory per block in bytes
+    #[getter]
+    fn shared_mem_per_block(&self) -> u32 {
+        self.inner.shared_mem_per_block
+    }
+
+    /// Maximum threads per block
+    #[getter]
+    fn max_threads_per_block(&self) -> u32 {
+        self.inner.max_threads_per_block
+    }
+
+    /// Number of SMs
+    #[getter]
+    fn sm_count(&self) -> u32 {
+        self.inner.sm_count
+    }
+
+    /// Warp size
+    #[getter]
+    fn warp_size(&self) -> u32 {
+        self.inner.warp_size
+    }
+
+    /// Whether async copy (cp.async) is supported
+    #[getter]
+    fn async_copy(&self) -> bool {
+        self.inner.async_copy
+    }
+
+    /// Check if a kernel type is supported
+    fn supports_kernel(&self, kernel_type: &PyKernelType) -> bool {
+        self.inner.supports_kernel(kernel_type.inner)
+    }
+
+    /// Get the best matmul kernel type
+    #[pyo3(signature = (use_tf32=false, dtype_is_fp32=true, large_matrix=true))]
+    fn best_matmul_kernel(&self, use_tf32: bool, dtype_is_fp32: bool, large_matrix: bool) -> PyKernelType {
+        self.inner.best_matmul_kernel(use_tf32, dtype_is_fp32, large_matrix).into()
+    }
+
+    /// Check if this is Ampere or newer
+    fn is_ampere_or_newer(&self) -> bool {
+        self.inner.is_ampere_or_newer()
+    }
+
+    fn __repr__(&self) -> String {
+        format!(
+            "DeviceCapabilities(sm_version={}, tensorcore={}, name='{}')",
+            self.inner.sm_version,
+            self.inner.tensorcore,
+            self.inner.name
+        )
+    }
+}
+
+impl From<DeviceCapabilities> for PyDeviceCapabilities {
+    fn from(inner: DeviceCapabilities) -> Self {
+        Self { inner }
+    }
+}
+
+impl From<&PyDeviceCapabilities> for DeviceCapabilities {
+    fn from(py_caps: &PyDeviceCapabilities) -> Self {
+        py_caps.inner.clone()
+    }
+}
+
+/// Register device module
+pub fn register(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PyKernelType>()?;
+    m.add_class::<PyDeviceCapabilities>()?;
+    Ok(())
+}
diff --git a/rust/pygpukit-python/src/lib.rs b/rust/pygpukit-python/src/lib.rs
index 8ea56e2..5ff3cc0 100644
--- a/rust/pygpukit-python/src/lib.rs
+++ b/rust/pygpukit-python/src/lib.rs
@@ -9,6 +9,7 @@
 mod memory;
 mod scheduler;
 mod transfer;
 mod dispatch;
+mod device;
 
 /// PyGPUkit Rust module
 #[pymodule]
@@ -33,6 +34,11 @@ fn _pygpukit_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
     dispatch::register(&dispatch_module)?;
     m.add_submodule(&dispatch_module)?;
 
+    // Device submodule
+    let device_module = PyModule::new(m.py(), "device")?;
+    device::register(&device_module)?;
+    m.add_submodule(&device_module)?;
+
     // Also export at top level for convenience
     m.add_class::()?;
     m.add_class::()?;
@@ -91,6 +97,9 @@ fn _pygpukit_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
+    // Device capabilities
+    m.add_class::<device::PyDeviceCapabilities>()?;
+    m.add_class::<device::PyKernelType>()?;
 
     Ok(())
 }
diff --git a/src/pygpukit/__init__.py b/src/pygpukit/__init__.py
index 287a819..f6f1bc8 100644
--- a/src/pygpukit/__init__.py
+++ b/src/pygpukit/__init__.py
@@ -3,13 +3,25 @@
 __version__ = "0.2.0"
 
 from pygpukit.core.array import GPUArray
-from pygpukit.core.device import DeviceInfo, get_device_info, is_cuda_available
+from pygpukit.core.device import (
+    DeviceInfo,
+    get_device_capabilities,
+    get_device_info,
+    is_cuda_available,
+)
 from pygpukit.core.dtypes import DataType, float32, float64, int32, int64
 from pygpukit.core.factory import empty, from_numpy, ones, zeros
 from pygpukit.core.stream import Stream, StreamManager, default_stream
 from pygpukit.jit.compiler import JITKernel, jit
 from pygpukit.ops.basic import add, matmul, mul
 
+# Try to import Rust types
+try:
+    from pygpukit._pygpukit_rust import DeviceCapabilities, KernelType
+except ImportError:
+    DeviceCapabilities = None
+    KernelType = None
+
 __all__ = [
     # Version
     "__version__",
@@ -17,7 +29,10 @@
     "GPUArray",
     # Device
     "DeviceInfo",
+    "DeviceCapabilities",
+    "KernelType",
     "get_device_info",
+    "get_device_capabilities",
     "is_cuda_available",
     # Data types
     "DataType",
diff --git a/src/pygpukit/core/device.py b/src/pygpukit/core/device.py
index bb4edf9..00d3d2a 100644
--- a/src/pygpukit/core/device.py
+++ b/src/pygpukit/core/device.py
@@ -60,3 +60,64 @@ def get_device_info(device_id: int = 0) -> DeviceInfo:
         max_threads_per_block=props.max_threads_per_block,
         warp_size=props.warp_size,
     )
+
+
+def get_device_capabilities(device_id: int = 0):
+    """Get device capabilities from the Rust backend.
+
+    Returns a DeviceCapabilities object with:
+    - sm_version: SM version (e.g., 86 for SM 8.6)
+    - tensorcore: Whether TF32 TensorCores are available
+    - tensorcore_fp16: Whether FP16 TensorCores are available
+    - tensorcore_bf16: Whether BF16 TensorCores are available
+    - async_copy: Whether cp.async is supported
+
+    Args:
+        device_id: Device index (default 0).
+
+    Returns:
+        DeviceCapabilities from the Rust backend, or a pure-Python
+        fallback with the same fields if the Rust module is unavailable.
+    """
+    try:
+        from pygpukit._pygpukit_rust import DeviceCapabilities
+    except ImportError:
+        # Rust module not available - create from device info
+        info = get_device_info(device_id)
+        if info.compute_capability:
+            sm_version = info.compute_capability[0] * 10 + info.compute_capability[1]
+        else:
+            sm_version = 0
+
+        # Create a simple capabilities object
+        @dataclass
+        class _DeviceCapabilities:
+            device_id: int
+            name: str
+            sm_version: int
+            compute_capability: int
+            tensorcore: bool
+            tensorcore_fp16: bool
+            tensorcore_bf16: bool
+            async_copy: bool
+
+        return _DeviceCapabilities(
+            device_id=device_id,
+            name=info.name,
+            sm_version=sm_version,
+            compute_capability=sm_version,
+            tensorcore=sm_version >= 80,
+            tensorcore_fp16=sm_version >= 70,
+            tensorcore_bf16=sm_version >= 80,
+            async_copy=sm_version >= 80,
+        )
+
+    # Get actual device info and create capabilities
+    info = get_device_info(device_id)
+    if info.compute_capability:
+        sm_version = info.compute_capability[0] * 10 + info.compute_capability[1]
+    else:
+        sm_version = 0
+
+    # Create Rust DeviceCapabilities with the actual SM version
+    caps = DeviceCapabilities(sm_version)
+    return caps
diff --git a/src/pygpukit/ops/basic.py b/src/pygpukit/ops/basic.py
index 469b23d..0ed444e 100644
--- a/src/pygpukit/ops/basic.py
+++ b/src/pygpukit/ops/basic.py
@@ -121,18 +121,23 @@ def _mul_native(a: GPUArray, b: GPUArray) -> GPUArray:
     return GPUArray._wrap_native(c_native)
 
 
-def matmul(a: GPUArray, b: GPUArray) -> GPUArray:
+def matmul(a: GPUArray, b: GPUArray, *, use_tf32: bool | None = None) -> GPUArray:
     """Matrix multiplication of two 2D arrays.
 
     Args:
         a: First input array (M x K).
         b: Second input array (K x N).
+        use_tf32: Whether to use TF32 TensorCore acceleration (Ampere+ only).
+            - None (default): Use the PYGPUKIT_ALLOW_TF32 environment variable.
+            - True: Force TF32 mode (requires SM >= 80 and float32).
+            - False: Force FP32 mode.
 
     Returns:
         A new GPUArray containing the matrix product (M x N).
 
     Raises:
         ValueError: If arrays are not 2D or dimensions don't match.
+        RuntimeError: If use_tf32=True but the GPU doesn't support it or dtype is not float32.
     """
     if a.ndim != 2:
         raise ValueError(f"matmul requires 2D arrays, got {a.ndim}D for first argument")
@@ -147,10 +152,16 @@ def matmul(a: GPUArray, b: GPUArray) -> GPUArray:
 
     _validate_same_dtype(a, b, "matmul")
 
+    # Check TF32 dtype requirement early (before backend dispatch)
+    if use_tf32 is True:
+        from pygpukit.core.dtypes import float32
+
+        if a.dtype != float32:
+            raise RuntimeError("TF32 matmul requires float32 dtype")
+
     backend = get_backend()
 
     if isinstance(backend, NativeBackend) and backend.is_available():
-        return _matmul_native(a, b)
+        return _matmul_native(a, b, use_tf32=use_tf32)
     else:
         return _matmul_cpu(a, b)
 
@@ -163,8 +174,15 @@ def _matmul_cpu(a: GPUArray, b: GPUArray) -> GPUArray:
     return from_numpy(result_np)
 
 
-def _matmul_native(a: GPUArray, b: GPUArray) -> GPUArray:
-    """Native C++ CUDA implementation of matmul (zero-copy)."""
+def _matmul_native(a: GPUArray, b: GPUArray, *, use_tf32: bool | None = None) -> GPUArray:
+    """Native C++ CUDA implementation of matmul (zero-copy).
+
+    Args:
+        a: First input array.
+        b: Second input array.
+        use_tf32: Whether to use TF32 TensorCore acceleration.
+            None means use the PYGPUKIT_ALLOW_TF32 environment variable.
+    """
     from pygpukit.core.backend import get_native_module
 
     native = get_native_module()
@@ -174,7 +192,12 @@ def _matmul_native(a: GPUArray, b: GPUArray) -> GPUArray:
     b_native = b._get_native()
 
     # Perform operation on GPU
-    c_native = native.matmul(a_native, b_native)
+    if use_tf32 is not None:
+        # Use explicit TF32 control
+        c_native = native.matmul_tf32(a_native, b_native, use_tf32)
+    else:
+        # Use environment variable for TF32 control
+        c_native = native.matmul(a_native, b_native)
 
     # Wrap result (zero-copy)
     return GPUArray._wrap_native(c_native)
diff --git a/tests/test_tf32_api.py b/tests/test_tf32_api.py
new file mode 100644
index 0000000..59773e7
--- /dev/null
+++ b/tests/test_tf32_api.py
@@ -0,0 +1,175 @@
+"""Tests for TF32 API integration (TDD).
+
+These tests verify the use_tf32 parameter for matmul and
+the Rust-side DeviceCapabilities.tensorcore support.
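+
+Illustrative usage of the API under test (assumes a CUDA device; the TF32
+path additionally requires float32 inputs and an Ampere-class GPU, SM >= 80):
+
+    a = gp.from_numpy(np.random.rand(256, 256).astype(np.float32))
+    b = gp.from_numpy(np.random.rand(256, 256).astype(np.float32))
+    c = gp.matmul(a, b, use_tf32=True)  # RuntimeError if unsupported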
+""" + +import numpy as np +import pytest + +import pygpukit as gp + + +class TestMatmulTF32API: + """Tests for matmul use_tf32 parameter.""" + + def test_matmul_use_tf32_false_default(self): + """Test that use_tf32=False is the default behavior.""" + np.random.seed(42) + a_np = np.random.rand(64, 64).astype(np.float32) + b_np = np.random.rand(64, 64).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + # Default behavior (no use_tf32 arg) should use FP32 + c = gp.matmul(a, b) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + # FP32 should be very accurate + np.testing.assert_array_almost_equal(result, expected, decimal=4) + + def test_matmul_use_tf32_explicit_false(self): + """Test matmul with explicit use_tf32=False.""" + np.random.seed(42) + a_np = np.random.rand(64, 64).astype(np.float32) + b_np = np.random.rand(64, 64).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + c = gp.matmul(a, b, use_tf32=False) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + np.testing.assert_array_almost_equal(result, expected, decimal=4) + + def test_matmul_use_tf32_true_correctness(self): + """Test matmul with use_tf32=True produces correct results within TF32 tolerance.""" + np.random.seed(42) + # Large enough to trigger TF32 kernel + a_np = np.random.rand(1024, 1024).astype(np.float32) + b_np = np.random.rand(1024, 1024).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + c = gp.matmul(a, b, use_tf32=True) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + + # TF32 has lower precision (~0.1% relative error per op) + # For 1024 accumulations, expect ~1-5% relative error + rel_error = np.abs(result - expected) / (np.abs(expected) + 1e-8) + max_rel_error = np.max(rel_error) + assert max_rel_error < 0.1, f"TF32 relative error too high: {max_rel_error}" + + def test_matmul_use_tf32_small_matrix_fallback(self): + """Test that small matrices with use_tf32=True still work (may fallback to FP32).""" + np.random.seed(42) + a_np = np.random.rand(16, 16).astype(np.float32) + b_np = np.random.rand(16, 16).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + # Small matrix - implementation may use FP32 fallback + c = gp.matmul(a, b, use_tf32=True) + + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + # Should still be reasonably accurate + np.testing.assert_array_almost_equal(result, expected, decimal=2) + + def test_matmul_use_tf32_float64_raises(self): + """Test that use_tf32=True with float64 raises an error.""" + a_np = np.random.rand(64, 64).astype(np.float64) + b_np = np.random.rand(64, 64).astype(np.float64) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + # TF32 only works with float32 - should raise RuntimeError + with pytest.raises(RuntimeError, match="float32"): + gp.matmul(a, b, use_tf32=True) + + def test_matmul_use_tf32_rectangular(self): + """Test TF32 matmul with rectangular matrices.""" + np.random.seed(42) + a_np = np.random.rand(512, 1024).astype(np.float32) + b_np = np.random.rand(1024, 768).astype(np.float32) + + a = gp.from_numpy(a_np) + b = gp.from_numpy(b_np) + + c = gp.matmul(a, b, use_tf32=True) + + assert c.shape == (512, 768) + result = c.to_numpy() + expected = np.matmul(a_np, b_np) + + rel_error = np.abs(result - expected) / (np.abs(expected) + 1e-8) + max_rel_error = np.max(rel_error) + assert max_rel_error < 0.1, f"TF32 relative error too high: {max_rel_error}" + + +class TestDeviceCapabilities: + """Tests for DeviceCapabilities 
from Rust.""" + + def test_device_capabilities_exists(self): + """Test that DeviceCapabilities class is available.""" + assert hasattr(gp, 'DeviceCapabilities') or hasattr(gp, 'get_device_capabilities') + + def test_device_capabilities_tensorcore_field(self): + """Test that DeviceCapabilities has tensorcore field.""" + # Get capabilities for current device + caps = gp.get_device_capabilities() + + assert hasattr(caps, 'tensorcore') + assert isinstance(caps.tensorcore, bool) + + def test_device_capabilities_sm_version(self): + """Test that DeviceCapabilities has SM version info.""" + caps = gp.get_device_capabilities() + + assert hasattr(caps, 'sm_version') or hasattr(caps, 'compute_capability') + + def test_tensorcore_requires_sm80(self): + """Test that tensorcore is True only for SM >= 80.""" + caps = gp.get_device_capabilities() + + sm_version = getattr(caps, 'sm_version', None) or getattr(caps, 'compute_capability', 0) + if sm_version >= 80: + # Ampere or newer should have tensor cores + assert caps.tensorcore is True + else: + # Older GPUs don't have TF32 tensor cores + assert caps.tensorcore is False + + +class TestKernelTypeRust: + """Tests for Rust kernel type enum.""" + + def test_kernel_type_exists(self): + """Test that KernelType enum is available from Rust.""" + # This should be exposed via pygpukit._pygpukit_rust + try: + from pygpukit._pygpukit_rust import KernelType + assert hasattr(KernelType, 'Tf32Mma') or hasattr(KernelType, 'TF32_MMA') + except ImportError: + # Rust module may not be built yet - skip + pytest.skip("Rust module not available") + + def test_kernel_type_fp32_exists(self): + """Test that FP32 kernel type exists.""" + try: + from pygpukit._pygpukit_rust import KernelType + assert hasattr(KernelType, 'Fp32Fma') or hasattr(KernelType, 'FP32_FMA') + except ImportError: + pytest.skip("Rust module not available") + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"])