From 28bff0343bebd4958a055168376f3dc75aa27c6c Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 08:40:36 +0900 Subject: [PATCH 01/14] refactor: organize root directory structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create scripts/ directory for development tools - Move benchmark*.py to scripts/ - Move build_cuda*.bat, compile_dump.bat to scripts/ - Move dump_*.cu debug tools to scripts/ - Move demo_scheduler_log.py to examples/ - Delete redundant TechStack.md (info in CLAUDE.md) - Update README.md Project Structure section Root directory now contains only essential project files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- README.md | 2 ++ TechStack.md | 21 ------------------- .../demo_scheduler_log.py | 0 benchmark.py => scripts/benchmark.py | 0 .../benchmark_pytorch.py | 0 .../benchmark_rust.py | 0 build_cuda12.bat => scripts/build_cuda12.bat | 0 build_cuda13.bat => scripts/build_cuda13.bat | 0 compile_dump.bat => scripts/compile_dump.bat | 0 .../dump_c_fragment.cu | 0 .../dump_fragments.cu | 0 11 files changed, 2 insertions(+), 21 deletions(-) delete mode 100644 TechStack.md rename demo_scheduler_log.py => examples/demo_scheduler_log.py (100%) rename benchmark.py => scripts/benchmark.py (100%) rename benchmark_pytorch.py => scripts/benchmark_pytorch.py (100%) rename benchmark_rust.py => scripts/benchmark_rust.py (100%) rename build_cuda12.bat => scripts/build_cuda12.bat (100%) rename build_cuda13.bat => scripts/build_cuda13.bat (100%) rename compile_dump.bat => scripts/compile_dump.bat (100%) rename dump_c_fragment.cu => scripts/dump_c_fragment.cu (100%) rename dump_fragments.cu => scripts/dump_fragments.cu (100%) diff --git a/README.md b/README.md index e85c36c..56fdc5a 100644 --- a/README.md +++ b/README.md @@ -422,7 +422,9 @@ PyGPUkit/ rust/ # Rust backend (memory pool, scheduler) pygpukit-core/ # Pure Rust core logic pygpukit-python/ # PyO3 bindings + docs/ # Documentation guides examples/ # Demo scripts + scripts/ # Build scripts, benchmarks tests/ # Test suite ``` diff --git a/TechStack.md b/TechStack.md deleted file mode 100644 index 1a51581..0000000 --- a/TechStack.md +++ /dev/null @@ -1,21 +0,0 @@ -``` -PyGPUkit -│ -├── python/ → Python API(NumPy互換) -│ -├── core/ -│ ├── C++ (CUDA Runtime API) -│ └── Rust backend(opt-in) -│ -├── memory/ -│ ├── Rust(LRU, pool allocator) -│ └── Python shim -│ -├── scheduler/ -│ ├── Rust(状態管理) -│ └── C++(kernel launch wrappers) -│ -└── jit/ - ├── C++(NVRTC) - └── Python wrappers -``` diff --git a/demo_scheduler_log.py b/examples/demo_scheduler_log.py similarity index 100% rename from demo_scheduler_log.py rename to examples/demo_scheduler_log.py diff --git a/benchmark.py b/scripts/benchmark.py similarity index 100% rename from benchmark.py rename to scripts/benchmark.py diff --git a/benchmark_pytorch.py b/scripts/benchmark_pytorch.py similarity index 100% rename from benchmark_pytorch.py rename to scripts/benchmark_pytorch.py diff --git a/benchmark_rust.py b/scripts/benchmark_rust.py similarity index 100% rename from benchmark_rust.py rename to scripts/benchmark_rust.py diff --git a/build_cuda12.bat b/scripts/build_cuda12.bat similarity index 100% rename from build_cuda12.bat rename to scripts/build_cuda12.bat diff --git a/build_cuda13.bat b/scripts/build_cuda13.bat similarity index 100% rename from build_cuda13.bat rename to scripts/build_cuda13.bat diff --git a/compile_dump.bat b/scripts/compile_dump.bat similarity index 100% 
rename from compile_dump.bat rename to scripts/compile_dump.bat diff --git a/dump_c_fragment.cu b/scripts/dump_c_fragment.cu similarity index 100% rename from dump_c_fragment.cu rename to scripts/dump_c_fragment.cu diff --git a/dump_fragments.cu b/scripts/dump_fragments.cu similarity index 100% rename from dump_fragments.cu rename to scripts/dump_fragments.cu From b84bce5f3da0ef0e4456d9f79755c602b460510c Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 09:10:46 +0900 Subject: [PATCH 02/14] feat(v0.2.9): General LLM Execution - Attention layer and E2E inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## New Features - **Softmax GPU kernel**: Row-wise softmax with numerical stability - **CausalSelfAttention**: Multi-head causal self-attention for GPT-2 - **Full TransformerBlock**: ln_1 -> attention -> residual -> ln_2 -> mlp -> residual ## Changes - `softmax()` operation added to ops API (native GPU + CPU fallback) - `CausalSelfAttention` class with QKV projection and causal masking - `TransformerBlock` updated to support attention (backward compatible) - `load_gpt2_from_safetensors()` now loads attention weights by default ## API - `gpk.softmax(input)` - Row-wise softmax - `gpk.llm.CausalSelfAttention` - Attention module - `load_gpt2_from_safetensors(path, load_attention=True)` - Full model loading ## Architecture Support - GPT-2 E2E inference now possible - GPT-2/GPT-Neo/LLaMA-style architectures supported 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- native/bindings/ops_bindings.cpp | 6 + native/ops/nn/nn.cu | 55 ++++++ native/ops/nn/nn_kernels.cuh | 322 +++++++++++++++++++++++++++++++ native/ops/ops.cuh | 4 + pyproject.toml | 2 +- src/pygpukit/__init__.py | 4 +- src/pygpukit/llm/__init__.py | 2 + src/pygpukit/llm/model.py | 186 +++++++++++++++--- src/pygpukit/ops/__init__.py | 2 + src/pygpukit/ops/basic.py | 46 +++++ 10 files changed, 604 insertions(+), 25 deletions(-) diff --git a/native/bindings/ops_bindings.cpp b/native/bindings/ops_bindings.cpp index 1c365b8..e52c108 100644 --- a/native/bindings/ops_bindings.cpp +++ b/native/bindings/ops_bindings.cpp @@ -139,6 +139,12 @@ void init_ops_bindings(py::module_& m) { py::arg("input"), py::arg("gamma"), py::arg("beta"), py::arg("eps") = 1e-5f, "Layer normalization: (x - mean) / sqrt(var + eps) * gamma + beta"); + // Softmax + m.def("softmax", &ops::softmax, + py::arg("input"), + "Softmax: y[i] = exp(x[i] - max(x)) / sum(exp(x - max(x)))\n" + "Applied row-wise: input [batch, features] -> output [batch, features]"); + // ======================================================================== // Fused Operations (CUTLASS Epilogue Fusion) // ======================================================================== diff --git a/native/ops/nn/nn.cu b/native/ops/nn/nn.cu index d2e63ac..598976c 100644 --- a/native/ops/nn/nn.cu +++ b/native/ops/nn/nn.cu @@ -286,6 +286,61 @@ GPUArray linear(const GPUArray& input, const GPUArray& weight, const GPUArray* b throw std::runtime_error("linear: not yet implemented - use matmul + bias_add separately for MVP"); } +// ============================================================================ +// Softmax +// ============================================================================ + +GPUArray softmax(const GPUArray& input) { + if (input.ndim() != 2) { + throw std::runtime_error("softmax expects 2D input [batch, features]"); + } + if (input.dtype() != DataType::Float32 && input.dtype() != 
DataType::Float64 && + input.dtype() != DataType::Float16 && input.dtype() != DataType::BFloat16) { + throw std::runtime_error("softmax only supports float types"); + } + + size_t batch_size = input.shape()[0]; + size_t features = input.shape()[1]; + + GPUArray result(input.shape(), input.dtype()); + + // One block per row + int block_size = std::min(256, (int)((features + 31) / 32 * 32)); + block_size = std::max(32, block_size); + + switch (input.dtype()) { + case DataType::Float32: + nn::softmax_f32_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + batch_size, features); + break; + case DataType::Float64: + nn::softmax_f64_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + batch_size, features); + break; + case DataType::Float16: + nn::softmax_f16_kernel<<>>( + static_cast(input.data()), + static_cast<__half*>(result.data()), + batch_size, features); + break; + case DataType::BFloat16: + nn::softmax_bf16_kernel<<>>( + static_cast(input.data()), + static_cast<__nv_bfloat16*>(result.data()), + batch_size, features); + break; + default: + break; + } + + sync_and_check("softmax kernel failed"); + return result; +} + // ============================================================================ // LayerNorm // ============================================================================ diff --git a/native/ops/nn/nn_kernels.cuh b/native/ops/nn/nn_kernels.cuh index fb1d459..aa30907 100644 --- a/native/ops/nn/nn_kernels.cuh +++ b/native/ops/nn/nn_kernels.cuh @@ -477,6 +477,328 @@ __global__ void layernorm_bf16_kernel(const __nv_bfloat16* __restrict__ input, } } +// ============================================================================ +// Softmax +// ============================================================================ + +// Softmax: y[i] = exp(x[i] - max(x)) / sum(exp(x - max(x))) +// Applied row-wise: input [batch, features] -> output [batch, features] +// Uses online softmax algorithm for numerical stability + +__global__ void softmax_f32_kernel(const float* __restrict__ input, + float* __restrict__ output, + size_t batch_size, + size_t features) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const float* row_input = input + row * features; + float* row_output = output + row * features; + + // Step 1: Find max for numerical stability + float max_val = -INFINITY; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + max_val = fmaxf(max_val, row_input[i]); + } + + // Warp-level reduction for max + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmaxf(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + + __shared__ float shared_max[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_max[warp_id] = max_val; + } + __syncthreads(); + + if (warp_id == 0) { + max_val = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
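+        // Added note: each warp staged its partial max in shared_max above; the first
+        // warp now reduces those partials, and lanes beyond the number of active warps
+        // contribute -INFINITY so they cannot win the max.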
shared_max[threadIdx.x] : -INFINITY; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmaxf(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + } + + __shared__ float row_max; + if (threadIdx.x == 0) { + row_max = max_val; + } + __syncthreads(); + + // Step 2: Compute exp(x - max) and sum + float sum = 0.0f; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float exp_val = expf(row_input[i] - row_max); + row_output[i] = exp_val; // Store temporarily + sum += exp_val; + } + + // Warp-level reduction for sum + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + + __shared__ float shared_sum[32]; + if (lane == 0) { + shared_sum[warp_id] = sum; + } + __syncthreads(); + + if (warp_id == 0) { + sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + } + + __shared__ float row_sum; + if (threadIdx.x == 0) { + row_sum = sum; + } + __syncthreads(); + + // Step 3: Normalize + float inv_sum = 1.0f / row_sum; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + row_output[i] *= inv_sum; + } +} + +__global__ void softmax_f64_kernel(const double* __restrict__ input, + double* __restrict__ output, + size_t batch_size, + size_t features) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const double* row_input = input + row * features; + double* row_output = output + row * features; + + double max_val = -INFINITY; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + max_val = fmax(max_val, row_input[i]); + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmax(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + + __shared__ double shared_max[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_max[warp_id] = max_val; + } + __syncthreads(); + + if (warp_id == 0) { + max_val = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_max[threadIdx.x] : -INFINITY; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmax(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + } + + __shared__ double row_max; + if (threadIdx.x == 0) { + row_max = max_val; + } + __syncthreads(); + + double sum = 0.0; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + double exp_val = exp(row_input[i] - row_max); + row_output[i] = exp_val; + sum += exp_val; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + + __shared__ double shared_sum[32]; + if (lane == 0) { + shared_sum[warp_id] = sum; + } + __syncthreads(); + + if (warp_id == 0) { + sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
shared_sum[threadIdx.x] : 0.0; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + } + + __shared__ double row_sum; + if (threadIdx.x == 0) { + row_sum = sum; + } + __syncthreads(); + + double inv_sum = 1.0 / row_sum; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + row_output[i] *= inv_sum; + } +} + +__global__ void softmax_f16_kernel(const __half* __restrict__ input, + __half* __restrict__ output, + size_t batch_size, + size_t features) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const __half* row_input = input + row * features; + __half* row_output = output + row * features; + + // Compute in FP32 for precision + float max_val = -INFINITY; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + max_val = fmaxf(max_val, __half2float(row_input[i])); + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmaxf(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + + __shared__ float shared_max[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_max[warp_id] = max_val; + } + __syncthreads(); + + if (warp_id == 0) { + max_val = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_max[threadIdx.x] : -INFINITY; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmaxf(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + } + + __shared__ float row_max; + if (threadIdx.x == 0) { + row_max = max_val; + } + __syncthreads(); + + float sum = 0.0f; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float exp_val = expf(__half2float(row_input[i]) - row_max); + row_output[i] = __float2half(exp_val); + sum += exp_val; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + + __shared__ float shared_sum[32]; + if (lane == 0) { + shared_sum[warp_id] = sum; + } + __syncthreads(); + + if (warp_id == 0) { + sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + } + + __shared__ float row_sum; + if (threadIdx.x == 0) { + row_sum = sum; + } + __syncthreads(); + + float inv_sum = 1.0f / row_sum; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + row_output[i] = __float2half(__half2float(row_output[i]) * inv_sum); + } +} + +__global__ void softmax_bf16_kernel(const __nv_bfloat16* __restrict__ input, + __nv_bfloat16* __restrict__ output, + size_t batch_size, + size_t features) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const __nv_bfloat16* row_input = input + row * features; + __nv_bfloat16* row_output = output + row * features; + + float max_val = -INFINITY; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + max_val = fmaxf(max_val, __bfloat162float(row_input[i])); + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmaxf(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + + __shared__ float shared_max[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_max[warp_id] = max_val; + } + __syncthreads(); + + if (warp_id == 0) { + max_val = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
shared_max[threadIdx.x] : -INFINITY; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_val = fmaxf(max_val, __shfl_down_sync(0xffffffff, max_val, offset)); + } + } + + __shared__ float row_max; + if (threadIdx.x == 0) { + row_max = max_val; + } + __syncthreads(); + + float sum = 0.0f; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float exp_val = expf(__bfloat162float(row_input[i]) - row_max); + row_output[i] = __float2bfloat16(exp_val); + sum += exp_val; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + + __shared__ float shared_sum[32]; + if (lane == 0) { + shared_sum[warp_id] = sum; + } + __syncthreads(); + + if (warp_id == 0) { + sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + } + + __shared__ float row_sum; + if (threadIdx.x == 0) { + row_sum = sum; + } + __syncthreads(); + + float inv_sum = 1.0f / row_sum; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + row_output[i] = __float2bfloat16(__bfloat162float(row_output[i]) * inv_sum); + } +} + // ============================================================================ // Matrix Transpose // ============================================================================ diff --git a/native/ops/ops.cuh b/native/ops/ops.cuh index 88d3f53..c851702 100644 --- a/native/ops/ops.cuh +++ b/native/ops/ops.cuh @@ -98,6 +98,10 @@ void bias_add_inplace(GPUArray& output, const GPUArray& bias); // input: [batch, features], gamma/beta: [features] GPUArray layernorm(const GPUArray& input, const GPUArray& gamma, const GPUArray& beta, float eps = 1e-5f); +// Softmax: y[i] = exp(x[i] - max(x)) / sum(exp(x - max(x))) +// Applied row-wise: input [batch, features] -> output [batch, features] +GPUArray softmax(const GPUArray& input); + // ============================================================================ // Fused Operations (CUTLASS Epilogue Fusion) // ============================================================================ diff --git a/pyproject.toml b/pyproject.toml index f24dcbd..e669ab9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "PyGPUkit" -version = "0.2.8" +version = "0.2.9" description = "A lightweight GPU runtime for Python with Rust-powered scheduler, NVRTC JIT compilation, and NumPy-like API" readme = "README.md" license = "MIT" diff --git a/src/pygpukit/__init__.py b/src/pygpukit/__init__.py index 9321e23..4c24e6e 100644 --- a/src/pygpukit/__init__.py +++ b/src/pygpukit/__init__.py @@ -1,6 +1,6 @@ """PyGPUkit - A lightweight GPU runtime for Python.""" -__version__ = "0.2.6" +__version__ = "0.2.9" # LLM support (safetensors loader) from pygpukit import llm, ops @@ -43,6 +43,7 @@ mean, mul, relu, + softmax, sub, sum, transpose, @@ -108,6 +109,7 @@ "log", "relu", "gelu", + "softmax", "layernorm", "matmul", "transpose", diff --git a/src/pygpukit/llm/__init__.py b/src/pygpukit/llm/__init__.py index 92e603d..7629968 100644 --- a/src/pygpukit/llm/__init__.py +++ b/src/pygpukit/llm/__init__.py @@ -331,6 +331,7 @@ def __repr__(self) -> str: from pygpukit.llm.model import ( # noqa: E402 MLP, + CausalSelfAttention, GPT2Config, GPT2Model, LayerNorm, @@ -350,6 +351,7 @@ def __repr__(self) -> str: # Model components "GPT2Config", "GPT2Model", + "CausalSelfAttention", "LayerNorm", 
"Linear", "MLP", diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py index cb7dd14..f2e5e57 100644 --- a/src/pygpukit/llm/model.py +++ b/src/pygpukit/llm/model.py @@ -169,54 +169,165 @@ def __call__(self, x: GPUArray) -> GPUArray: return layernorm(x, self.weight, self.bias, self.eps) +class CausalSelfAttention: + """Causal self-attention for GPT-2. + + Structure: + - c_attn: [n_embd] -> [3*n_embd] (Q, K, V projection) + - Split into n_head heads + - Q @ K^T / sqrt(d_k) with causal mask + - Softmax + - Attention @ V + - c_proj: [n_embd] -> [n_embd] + """ + + def __init__( + self, + c_attn_weight: GPUArray, + c_attn_bias: GPUArray | None, + c_proj_weight: GPUArray, + c_proj_bias: GPUArray | None, + n_head: int, + n_embd: int, + ): + """Initialize CausalSelfAttention. + + Args: + c_attn_weight: QKV projection weight [3*n_embd, n_embd] + c_attn_bias: QKV projection bias [3*n_embd] + c_proj_weight: Output projection weight [n_embd, n_embd] + c_proj_bias: Output projection bias [n_embd] + n_head: Number of attention heads + n_embd: Embedding dimension + """ + self.c_attn = Linear(c_attn_weight, c_attn_bias) + self.c_proj = Linear(c_proj_weight, c_proj_bias) + self.n_head = n_head + self.n_embd = n_embd + self.head_dim = n_embd // n_head + + def __call__(self, x: GPUArray) -> GPUArray: + """Forward pass with causal self-attention. + + Args: + x: Input tensor [seq_len, n_embd] + + Returns: + Output tensor [seq_len, n_embd] + """ + import numpy as np + + seq_len = x.shape[0] + + # QKV projection: [seq_len, n_embd] -> [seq_len, 3*n_embd] + qkv = self.c_attn(x) + qkv_np = qkv.to_numpy() + + # Split into Q, K, V: each [seq_len, n_embd] + q_np = qkv_np[:, :self.n_embd] + k_np = qkv_np[:, self.n_embd:2*self.n_embd] + v_np = qkv_np[:, 2*self.n_embd:] + + # Reshape for multi-head: [seq_len, n_head, head_dim] + q_np = q_np.reshape(seq_len, self.n_head, self.head_dim) + k_np = k_np.reshape(seq_len, self.n_head, self.head_dim) + v_np = v_np.reshape(seq_len, self.n_head, self.head_dim) + + # Transpose to [n_head, seq_len, head_dim] for batched attention + q_np = q_np.transpose(1, 0, 2) + k_np = k_np.transpose(1, 0, 2) + v_np = v_np.transpose(1, 0, 2) + + # Compute attention scores: [n_head, seq_len, seq_len] + scale = 1.0 / np.sqrt(self.head_dim) + # Q @ K^T: [n_head, seq_len, head_dim] @ [n_head, head_dim, seq_len] + attn_scores = np.matmul(q_np, k_np.transpose(0, 2, 1)) * scale + + # Apply causal mask (lower triangular) + causal_mask = np.triu(np.ones((seq_len, seq_len)), k=1).astype(bool) + attn_scores[:, causal_mask] = -1e9 + + # Softmax over last dimension + attn_scores_max = attn_scores.max(axis=-1, keepdims=True) + attn_exp = np.exp(attn_scores - attn_scores_max) + attn_weights = attn_exp / attn_exp.sum(axis=-1, keepdims=True) + + # Attention @ V: [n_head, seq_len, head_dim] + attn_output = np.matmul(attn_weights, v_np) + + # Transpose back: [seq_len, n_head, head_dim] + attn_output = attn_output.transpose(1, 0, 2) + + # Reshape to [seq_len, n_embd] + attn_output = attn_output.reshape(seq_len, self.n_embd) + + # Output projection + out = from_numpy(attn_output.astype(np.float32)) + out = self.c_proj(out) + + return out + + class TransformerBlock: - """Transformer block (MLP only, no attention for MVP). + """Full transformer block with attention and MLP. 
- Structure: ln -> mlp -> residual + Structure: ln_1 -> attention -> residual -> ln_2 -> mlp -> residual """ def __init__( self, - ln_weight: GPUArray, - ln_bias: GPUArray, + ln_1_weight: GPUArray, + ln_1_bias: GPUArray, + attn: CausalSelfAttention | None, + ln_2_weight: GPUArray, + ln_2_bias: GPUArray, mlp: MLP, eps: float = 1e-5, ): """Initialize TransformerBlock. Args: - ln_weight: LayerNorm weight [n_embd] - ln_bias: LayerNorm bias [n_embd] + ln_1_weight: First LayerNorm weight [n_embd] + ln_1_bias: First LayerNorm bias [n_embd] + attn: CausalSelfAttention module (None for MLP-only mode) + ln_2_weight: Second LayerNorm weight [n_embd] + ln_2_bias: Second LayerNorm bias [n_embd] mlp: MLP block eps: LayerNorm epsilon """ - self.ln = LayerNorm(ln_weight, ln_bias, eps) + self.ln_1 = LayerNorm(ln_1_weight, ln_1_bias, eps) + self.attn = attn + self.ln_2 = LayerNorm(ln_2_weight, ln_2_bias, eps) self.mlp = mlp def __call__(self, x: GPUArray) -> GPUArray: - """Forward pass: ln -> mlp -> residual + """Forward pass: ln_1 -> attn -> residual -> ln_2 -> mlp -> residual Args: - x: Input tensor [batch, n_embd] + x: Input tensor [seq_len, n_embd] Returns: - Output tensor [batch, n_embd] + Output tensor [seq_len, n_embd] """ - # LayerNorm - h = self.ln(x) - # MLP + # Attention block (if available) + if self.attn is not None: + h = self.ln_1(x) + h = self.attn(h) + x = add(x, h) + + # MLP block + h = self.ln_2(x) h = self.mlp(h) - # Residual return add(x, h) class GPT2Model: - """GPT-2 model (MLP only, no attention for MVP). + """GPT-2 model with full transformer blocks. Structure: - Token embedding - Position embedding - - Transformer blocks (MLP only) + - Transformer blocks (attention + MLP) - Final LayerNorm - LM head (tied to embedding) """ @@ -351,15 +462,14 @@ def generate( def load_gpt2_from_safetensors( model_path: str, config: GPT2Config | None = None, + load_attention: bool = True, ) -> GPT2Model: """Load GPT-2 model from safetensors file. - Note: This is an MVP that only loads MLP weights (no attention). - The model will not produce coherent text without attention. 
- Args: model_path: Path to model.safetensors file config: Model configuration (defaults to GPT-2 small) + load_attention: Whether to load attention weights (default: True) Returns: GPT2Model instance @@ -391,6 +501,12 @@ def load_tensor(name: str) -> GPUArray: arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) return from_numpy(arr.copy()) + def try_load_tensor(name: str) -> GPUArray | None: + """Try to load tensor, return None if not found.""" + if name in st.tensor_names: + return load_tensor(name) + return None + # Load embeddings wte = load_tensor("wte.weight") wpe = load_tensor("wpe.weight") @@ -406,18 +522,42 @@ def load_tensor(name: str) -> GPUArray: # Skip blocks without MLP (shouldn't happen for GPT-2) continue - # LayerNorm 2 (before MLP in GPT-2) + # LayerNorm 1 (before attention) + ln_1_w = load_tensor(f"{prefix}ln_1.weight") + ln_1_b = load_tensor(f"{prefix}ln_1.bias") + + # Attention (optional) + attn = None + if load_attention: + attn_c_attn_w_name = f"{prefix}attn.c_attn.weight" + if attn_c_attn_w_name in st.tensor_names: + attn_c_attn_w = load_tensor(attn_c_attn_w_name) + attn_c_attn_b = try_load_tensor(f"{prefix}attn.c_attn.bias") + attn_c_proj_w = load_tensor(f"{prefix}attn.c_proj.weight") + attn_c_proj_b = try_load_tensor(f"{prefix}attn.c_proj.bias") + + attn = CausalSelfAttention( + attn_c_attn_w, attn_c_attn_b, + attn_c_proj_w, attn_c_proj_b, + config.n_head, config.n_embd + ) + + # LayerNorm 2 (before MLP) ln_2_w = load_tensor(f"{prefix}ln_2.weight") ln_2_b = load_tensor(f"{prefix}ln_2.bias") # MLP mlp_c_fc_w = load_tensor(f"{prefix}mlp.c_fc.weight") - mlp_c_fc_b = load_tensor(f"{prefix}mlp.c_fc.bias") + mlp_c_fc_b = try_load_tensor(f"{prefix}mlp.c_fc.bias") mlp_c_proj_w = load_tensor(f"{prefix}mlp.c_proj.weight") - mlp_c_proj_b = load_tensor(f"{prefix}mlp.c_proj.bias") + mlp_c_proj_b = try_load_tensor(f"{prefix}mlp.c_proj.bias") mlp = MLP(mlp_c_fc_w, mlp_c_fc_b, mlp_c_proj_w, mlp_c_proj_b) - block = TransformerBlock(ln_2_w, ln_2_b, mlp, config.layer_norm_eps) + block = TransformerBlock( + ln_1_w, ln_1_b, attn, + ln_2_w, ln_2_b, mlp, + config.layer_norm_eps + ) blocks.append(block) # Final LayerNorm diff --git a/src/pygpukit/ops/__init__.py b/src/pygpukit/ops/__init__.py index fc32be0..6fd6de4 100644 --- a/src/pygpukit/ops/__init__.py +++ b/src/pygpukit/ops/__init__.py @@ -14,6 +14,7 @@ mean, mul, relu, + softmax, sub, sum, transpose, @@ -28,6 +29,7 @@ "log", "relu", "gelu", + "softmax", "layernorm", "matmul", "sum", diff --git a/src/pygpukit/ops/basic.py b/src/pygpukit/ops/basic.py index ff6ef50..c82483a 100644 --- a/src/pygpukit/ops/basic.py +++ b/src/pygpukit/ops/basic.py @@ -674,6 +674,52 @@ def _layernorm_native( return GPUArray._wrap_native(c_native) +def softmax(input: GPUArray) -> GPUArray: + """Softmax activation applied row-wise. + + Computes: y[i] = exp(x[i] - max(x)) / sum(exp(x - max(x))) + + Args: + input: Input array of shape [batch, features]. + + Returns: + A new GPUArray containing the softmax output. + + Raises: + ValueError: If input is not 2D or dtype is not a float type. 
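+
+    Example:
+        Illustrative sketch added for clarity; it uses only this module's own
+        ``from_numpy``/``to_numpy`` helpers on a small float32 input::
+
+            x = from_numpy(np.array([[1.0, 2.0, 3.0]], dtype=np.float32))
+            y = softmax(x)
+            # each row of y.to_numpy() sums to 1.0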
+ """ + _validate_float_dtype(input, "softmax") + + if input.ndim != 2: + raise ValueError(f"softmax expects 2D input [batch, features], got {input.ndim}D") + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _softmax_native(input) + else: + return _softmax_cpu(input) + + +def _softmax_cpu(input: GPUArray) -> GPUArray: + """CPU implementation of softmax.""" + x = input.to_numpy() + # Numerical stability: subtract max + x_max = x.max(axis=1, keepdims=True) + exp_x = np.exp(x - x_max) + return from_numpy(exp_x / exp_x.sum(axis=1, keepdims=True)) + + +def _softmax_native(input: GPUArray) -> GPUArray: + """Native C++ CUDA implementation of softmax (zero-copy).""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + input_native = input._get_native() + c_native = native.softmax(input_native) + return GPUArray._wrap_native(c_native) + + def transpose(a: GPUArray) -> GPUArray: """Matrix transpose. From bf8e1237120b4596f1ae61e00e935b609f89dd24 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 11:33:35 +0900 Subject: [PATCH 03/14] feat(v0.2.9): Unified Transformer + Hybrid Attention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Unify GPT-2 and LLaMA into common Transformer abstraction - TransformerConfig: vocab_size, hidden_size, num_layers, num_heads, num_kv_heads, norm_type, activation, use_rope, causal - CausalTransformerModel with generate() method - Attention, MLP, Norm, TransformerBlock classes - Legacy aliases preserved for backward compatibility - Hybrid Attention execution - GPU SDPA for prefill (seq_len > 1) - CPU numpy for decode (seq_len = 1) to minimize kernel overhead - New GPU tensor ops (CUDA kernels) - concat_axis0, repeat_interleave_axis1, transpose_3d_021, reshape_copy - Required for GQA KV head expansion - Add E2E demo (examples/demo_llm_e2e.py) Benchmark (RTX 3090 Ti): GPT-2 (124M): 11.2 tok/s decode, 89.6 ms/token TinyLlama (1.1B): 5.3 tok/s decode, 188.2 ms/token 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- examples/demo_llm_e2e.py | 368 +++++++++ native/bindings/ops_bindings.cpp | 61 ++ native/ops/nn/nn.cu | 390 +++++++++ native/ops/nn/nn_kernels.cuh | 778 ++++++++++++++++++ native/ops/ops.cuh | 40 + src/pygpukit/core/backend.py | 2 +- src/pygpukit/llm/__init__.py | 41 +- src/pygpukit/llm/model.py | 1289 ++++++++++++++++++++++-------- src/pygpukit/ops/__init__.py | 16 + src/pygpukit/ops/basic.py | 530 ++++++++++++ 10 files changed, 3157 insertions(+), 358 deletions(-) create mode 100644 examples/demo_llm_e2e.py diff --git a/examples/demo_llm_e2e.py b/examples/demo_llm_e2e.py new file mode 100644 index 0000000..04d6c0b --- /dev/null +++ b/examples/demo_llm_e2e.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 +""" +PyGPUkit v0.2.9 - End-to-End LLM Inference Demo + +Demonstrates the unified Transformer implementation with: +- Model loading from safetensors (GPT-2 or LLaMA) +- Tokenization +- KV-cache enabled autoregressive generation +- Hybrid Attention: CPU for decode (seq_len=1), GPU for prefill +- Performance benchmarking (prefill + decode) + +Usage: + python demo_llm_e2e.py --model /path/to/model.safetensors --tokenizer /path/to/tokenizer.json + +Supported models: + - LLaMA 2/3 (any size in safetensors format) + - GPT-2 (safetensors format) +""" + +from __future__ import annotations + +import argparse +import time +from pathlib import Path + + +def section(title: str) -> None: 
+ """Print section header.""" + print() + print("=" * 70) + print(f" {title}") + print("=" * 70) + + +def format_time(ms: float) -> str: + """Format time in appropriate units.""" + if ms < 1: + return f"{ms * 1000:.2f} us" + elif ms < 1000: + return f"{ms:.2f} ms" + else: + return f"{ms / 1000:.2f} s" + + +def main(): + parser = argparse.ArgumentParser(description="PyGPUkit E2E LLM Demo") + parser.add_argument( + "--model", + type=str, + required=True, + help="Path to model.safetensors", + ) + parser.add_argument( + "--tokenizer", + type=str, + required=True, + help="Path to tokenizer.json", + ) + parser.add_argument( + "--prompt", + type=str, + default="The quick brown fox", + help="Input prompt for generation", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=32, + help="Maximum tokens to generate", + ) + parser.add_argument( + "--temperature", + type=float, + default=0.7, + help="Sampling temperature", + ) + parser.add_argument( + "--top-k", + type=int, + default=50, + help="Top-k sampling", + ) + parser.add_argument( + "--top-p", + type=float, + default=0.9, + help="Top-p (nucleus) sampling", + ) + parser.add_argument( + "--model-type", + type=str, + choices=["auto", "llama", "gpt2"], + default="auto", + help="Model type (auto-detect by default)", + ) + parser.add_argument( + "--benchmark-only", + action="store_true", + help="Run benchmark without text generation", + ) + args = parser.parse_args() + + print("=" * 70) + print(" PyGPUkit v0.2.9 - End-to-End LLM Inference Demo") + print("=" * 70) + + # Check paths + model_path = Path(args.model) + tokenizer_path = Path(args.tokenizer) + + if not model_path.exists(): + print(f"\nError: Model not found: {model_path}") + return 1 + + if not tokenizer_path.exists(): + print(f"\nError: Tokenizer not found: {tokenizer_path}") + return 1 + + # Import PyGPUkit + try: + import pygpukit as gpk + from pygpukit.llm import ( + SafeTensorsFile, + Tokenizer, + load_gpt2_from_safetensors, + load_llama_from_safetensors, + ) + + print("\nPyGPUkit loaded successfully") + print(f" CUDA available: {gpk.is_cuda_available()}") + except ImportError as e: + print(f"\nError importing PyGPUkit: {e}") + return 1 + + # ========================================================================= + # Load Tokenizer + # ========================================================================= + section("Loading Tokenizer") + + start = time.perf_counter() + tokenizer = Tokenizer(str(tokenizer_path)) + tokenizer_time = (time.perf_counter() - start) * 1000 + + print(f" Path: {tokenizer_path}") + print(f" Vocab size: {tokenizer.vocab_size:,}") + print(f" BOS token: {tokenizer.bos_token_id}") + print(f" EOS token: {tokenizer.eos_token_id}") + print(f" Load time: {format_time(tokenizer_time)}") + + # ========================================================================= + # Detect Model Type + # ========================================================================= + section("Detecting Model Type") + + st = SafeTensorsFile(str(model_path)) + tensor_names = st.tensor_names + + if args.model_type == "auto": + if "model.embed_tokens.weight" in tensor_names: + model_type = "llama" + elif "wte.weight" in tensor_names: + model_type = "gpt2" + else: + print(" Error: Could not auto-detect model type") + print(f" First 10 tensor names: {tensor_names[:10]}") + return 1 + else: + model_type = args.model_type + + print(f" Model type: {model_type.upper()}") + print(f" Total tensors: {len(tensor_names)}") + print(f" File size: {st.file_size / 1024 / 1024:.1f} MB") + + 
# ========================================================================= + # Load Model + # ========================================================================= + section("Loading Model") + + start = time.perf_counter() + if model_type == "llama": + model = load_llama_from_safetensors(str(model_path)) + else: + model = load_gpt2_from_safetensors(str(model_path)) + load_time = (time.perf_counter() - start) * 1000 + + config = model.config + print(f" Architecture: {model_type.upper()}") + print(f" Hidden size: {config.hidden_size}") + print(f" Num layers: {config.num_layers}") + print(f" Num heads: {config.num_heads}") + print(f" Num KV heads: {config.num_kv_heads}") + print(f" Head dim: {config.head_dim}") + print(f" Intermediate size: {config.intermediate_size}") + print(f" Vocab size: {config.vocab_size}") + print(f" Norm type: {config.norm_type}") + print(f" Activation: {config.activation}") + print(f" Use RoPE: {config.use_rope}") + print(f" Load time: {format_time(load_time)}") + + # Estimate model size + params = ( + config.vocab_size * config.hidden_size # embed_tokens + + config.num_layers + * ( + 4 * config.hidden_size * config.hidden_size # Q, K, V, O projections + + 3 * config.hidden_size * config.intermediate_size # gate, up, down + + 2 * config.hidden_size # norms + ) + + config.hidden_size # final norm + ) + print(f" Estimated params: {params / 1e9:.2f}B") + + # ========================================================================= + # Tokenize Prompt + # ========================================================================= + section("Tokenizing Prompt") + + prompt = args.prompt + input_ids = tokenizer.encode(prompt) + + print(f' Prompt: "{prompt}"') + print(f" Token IDs: {input_ids}") + print(f" Num tokens: {len(input_ids)}") + + # Show token breakdown + print(" Tokens: ", end="") + for tid in input_ids[:10]: + token_str = tokenizer.id_to_token(tid) + # Use ASCII-safe representation to avoid encoding issues + safe_repr = repr(token_str).encode("ascii", "replace").decode("ascii") + print(f"[{tid}:{safe_repr}] ", end="") + if len(input_ids) > 10: + print("...") + else: + print() + + # ========================================================================= + # Benchmark: Prefill + Decode + # ========================================================================= + section("Benchmark: Prefill + Decode") + + # Warmup + print(" Warming up...") + _ = model.generate(input_ids[:4], max_new_tokens=2, temperature=0.0, use_cache=True) + + # Prefill benchmark (various sequence lengths) + print("\n Prefill Performance (GPU SDPA):") + for seq_len in [8, 16, 32, 64, 128]: + # Create test sequence + test_ids = (input_ids * ((seq_len // len(input_ids)) + 1))[:seq_len] + + # Time prefill only + times = [] + for _ in range(3): + start = time.perf_counter() + hidden, past_kv = model(test_ids, use_cache=True) + elapsed = (time.perf_counter() - start) * 1000 + times.append(elapsed) + + avg_time = sum(times) / len(times) + tokens_per_sec = seq_len / (avg_time / 1000) + print(f" seq_len={seq_len:3d}: {avg_time:6.2f} ms ({tokens_per_sec:,.0f} tok/s)") + + # Decode benchmark (single token) + print("\n Decode Performance (Hybrid CPU/GPU):") + + # First do a prefill to get KV cache + hidden, past_kv = model(input_ids, use_cache=True) + logits = model.get_logits(hidden) + logits_np = logits.to_numpy() + next_token = int(logits_np[-1].argmax()) + + # Time single-token decode + decode_times = [] + for _ in range(10): + start = time.perf_counter() + hidden, past_kv = model([next_token], 
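+        # Added note: decode feeds only the newly sampled token; past_key_values
+        # carries the cached K/V so the prompt is not recomputed at each step.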
past_key_values=past_kv, use_cache=True) + _ = model.get_logits(hidden) + elapsed = (time.perf_counter() - start) * 1000 + decode_times.append(elapsed) + + avg_decode = sum(decode_times) / len(decode_times) + min_decode = min(decode_times) + max_decode = max(decode_times) + tokens_per_sec = 1000 / avg_decode + + print( + f" Single token decode: {avg_decode:.2f} ms (min={min_decode:.2f}, max={max_decode:.2f})" + ) + print(f" Decode throughput: {tokens_per_sec:.1f} tok/s") + + # Per-layer estimate + per_layer = avg_decode / config.num_layers + print(f" Per-layer time: {per_layer:.2f} ms") + + if args.benchmark_only: + section("Benchmark Complete") + return 0 + + # ========================================================================= + # Text Generation + # ========================================================================= + section("Text Generation") + + print(f' Prompt: "{prompt}"') + print(f" Max new tokens: {args.max_tokens}") + print(f" Temperature: {args.temperature}") + print(f" Top-k: {args.top_k}") + print(f" Top-p: {args.top_p}") + print() + + # Generate with timing + start = time.perf_counter() + output_ids = model.generate( + input_ids, + max_new_tokens=args.max_tokens, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + eos_token_id=tokenizer.eos_token_id, + use_cache=True, + ) + total_time = (time.perf_counter() - start) * 1000 + + # Decode output + output_text = tokenizer.decode(output_ids) + new_tokens = len(output_ids) - len(input_ids) + + print(" Generated text:") + print(f" {'-' * 60}") + print(f" {output_text}") + print(f" {'-' * 60}") + print() + print(f" Input tokens: {len(input_ids)}") + print(f" Output tokens: {len(output_ids)}") + print(f" New tokens: {new_tokens}") + print(f" Total time: {format_time(total_time)}") + print(f" Throughput: {new_tokens / (total_time / 1000):.1f} tok/s") + + # ========================================================================= + # Summary + # ========================================================================= + section("Summary") + + print(f" Model: {model_type.upper()} ({config.num_layers}L, {config.hidden_size}H)") + print(" Prefill: GPU SDPA with causal mask") + print(" Decode: Hybrid (CPU for seq_len=1, GPU otherwise)") + print(" KV Cache: numpy (CPU)") + print() + print(" Performance:") + print(f" Prefill (64 tokens): ~{sum(times) / len(times):.1f} ms") + print(f" Decode (per token): ~{avg_decode:.1f} ms") + print(f" Generation: {new_tokens / (total_time / 1000):.1f} tok/s") + + print() + print(" PyGPUkit v0.2.9 Features Used:") + print(" - Unified TransformerConfig") + print(" - CausalTransformerModel with generate()") + print(" - Hybrid Attention (CPU decode / GPU prefill)") + print(" - GPU: RMSNorm, SDPA, SiLU, RoPE, matmul") + print(" - SafeTensors loading with BFloat16 support") + print(" - Rust-based tokenizer") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/native/bindings/ops_bindings.cpp b/native/bindings/ops_bindings.cpp index e52c108..9b9786d 100644 --- a/native/bindings/ops_bindings.cpp +++ b/native/bindings/ops_bindings.cpp @@ -145,6 +145,13 @@ void init_ops_bindings(py::module_& m) { "Softmax: y[i] = exp(x[i] - max(x)) / sum(exp(x - max(x)))\n" "Applied row-wise: input [batch, features] -> output [batch, features]"); + // RMSNorm + m.def("rmsnorm", &ops::rmsnorm, + py::arg("input"), py::arg("gamma"), py::arg("eps") = 1e-5f, + "RMS normalization: x / sqrt(mean(x^2) + eps) * gamma\n" + "Simpler than LayerNorm (no mean subtraction, no beta)\n" + 
"input: [batch, features], gamma: [features]"); + // ======================================================================== // Fused Operations (CUTLASS Epilogue Fusion) // ======================================================================== @@ -155,4 +162,58 @@ void init_ops_bindings(py::module_& m) { "Fused linear + bias + GELU: output = gelu(input @ weight^T + bias)\n" "Uses CUTLASS TensorCore epilogue fusion for efficiency.\n" "input: [batch, in_features], weight: [out_features, in_features], bias: [out_features]"); + + // ======================================================================== + // Additional Neural Network Operations + // ======================================================================== + + // SiLU (Swish) activation + m.def("silu", &ops::silu, + py::arg("input"), + "SiLU (Swish) activation: y = x * sigmoid(x)"); + + // RoPE (Rotary Position Embedding) - In-place + m.def("rope_inplace", &ops::rope_inplace, + py::arg("q"), py::arg("k"), py::arg("cos"), py::arg("sin"), + "Apply RoPE to Q and K tensors in-place.\n" + "q: [seq_len, n_heads_q, head_dim]\n" + "k: [seq_len, n_heads_k, head_dim]\n" + "cos, sin: [seq_len, head_dim]"); + + // Scaled Dot-Product Attention with Causal Mask + m.def("sdpa_causal", &ops::sdpa_causal, + py::arg("Q"), py::arg("K"), py::arg("V"), py::arg("scale") = 0.0f, + "Scaled Dot-Product Attention with causal mask.\n" + "Q: [n_heads, q_len, head_dim]\n" + "K: [n_heads, kv_len, head_dim]\n" + "V: [n_heads, kv_len, head_dim]\n" + "Output: [n_heads, q_len, head_dim]\n" + "scale: 1/sqrt(head_dim), auto-computed if <= 0"); + + // ======================================================================== + // Tensor Manipulation Operations + // ======================================================================== + + // Concat along axis 0 + m.def("concat_axis0", &ops::concat_axis0, + py::arg("a"), py::arg("b"), + "Concat two tensors along axis 0.\n" + "a: [dim0_a, ...], b: [dim0_b, ...]\n" + "Output: [dim0_a + dim0_b, ...]"); + + // Repeat interleave along axis 1 (for GQA) + m.def("repeat_interleave_axis1", &ops::repeat_interleave_axis1, + py::arg("input"), py::arg("repeats"), + "Repeat tensor along axis 1 (interleaved).\n" + "input: [dim0, dim1, dim2] -> output: [dim0, dim1 * repeats, dim2]"); + + // Transpose 3D: [d0, d1, d2] -> [d1, d0, d2] + m.def("transpose_3d_021", &ops::transpose_3d_021, + py::arg("input"), + "Transpose 3D tensor: [d0, d1, d2] -> [d1, d0, d2]"); + + // Reshape with copy + m.def("reshape_copy", &ops::reshape_copy, + py::arg("input"), py::arg("new_shape"), + "Reshape tensor with copy (ensures contiguous output)."); } diff --git a/native/ops/nn/nn.cu b/native/ops/nn/nn.cu index 598976c..9d3f74e 100644 --- a/native/ops/nn/nn.cu +++ b/native/ops/nn/nn.cu @@ -414,5 +414,395 @@ GPUArray layernorm(const GPUArray& input, const GPUArray& gamma, const GPUArray& return result; } +// ============================================================================ +// RMSNorm (Root Mean Square Normalization) +// ============================================================================ + +GPUArray rmsnorm(const GPUArray& input, const GPUArray& gamma, float eps) { + // input: [batch, features] + // gamma: [features] + + if (input.ndim() != 2) { + throw std::runtime_error("rmsnorm expects 2D input [batch, features]"); + } + if (gamma.ndim() != 1) { + throw std::runtime_error("rmsnorm expects 1D gamma"); + } + if (input.dtype() != gamma.dtype()) { + throw std::runtime_error("rmsnorm: dtype mismatch"); + } + + size_t batch_size = 
input.shape()[0]; + size_t features = input.shape()[1]; + + if (gamma.shape()[0] != features) { + throw std::runtime_error("rmsnorm: gamma size must match features"); + } + + GPUArray result(input.shape(), input.dtype()); + + // One block per row, use enough threads to cover features + int block_size = std::min(256, (int)((features + 31) / 32 * 32)); + block_size = std::max(32, block_size); + + switch (input.dtype()) { + case DataType::Float32: + nn::rmsnorm_f32_kernel<<>>( + static_cast(input.data()), + static_cast(gamma.data()), + static_cast(result.data()), + batch_size, features, eps); + break; + case DataType::Float64: + nn::rmsnorm_f64_kernel<<>>( + static_cast(input.data()), + static_cast(gamma.data()), + static_cast(result.data()), + batch_size, features, (double)eps); + break; + case DataType::Float16: + nn::rmsnorm_f16_kernel<<>>( + static_cast(input.data()), + static_cast(gamma.data()), + static_cast<__half*>(result.data()), + batch_size, features, eps); + break; + case DataType::BFloat16: + nn::rmsnorm_bf16_kernel<<>>( + static_cast(input.data()), + static_cast(gamma.data()), + static_cast<__nv_bfloat16*>(result.data()), + batch_size, features, eps); + break; + default: + throw std::runtime_error("rmsnorm only supports float types"); + } + + sync_and_check("rmsnorm kernel failed"); + return result; +} + +// ============================================================================ +// RoPE (Rotary Position Embedding) - In-place +// ============================================================================ + +void rope_inplace(GPUArray& q, GPUArray& k, const GPUArray& cos, const GPUArray& sin) { + // q: [seq_len, n_heads_q, head_dim] + // k: [seq_len, n_heads_k, head_dim] + // cos, sin: [seq_len, head_dim] + + if (q.ndim() != 3 || k.ndim() != 3 || cos.ndim() != 2 || sin.ndim() != 2) { + throw std::runtime_error("rope: invalid dimensions"); + } + if (q.dtype() != DataType::Float32 || k.dtype() != DataType::Float32) { + throw std::runtime_error("rope: only float32 supported"); + } + + int seq_len = q.shape()[0]; + int n_heads_q = q.shape()[1]; + int n_heads_k = k.shape()[1]; + int head_dim = q.shape()[2]; + + if (k.shape()[0] != seq_len || k.shape()[2] != head_dim) { + throw std::runtime_error("rope: q and k shape mismatch"); + } + if (cos.shape()[0] != seq_len || cos.shape()[1] != head_dim) { + throw std::runtime_error("rope: cos shape mismatch"); + } + if (sin.shape()[0] != seq_len || sin.shape()[1] != head_dim) { + throw std::runtime_error("rope: sin shape mismatch"); + } + + // Total work items: max of Q and K + int half_dim = head_dim / 2; + int total_q = seq_len * n_heads_q * half_dim; + int total_k = seq_len * n_heads_k * half_dim; + int total_work = std::max(total_q, total_k); + + const int block_size = 256; + const int grid_size = (total_work + block_size - 1) / block_size; + + nn::rope_f32_kernel<<>>( + static_cast(q.data()), + static_cast(k.data()), + static_cast(cos.data()), + static_cast(sin.data()), + seq_len, n_heads_q, n_heads_k, head_dim); + + sync_and_check("rope kernel failed"); +} + +// ============================================================================ +// SiLU (Swish) Activation: x * sigmoid(x) +// ============================================================================ + +GPUArray silu(const GPUArray& input) { + if (input.dtype() != DataType::Float32 && input.dtype() != DataType::Float64 && + input.dtype() != DataType::Float16 && input.dtype() != DataType::BFloat16) { + throw std::runtime_error("silu only supports float types"); + } + + 
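+    // SiLU is elementwise: y = x * sigmoid(x) = x / (1 + exp(-x)); the kernels
+    // below are launched on a flat 1-D grid sized to cover every element.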
GPUArray result(input.shape(), input.dtype()); + size_t n = input.size(); + + const int block_size = 256; + const int grid_size = (n + block_size - 1) / block_size; + + switch (input.dtype()) { + case DataType::Float32: + nn::silu_f32_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + n); + break; + case DataType::Float64: + nn::silu_f64_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + n); + break; + case DataType::Float16: + nn::silu_f16_kernel<<>>( + static_cast(input.data()), + static_cast<__half*>(result.data()), + n); + break; + case DataType::BFloat16: + nn::silu_bf16_kernel<<>>( + static_cast(input.data()), + static_cast<__nv_bfloat16*>(result.data()), + n); + break; + default: + break; + } + + sync_and_check("silu kernel failed"); + return result; +} + +// ============================================================================ +// Scaled Dot-Product Attention (SDPA) with Causal Mask +// ============================================================================ + +GPUArray sdpa_causal(const GPUArray& Q, const GPUArray& K, const GPUArray& V, float scale) { + // Q: [n_heads, q_len, head_dim] + // K: [n_heads, kv_len, head_dim] + // V: [n_heads, kv_len, head_dim] + // Output: [n_heads, q_len, head_dim] + + if (Q.ndim() != 3 || K.ndim() != 3 || V.ndim() != 3) { + throw std::runtime_error("sdpa expects 3D inputs [n_heads, seq_len, head_dim]"); + } + if (Q.dtype() != K.dtype() || Q.dtype() != V.dtype()) { + throw std::runtime_error("sdpa: dtype mismatch"); + } + + int n_heads = Q.shape()[0]; + int q_len = Q.shape()[1]; + int head_dim = Q.shape()[2]; + int kv_len = K.shape()[1]; + + if (K.shape()[0] != n_heads || V.shape()[0] != n_heads) { + throw std::runtime_error("sdpa: n_heads mismatch"); + } + if (K.shape()[2] != head_dim || V.shape()[2] != head_dim) { + throw std::runtime_error("sdpa: head_dim mismatch"); + } + if (K.shape()[1] != V.shape()[1]) { + throw std::runtime_error("sdpa: K and V seq_len mismatch"); + } + + GPUArray result({(size_t)n_heads, (size_t)q_len, (size_t)head_dim}, Q.dtype()); + + // Compute scale if not provided + if (scale <= 0.0f) { + scale = 1.0f / sqrtf((float)head_dim); + } + + // Causal offset for proper masking + int causal_offset = kv_len - q_len; + + // Grid: one block per (head, query_position) pair + dim3 grid(n_heads, q_len); + int block_size = 128; // Enough threads for reduction + + // Shared memory: need space for attention scores [kv_len] + size_t shared_mem_size = kv_len * sizeof(float); + + switch (Q.dtype()) { + case DataType::Float32: + nn::sdpa_causal_f32_kernel<<>>( + static_cast(Q.data()), + static_cast(K.data()), + static_cast(V.data()), + static_cast(result.data()), + n_heads, q_len, kv_len, head_dim, scale, causal_offset); + break; + case DataType::Float16: + nn::sdpa_causal_f16_kernel<<>>( + static_cast(Q.data()), + static_cast(K.data()), + static_cast(V.data()), + static_cast<__half*>(result.data()), + n_heads, q_len, kv_len, head_dim, scale, causal_offset); + break; + case DataType::BFloat16: + nn::sdpa_causal_bf16_kernel<<>>( + static_cast(Q.data()), + static_cast(K.data()), + static_cast(V.data()), + static_cast<__nv_bfloat16*>(result.data()), + n_heads, q_len, kv_len, head_dim, scale, causal_offset); + break; + default: + throw std::runtime_error("sdpa only supports Float32, Float16, BFloat16"); + } + + sync_and_check("sdpa kernel failed"); + return result; +} + +// ============================================================================ +// Tensor Manipulation Operations +// 
============================================================================ + +// Concat two tensors along axis 0 +// a: [dim0_a, ...], b: [dim0_b, ...] -> output: [dim0_a + dim0_b, ...] +GPUArray concat_axis0(const GPUArray& a, const GPUArray& b) { + if (a.dtype() != b.dtype()) { + throw std::runtime_error("concat: dtype mismatch"); + } + if (a.dtype() != DataType::Float32) { + throw std::runtime_error("concat: only float32 supported"); + } + if (a.ndim() < 1 || b.ndim() < 1 || a.ndim() != b.ndim()) { + throw std::runtime_error("concat: dimension mismatch"); + } + + // Check that all dimensions except axis 0 match + for (size_t i = 1; i < a.ndim(); i++) { + if (a.shape()[i] != b.shape()[i]) { + throw std::runtime_error("concat: shape mismatch on non-concat axis"); + } + } + + // Compute output shape + std::vector out_shape = a.shape(); + out_shape[0] = a.shape()[0] + b.shape()[0]; + + GPUArray result(out_shape, a.dtype()); + + // Compute stride (elements per "row" along axis 0) + size_t stride = 1; + for (size_t i = 1; i < a.ndim(); i++) { + stride *= a.shape()[i]; + } + + size_t total = result.size(); + const int block_size = 256; + const int grid_size = (total + block_size - 1) / block_size; + + nn::concat_axis0_f32_kernel<<>>( + static_cast(a.data()), + static_cast(b.data()), + static_cast(result.data()), + a.shape()[0], b.shape()[0], stride); + + sync_and_check("concat_axis0 kernel failed"); + return result; +} + +// Repeat interleave along axis 1 (for GQA expansion) +// input: [dim0, dim1, dim2] -> output: [dim0, dim1 * repeats, dim2] +GPUArray repeat_interleave_axis1(const GPUArray& input, size_t repeats) { + if (input.dtype() != DataType::Float32) { + throw std::runtime_error("repeat_interleave: only float32 supported"); + } + if (input.ndim() != 3) { + throw std::runtime_error("repeat_interleave: expects 3D tensor [dim0, dim1, dim2]"); + } + + size_t dim0 = input.shape()[0]; + size_t dim1 = input.shape()[1]; + size_t dim2 = input.shape()[2]; + + std::vector out_shape = {dim0, dim1 * repeats, dim2}; + GPUArray result(out_shape, input.dtype()); + + size_t total = result.size(); + const int block_size = 256; + const int grid_size = (total + block_size - 1) / block_size; + + nn::repeat_interleave_axis1_f32_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + dim0, dim1, dim2, repeats); + + sync_and_check("repeat_interleave_axis1 kernel failed"); + return result; +} + +// Transpose 3D tensor: [d0, d1, d2] -> [d1, d0, d2] +GPUArray transpose_3d_021(const GPUArray& input) { + if (input.dtype() != DataType::Float32) { + throw std::runtime_error("transpose_3d_021: only float32 supported"); + } + if (input.ndim() != 3) { + throw std::runtime_error("transpose_3d_021: expects 3D tensor"); + } + + size_t dim0 = input.shape()[0]; + size_t dim1 = input.shape()[1]; + size_t dim2 = input.shape()[2]; + + // Output shape: [dim1, dim0, dim2] + std::vector out_shape = {dim1, dim0, dim2}; + GPUArray result(out_shape, input.dtype()); + + size_t total = input.size(); + const int block_size = 256; + const int grid_size = (total + block_size - 1) / block_size; + + nn::transpose_021_f32_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + dim0, dim1, dim2); + + sync_and_check("transpose_3d_021 kernel failed"); + return result; +} + +// Reshape with copy (creates contiguous tensor with new shape) +GPUArray reshape_copy(const GPUArray& input, const std::vector& new_shape) { + if (input.dtype() != DataType::Float32) { + throw std::runtime_error("reshape_copy: only 
float32 supported"); + } + + // Verify total size matches + size_t input_size = input.size(); + size_t output_size = 1; + for (size_t dim : new_shape) { + output_size *= dim; + } + + if (input_size != output_size) { + throw std::runtime_error("reshape_copy: total size mismatch"); + } + + GPUArray result(new_shape, input.dtype()); + + const int block_size = 256; + const int grid_size = (input_size + block_size - 1) / block_size; + + nn::copy_f32_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + input_size); + + sync_and_check("reshape_copy kernel failed"); + return result; +} + } // namespace ops } // namespace pygpukit diff --git a/native/ops/nn/nn_kernels.cuh b/native/ops/nn/nn_kernels.cuh index aa30907..3c25b7e 100644 --- a/native/ops/nn/nn_kernels.cuh +++ b/native/ops/nn/nn_kernels.cuh @@ -477,6 +477,225 @@ __global__ void layernorm_bf16_kernel(const __nv_bfloat16* __restrict__ input, } } +// ============================================================================ +// RMSNorm (Root Mean Square Normalization) +// ============================================================================ + +// RMSNorm: y = x / sqrt(mean(x^2) + eps) * gamma +// Input: [batch, features], normalize over features dimension +// Simpler than LayerNorm: no mean subtraction, no beta + +__global__ void rmsnorm_f32_kernel(const float* __restrict__ input, + const float* __restrict__ gamma, + float* __restrict__ output, + size_t batch_size, + size_t features, + float eps) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const float* row_input = input + row * features; + float* row_output = output + row * features; + + // Compute sum of squares using parallel reduction + float sum_sq = 0.0f; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float val = row_input[i]; + sum_sq += val * val; + } + + // Warp-level reduction + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + + // Block-level reduction using shared memory + __shared__ float shared_sum[32]; // Max 32 warps + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_sum[warp_id] = sum_sq; + } + __syncthreads(); + + // First warp reduces across warps + if (warp_id == 0) { + sum_sq = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + } + + __shared__ float inv_rms; + if (threadIdx.x == 0) { + // RMS = sqrt(mean(x^2) + eps) + inv_rms = rsqrtf(sum_sq / features + eps); + } + __syncthreads(); + + // Normalize and apply scale (gamma) + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float x = row_input[i]; + row_output[i] = x * inv_rms * gamma[i]; + } +} + +// Double precision RMSNorm +__global__ void rmsnorm_f64_kernel(const double* __restrict__ input, + const double* __restrict__ gamma, + double* __restrict__ output, + size_t batch_size, + size_t features, + double eps) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const double* row_input = input + row * features; + double* row_output = output + row * features; + + double sum_sq = 0.0; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + double val = row_input[i]; + sum_sq += val * val; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + + __shared__ double shared_sum[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_sum[warp_id] = sum_sq; + } + __syncthreads(); + + if (warp_id == 0) { + sum_sq = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_sum[threadIdx.x] : 0.0; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + } + + __shared__ double inv_rms; + if (threadIdx.x == 0) { + inv_rms = rsqrt(sum_sq / features + eps); + } + __syncthreads(); + + for (int i = threadIdx.x; i < features; i += blockDim.x) { + double x = row_input[i]; + row_output[i] = x * inv_rms * gamma[i]; + } +} + +// FP16 RMSNorm (compute in FP32 for precision) +__global__ void rmsnorm_f16_kernel(const __half* __restrict__ input, + const __half* __restrict__ gamma, + __half* __restrict__ output, + size_t batch_size, + size_t features, + float eps) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const __half* row_input = input + row * features; + __half* row_output = output + row * features; + + float sum_sq = 0.0f; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float val = __half2float(row_input[i]); + sum_sq += val * val; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + + __shared__ float shared_sum[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_sum[warp_id] = sum_sq; + } + __syncthreads(); + + if (warp_id == 0) { + sum_sq = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + } + + __shared__ float inv_rms; + if (threadIdx.x == 0) { + inv_rms = rsqrtf(sum_sq / features + eps); + } + __syncthreads(); + + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float x = __half2float(row_input[i]); + float g = __half2float(gamma[i]); + row_output[i] = __float2half(x * inv_rms * g); + } +} + +// BF16 RMSNorm (compute in FP32 for precision) +__global__ void rmsnorm_bf16_kernel(const __nv_bfloat16* __restrict__ input, + const __nv_bfloat16* __restrict__ gamma, + __nv_bfloat16* __restrict__ output, + size_t batch_size, + size_t features, + float eps) { + int row = blockIdx.x; + if (row >= batch_size) return; + + const __nv_bfloat16* row_input = input + row * features; + __nv_bfloat16* row_output = output + row * features; + + float sum_sq = 0.0f; + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float val = __bfloat162float(row_input[i]); + sum_sq += val * val; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + + __shared__ float shared_sum[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + + if (lane == 0) { + shared_sum[warp_id] = sum_sq; + } + __syncthreads(); + + if (warp_id == 0) { + sum_sq = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum_sq += __shfl_down_sync(0xffffffff, sum_sq, offset); + } + } + + __shared__ float inv_rms; + if (threadIdx.x == 0) { + inv_rms = rsqrtf(sum_sq / features + eps); + } + __syncthreads(); + + for (int i = threadIdx.x; i < features; i += blockDim.x) { + float x = __bfloat162float(row_input[i]); + float g = __bfloat162float(gamma[i]); + row_output[i] = __float2bfloat16(x * inv_rms * g); + } +} + // ============================================================================ // Softmax // ============================================================================ @@ -917,6 +1136,565 @@ __global__ void transpose_bf16_kernel(const __nv_bfloat16* __restrict__ input, } } +// ============================================================================ +// Tensor Manipulation Operations +// ============================================================================ + +// Concat two tensors along axis 0 +// src1: [dim0_1, dim1, dim2], src2: [dim0_2, dim1, dim2] +// dst: [dim0_1 + dim0_2, dim1, dim2] +__global__ void concat_axis0_f32_kernel( + const float* __restrict__ src1, + const float* __restrict__ src2, + float* __restrict__ dst, + size_t dim0_1, // First tensor's dim0 + size_t dim0_2, // Second tensor's dim0 + size_t stride // dim1 * dim2 (elements per row) +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_src1 = dim0_1 * stride; + size_t total = (dim0_1 + dim0_2) * stride; + + if (idx < total) { + if (idx < total_src1) { + dst[idx] = src1[idx]; + } else { + dst[idx] = src2[idx - total_src1]; + } + } +} + +// Repeat tensor along axis 1 (for GQA expansion) +// src: [dim0, dim1, dim2] -> dst: [dim0, dim1 * repeats, dim2] +// Each element in dim1 is repeated 'repeats' times +__global__ void repeat_interleave_axis1_f32_kernel( + const float* __restrict__ src, + float* __restrict__ dst, + size_t dim0, + size_t dim1, + size_t dim2, + size_t repeats +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = dim0 
* dim1 * repeats * dim2; + + if (idx < total) { + // Compute output coordinates + size_t d2 = idx % dim2; + size_t remaining = idx / dim2; + size_t d1_out = remaining % (dim1 * repeats); + size_t d0 = remaining / (dim1 * repeats); + + // Map output d1 to input d1 (integer division gives the source index) + size_t d1_in = d1_out / repeats; + + // Compute source index + size_t src_idx = d0 * dim1 * dim2 + d1_in * dim2 + d2; + dst[idx] = src[src_idx]; + } +} + +// Transpose 3D tensor: [d0, d1, d2] -> [d1, d0, d2] +// Swaps axes 0 and 1 +__global__ void transpose_021_f32_kernel( + const float* __restrict__ src, + float* __restrict__ dst, + size_t dim0, + size_t dim1, + size_t dim2 +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = dim0 * dim1 * dim2; + + if (idx < total) { + // Compute source coordinates [d0, d1, d2] + size_t d2 = idx % dim2; + size_t remaining = idx / dim2; + size_t d1 = remaining % dim1; + size_t d0 = remaining / dim1; + + // Compute destination index [d1, d0, d2] + size_t dst_idx = d1 * dim0 * dim2 + d0 * dim2 + d2; + dst[dst_idx] = src[idx]; + } +} + +// Reshape with copy (ensures contiguous output) +// Simply copies data - reshape is handled by changing shape metadata +__global__ void copy_f32_kernel( + const float* __restrict__ src, + float* __restrict__ dst, + size_t n +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + dst[idx] = src[idx]; + } +} + +// ============================================================================ +// RoPE (Rotary Position Embedding) +// ============================================================================ +// +// Applies rotary position embeddings to Q and K tensors +// q, k: [seq_len, n_heads, head_dim] - input tensors (modified in-place) +// cos, sin: [seq_len, head_dim] - precomputed rotary frequencies +// +// For each position i and head h: +// q_rot[i,h,0:d/2] = q[i,h,0:d/2] * cos[i,0:d/2] - q[i,h,d/2:d] * sin[i,0:d/2] +// q_rot[i,h,d/2:d] = q[i,h,d/2:d] * cos[i,0:d/2] + q[i,h,0:d/2] * sin[i,0:d/2] + +__global__ void rope_f32_kernel( + float* __restrict__ q, // [seq_len, n_heads_q, head_dim] - modified in-place + float* __restrict__ k, // [seq_len, n_heads_k, head_dim] - modified in-place + const float* __restrict__ cos, // [seq_len, head_dim] + const float* __restrict__ sin, // [seq_len, head_dim] + int seq_len, + int n_heads_q, + int n_heads_k, + int head_dim +) { + int half_dim = head_dim / 2; + + // Each thread handles one (seq_pos, head, dim_pair) + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total_q = seq_len * n_heads_q * half_dim; + int total_k = seq_len * n_heads_k * half_dim; + + // Process Q tensor + if (idx < total_q) { + int d = idx % half_dim; // Which pair (0 to half_dim-1) + int remaining = idx / half_dim; + int h = remaining % n_heads_q; + int s = remaining / n_heads_q; + + int base = s * n_heads_q * head_dim + h * head_dim; + float q0 = q[base + d]; + float q1 = q[base + d + half_dim]; + + int cos_idx = s * head_dim + d; + float c = cos[cos_idx]; + float sn = sin[cos_idx]; + + q[base + d] = q0 * c - q1 * sn; + q[base + d + half_dim] = q1 * c + q0 * sn; + } + + // Process K tensor (may have fewer heads than Q due to GQA) + if (idx < total_k) { + int d = idx % half_dim; + int remaining = idx / half_dim; + int h = remaining % n_heads_k; + int s = remaining / n_heads_k; + + int base = s * n_heads_k * head_dim + h * head_dim; + float k0 = k[base + d]; + float k1 = k[base + d + half_dim]; + + int cos_idx = s * head_dim + d; + float c = cos[cos_idx]; + 
float sn = sin[cos_idx]; + + k[base + d] = k0 * c - k1 * sn; + k[base + d + half_dim] = k1 * c + k0 * sn; + } +} + +// ============================================================================ +// SiLU (Swish) Activation: x * sigmoid(x) +// ============================================================================ + +__device__ __forceinline__ float silu_f32(float x) { + return x / (1.0f + expf(-x)); +} + +__global__ void silu_f32_kernel(const float* __restrict__ input, + float* __restrict__ output, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + output[idx] = silu_f32(input[idx]); + } +} + +__global__ void silu_f64_kernel(const double* __restrict__ input, + double* __restrict__ output, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + double x = input[idx]; + output[idx] = x / (1.0 + exp(-x)); + } +} + +__global__ void silu_f16_kernel(const __half* __restrict__ input, + __half* __restrict__ output, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + float x = __half2float(input[idx]); + output[idx] = __float2half(silu_f32(x)); + } +} + +__global__ void silu_bf16_kernel(const __nv_bfloat16* __restrict__ input, + __nv_bfloat16* __restrict__ output, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + float x = __bfloat162float(input[idx]); + output[idx] = __float2bfloat16(silu_f32(x)); + } +} + +// ============================================================================ +// Scaled Dot-Product Attention (SDPA) with Causal Mask +// ============================================================================ +// +// For multi-head attention: +// Q: [n_heads, q_len, head_dim] +// K: [n_heads, kv_len, head_dim] +// V: [n_heads, kv_len, head_dim] +// Output: [n_heads, q_len, head_dim] +// +// Algorithm: +// 1. scores = Q @ K^T / sqrt(head_dim) -> [n_heads, q_len, kv_len] +// 2. Apply causal mask (future positions = -inf) +// 3. weights = softmax(scores, dim=-1) +// 4. output = weights @ V -> [n_heads, q_len, head_dim] +// +// This kernel handles one (head, query_position) pair per block. +// Each block computes attention for one query position in one head. 
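+//
+// Illustrative launch from the host side (a sketch only, not the actual wrapper;
+// `stream` and the device pointers are assumed to exist in the caller):
+//   dim3 grid(n_heads, q_len);              // one block per (head, query position)
+//   int block = 256;
+//   size_t smem = kv_len * sizeof(float);   // dynamic shared memory for scores[]
+//   sdpa_causal_f32_kernel<<<grid, block, smem, stream>>>(
+//       Q, K, V, out, n_heads, q_len, kv_len, head_dim,
+//       1.0f / sqrtf((float)head_dim), kv_len - q_len);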
+ +__global__ void sdpa_causal_f32_kernel( + const float* __restrict__ Q, // [n_heads, q_len, head_dim] + const float* __restrict__ K, // [n_heads, kv_len, head_dim] + const float* __restrict__ V, // [n_heads, kv_len, head_dim] + float* __restrict__ output, // [n_heads, q_len, head_dim] + int n_heads, + int q_len, + int kv_len, + int head_dim, + float scale, // 1/sqrt(head_dim) + int causal_offset // kv_len - q_len (for proper causal masking) +) { + // Each block handles one (head, query_pos) pair + int head_idx = blockIdx.x; + int q_pos = blockIdx.y; + + if (head_idx >= n_heads || q_pos >= q_len) return; + + // Pointers for this head + const float* Q_head = Q + head_idx * q_len * head_dim + q_pos * head_dim; + const float* K_head = K + head_idx * kv_len * head_dim; + const float* V_head = V + head_idx * kv_len * head_dim; + float* out_head = output + head_idx * q_len * head_dim + q_pos * head_dim; + + // Causal mask: query at position q_pos can attend to positions 0..(causal_offset + q_pos) + int max_attend = causal_offset + q_pos + 1; + if (max_attend > kv_len) max_attend = kv_len; + + // Step 1: Compute attention scores and find max (for numerical stability) + extern __shared__ float shared[]; + float* scores = shared; // [kv_len] + + float max_score = -INFINITY; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + float score = 0.0f; + if (kv_pos < max_attend) { + // Dot product Q[q_pos] @ K[kv_pos] + for (int d = 0; d < head_dim; d++) { + score += Q_head[d] * K_head[kv_pos * head_dim + d]; + } + score *= scale; + } else { + score = -INFINITY; // Masked position + } + scores[kv_pos] = score; + if (score > max_score) max_score = score; + } + + // Reduce max across threads + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + float other = __shfl_down_sync(0xffffffff, max_score, offset); + max_score = fmaxf(max_score, other); + } + + __shared__ float shared_max[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + if (lane == 0) shared_max[warp_id] = max_score; + __syncthreads(); + + if (warp_id == 0) { + max_score = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_max[threadIdx.x] : -INFINITY; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_score = fmaxf(max_score, __shfl_down_sync(0xffffffff, max_score, offset)); + } + } + + __shared__ float row_max; + if (threadIdx.x == 0) row_max = max_score; + __syncthreads(); + + // Step 2: Compute exp(score - max) and sum + float sum = 0.0f; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + float exp_score = expf(scores[kv_pos] - row_max); + scores[kv_pos] = exp_score; + sum += exp_score; + } + + // Reduce sum + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + + __shared__ float shared_sum[32]; + if (lane == 0) shared_sum[warp_id] = sum; + __syncthreads(); + + if (warp_id == 0) { + sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + } + + __shared__ float row_sum; + if (threadIdx.x == 0) row_sum = sum; + __syncthreads(); + + // Step 3: Normalize scores to get attention weights + float inv_sum = 1.0f / row_sum; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + scores[kv_pos] *= inv_sum; + } + __syncthreads(); + + // Step 4: Compute output = weights @ V + // Each thread handles a subset of head_dim + for (int d = threadIdx.x; d < head_dim; d += blockDim.x) { + float out_val = 0.0f; + for (int kv_pos = 0; kv_pos < kv_len; kv_pos++) { + out_val += scores[kv_pos] * V_head[kv_pos * head_dim + d]; + } + out_head[d] = out_val; + } +} + +// FP16 SDPA (compute in FP32 for precision) +__global__ void sdpa_causal_f16_kernel( + const __half* __restrict__ Q, + const __half* __restrict__ K, + const __half* __restrict__ V, + __half* __restrict__ output, + int n_heads, + int q_len, + int kv_len, + int head_dim, + float scale, + int causal_offset +) { + int head_idx = blockIdx.x; + int q_pos = blockIdx.y; + + if (head_idx >= n_heads || q_pos >= q_len) return; + + const __half* Q_head = Q + head_idx * q_len * head_dim + q_pos * head_dim; + const __half* K_head = K + head_idx * kv_len * head_dim; + const __half* V_head = V + head_idx * kv_len * head_dim; + __half* out_head = output + head_idx * q_len * head_dim + q_pos * head_dim; + + int max_attend = causal_offset + q_pos + 1; + if (max_attend > kv_len) max_attend = kv_len; + + extern __shared__ float shared[]; + float* scores = shared; + + float max_score = -INFINITY; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + float score = 0.0f; + if (kv_pos < max_attend) { + for (int d = 0; d < head_dim; d++) { + score += __half2float(Q_head[d]) * __half2float(K_head[kv_pos * head_dim + d]); + } + score *= scale; + } else { + score = -INFINITY; + } + scores[kv_pos] = score; + if (score > max_score) max_score = score; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_score = fmaxf(max_score, __shfl_down_sync(0xffffffff, max_score, offset)); + } + + __shared__ float shared_max[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + if (lane == 0) shared_max[warp_id] = max_score; + __syncthreads(); + + if (warp_id == 0) { + max_score = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_max[threadIdx.x] : -INFINITY; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_score = fmaxf(max_score, __shfl_down_sync(0xffffffff, max_score, offset)); + } + } + + __shared__ float row_max; + if (threadIdx.x == 0) row_max = max_score; + __syncthreads(); + + float sum = 0.0f; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + float exp_score = expf(scores[kv_pos] - row_max); + scores[kv_pos] = exp_score; + sum += exp_score; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + + __shared__ float shared_sum[32]; + if (lane == 0) shared_sum[warp_id] = sum; + __syncthreads(); + + if (warp_id == 0) { + sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + } + + __shared__ float row_sum; + if (threadIdx.x == 0) row_sum = sum; + __syncthreads(); + + float inv_sum = 1.0f / row_sum; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + scores[kv_pos] *= inv_sum; + } + __syncthreads(); + + for (int d = threadIdx.x; d < head_dim; d += blockDim.x) { + float out_val = 0.0f; + for (int kv_pos = 0; kv_pos < kv_len; kv_pos++) { + out_val += scores[kv_pos] * __half2float(V_head[kv_pos * head_dim + d]); + } + out_head[d] = __float2half(out_val); + } +} + +// BF16 SDPA +__global__ void sdpa_causal_bf16_kernel( + const __nv_bfloat16* __restrict__ Q, + const __nv_bfloat16* __restrict__ K, + const __nv_bfloat16* __restrict__ V, + __nv_bfloat16* __restrict__ output, + int n_heads, + int q_len, + int kv_len, + int head_dim, + float scale, + int causal_offset +) { + int head_idx = blockIdx.x; + int q_pos = blockIdx.y; + + if (head_idx >= n_heads || q_pos >= q_len) return; + + const __nv_bfloat16* Q_head = Q + head_idx * q_len * head_dim + q_pos * head_dim; + const __nv_bfloat16* K_head = K + head_idx * kv_len * head_dim; + const __nv_bfloat16* V_head = V + head_idx * kv_len * head_dim; + __nv_bfloat16* out_head = output + head_idx * q_len * head_dim + q_pos * head_dim; + + int max_attend = causal_offset + q_pos + 1; + if (max_attend > kv_len) max_attend = kv_len; + + extern __shared__ float shared[]; + float* scores = shared; + + float max_score = -INFINITY; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + float score = 0.0f; + if (kv_pos < max_attend) { + for (int d = 0; d < head_dim; d++) { + score += __bfloat162float(Q_head[d]) * __bfloat162float(K_head[kv_pos * head_dim + d]); + } + score *= scale; + } else { + score = -INFINITY; + } + scores[kv_pos] = score; + if (score > max_score) max_score = score; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_score = fmaxf(max_score, __shfl_down_sync(0xffffffff, max_score, offset)); + } + + __shared__ float shared_max[32]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + if (lane == 0) shared_max[warp_id] = max_score; + __syncthreads(); + + if (warp_id == 0) { + max_score = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? shared_max[threadIdx.x] : -INFINITY; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + max_score = fmaxf(max_score, __shfl_down_sync(0xffffffff, max_score, offset)); + } + } + + __shared__ float row_max; + if (threadIdx.x == 0) row_max = max_score; + __syncthreads(); + + float sum = 0.0f; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + float exp_score = expf(scores[kv_pos] - row_max); + scores[kv_pos] = exp_score; + sum += exp_score; + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + + __shared__ float shared_sum[32]; + if (lane == 0) shared_sum[warp_id] = sum; + __syncthreads(); + + if (warp_id == 0) { + sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ? 
shared_sum[threadIdx.x] : 0.0f; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(0xffffffff, sum, offset); + } + } + + __shared__ float row_sum; + if (threadIdx.x == 0) row_sum = sum; + __syncthreads(); + + float inv_sum = 1.0f / row_sum; + for (int kv_pos = threadIdx.x; kv_pos < kv_len; kv_pos += blockDim.x) { + scores[kv_pos] *= inv_sum; + } + __syncthreads(); + + for (int d = threadIdx.x; d < head_dim; d += blockDim.x) { + float out_val = 0.0f; + for (int kv_pos = 0; kv_pos < kv_len; kv_pos++) { + out_val += scores[kv_pos] * __bfloat162float(V_head[kv_pos * head_dim + d]); + } + out_head[d] = __float2bfloat16(out_val); + } +} + } // namespace nn } // namespace ops } // namespace pygpukit diff --git a/native/ops/ops.cuh b/native/ops/ops.cuh index c851702..36be976 100644 --- a/native/ops/ops.cuh +++ b/native/ops/ops.cuh @@ -102,6 +102,28 @@ GPUArray layernorm(const GPUArray& input, const GPUArray& gamma, const GPUArray& // Applied row-wise: input [batch, features] -> output [batch, features] GPUArray softmax(const GPUArray& input); +// RMSNorm: y = x / sqrt(mean(x^2) + eps) * gamma +// input: [batch, features], gamma: [features] +// Simpler than LayerNorm (no mean subtraction, no beta) +GPUArray rmsnorm(const GPUArray& input, const GPUArray& gamma, float eps = 1e-5f); + +// SiLU (Swish) activation: y = x * sigmoid(x) +GPUArray silu(const GPUArray& input); + +// RoPE (Rotary Position Embedding) - In-place +// q: [seq_len, n_heads_q, head_dim] +// k: [seq_len, n_heads_k, head_dim] +// cos, sin: [seq_len, head_dim] +void rope_inplace(GPUArray& q, GPUArray& k, const GPUArray& cos, const GPUArray& sin); + +// Scaled Dot-Product Attention with Causal Mask +// Q: [n_heads, q_len, head_dim] +// K: [n_heads, kv_len, head_dim] +// V: [n_heads, kv_len, head_dim] +// Output: [n_heads, q_len, head_dim] +// scale: 1/sqrt(head_dim), computed automatically if <= 0 +GPUArray sdpa_causal(const GPUArray& Q, const GPUArray& K, const GPUArray& V, float scale = 0.0f); + // ============================================================================ // Fused Operations (CUTLASS Epilogue Fusion) // ============================================================================ @@ -112,5 +134,23 @@ GPUArray softmax(const GPUArray& input); // output: [batch, out_features] GPUArray linear_bias_gelu(const GPUArray& input, const GPUArray& weight, const GPUArray& bias); +// ============================================================================ +// Tensor Manipulation Operations +// ============================================================================ + +// Concat two tensors along axis 0 +// a: [dim0_a, ...], b: [dim0_b, ...] -> output: [dim0_a + dim0_b, ...] 
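+// A typical use is appending freshly projected K/V rows to a cache along the
+// sequence axis, e.g. (illustrative, variable names assumed):
+//   k_cache = concat_axis0(k_cache, k_new);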
+GPUArray concat_axis0(const GPUArray& a, const GPUArray& b);
+
+// Repeat interleave along axis 1 (for GQA expansion)
+// input: [dim0, dim1, dim2] -> output: [dim0, dim1 * repeats, dim2]
+GPUArray repeat_interleave_axis1(const GPUArray& input, size_t repeats);
+
+// Transpose 3D tensor: [d0, d1, d2] -> [d1, d0, d2]
+GPUArray transpose_3d_021(const GPUArray& input);
+
+// Reshape with copy (creates contiguous tensor with new shape)
+GPUArray reshape_copy(const GPUArray& input, const std::vector<size_t>& new_shape);
+
 } // namespace ops
 } // namespace pygpukit
diff --git a/src/pygpukit/core/backend.py b/src/pygpukit/core/backend.py
index a22b271..d3c37c1 100644
--- a/src/pygpukit/core/backend.py
+++ b/src/pygpukit/core/backend.py
@@ -500,7 +500,7 @@ def get_rust_module() -> Any | None:
     _rust_import_attempted = True
     try:
-        from pygpukit import _pygpukit_rust  # type: ignore[attr-defined]
+        import _pygpukit_rust  # type: ignore[import-not-found]
         _rust_module = _pygpukit_rust
     except ImportError:
diff --git a/src/pygpukit/llm/__init__.py b/src/pygpukit/llm/__init__.py
index 7629968..afce946 100644
--- a/src/pygpukit/llm/__init__.py
+++ b/src/pygpukit/llm/__init__.py
@@ -330,14 +330,28 @@ def __repr__(self) -> str:
 from pygpukit.llm.model import (  # noqa: E402
+    # Unified Transformer classes
+    TransformerConfig,
+    CausalTransformerModel,
+    Attention,
     MLP,
-    CausalSelfAttention,
+    Norm,
+    TransformerBlock,
+    Linear,
+    # Legacy GPT-2 aliases
     GPT2Config,
     GPT2Model,
+    CausalSelfAttention,
     LayerNorm,
-    Linear,
-    TransformerBlock,
     load_gpt2_from_safetensors,
+    # Legacy Llama aliases
+    LlamaConfig,
+    LlamaModel,
+    LlamaAttention,
+    LlamaBlock,
+    LlamaMLP,
+    RMSNorm,
+    load_llama_from_safetensors,
 )
 
 __all__ = [
@@ -348,13 +362,26 @@ def __repr__(self) -> str:
     "load_safetensors",
     # Tokenizer
     "Tokenizer",
-    # Model components
+    # Unified Transformer (v0.2.9)
+    "TransformerConfig",
+    "CausalTransformerModel",
+    "Attention",
+    "MLP",
+    "Norm",
+    "TransformerBlock",
+    "Linear",
+    # Legacy GPT-2 aliases
     "GPT2Config",
     "GPT2Model",
     "CausalSelfAttention",
     "LayerNorm",
-    "Linear",
-    "MLP",
-    "TransformerBlock",
     "load_gpt2_from_safetensors",
+    # Legacy Llama aliases
+    "LlamaConfig",
+    "LlamaModel",
+    "LlamaAttention",
+    "LlamaBlock",
+    "LlamaMLP",
+    "RMSNorm",
+    "load_llama_from_safetensors",
 ]
diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py
index f2e5e57..99b2367 100644
--- a/src/pygpukit/llm/model.py
+++ b/src/pygpukit/llm/model.py
@@ -1,32 +1,167 @@
-"""LLM model components for PyGPUkit.
+"""Unified Transformer implementation for PyGPUkit.
 
-Provides transformer building blocks for GPT-2 style models:
-- MLP block (fc1 -> gelu -> fc2)
-- TransformerBlock (ln -> mlp -> residual)
-- GPT2Model (embedding -> blocks -> lm_head)
+Provides a common Transformer abstraction that supports both GPT-2 and LLaMA
+architectures through configuration differences only.
+ +Key features: +- Hybrid Attention: CPU for seq_len=1 (decode), GPU for prefill +- GPU-native operations: RMSNorm, LayerNorm, SDPA, SiLU, GELU, RoPE +- Unified TransformerConfig for all model variants +- Backward-compatible loaders for GPT-2 and LLaMA safetensors """ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal + +import numpy as np from pygpukit.core.array import GPUArray from pygpukit.core.factory import from_numpy -from pygpukit.ops.basic import add, bias_add_inplace, gelu, layernorm, matmul, transpose +from pygpukit.ops.basic import ( + add, + bias_add_inplace, + gelu, + layernorm, + matmul, + mul, + reshape_copy, + rmsnorm, + rope_inplace, + sdpa_causal, + silu, + transpose, + transpose_3d_021, +) if TYPE_CHECKING: pass +# ============================================================================= +# Common Sampling Functions +# ============================================================================= + + +def sample_token( + logits: np.ndarray, + temperature: float = 1.0, + top_k: int = 0, + top_p: float = 1.0, +) -> int: + """Sample a token from logits with temperature, top-k, and top-p. + + Args: + logits: Logits array [vocab_size] + temperature: Sampling temperature (lower = more deterministic) + top_k: Keep only top-k tokens (0 = disabled) + top_p: Keep tokens with cumulative prob <= top_p (1.0 = disabled) + + Returns: + Sampled token ID + """ + # Apply temperature + if temperature != 1.0 and temperature > 0: + logits = logits / temperature + + # Convert to probabilities + logits_max = logits.max() + exp_logits = np.exp(logits - logits_max) + probs = exp_logits / exp_logits.sum() + + # Top-k filtering + if top_k > 0 and top_k < len(probs): + top_k_indices = np.argsort(probs)[-top_k:] + mask = np.zeros_like(probs, dtype=bool) + mask[top_k_indices] = True + probs = np.where(mask, probs, 0.0) + probs = probs / probs.sum() + + # Top-p (nucleus) filtering + if top_p < 1.0: + sorted_indices = np.argsort(probs)[::-1] + sorted_probs = probs[sorted_indices] + cumsum = np.cumsum(sorted_probs) + cutoff_idx = np.searchsorted(cumsum, top_p) + 1 + cutoff_idx = min(cutoff_idx, len(sorted_probs)) + mask = np.zeros_like(probs, dtype=bool) + mask[sorted_indices[:cutoff_idx]] = True + probs = np.where(mask, probs, 0.0) + probs = probs / probs.sum() + + # Sample + if temperature == 0: + return int(np.argmax(probs)) + else: + return int(np.random.choice(len(probs), p=probs)) + + +# ============================================================================= +# Unified Transformer Configuration +# ============================================================================= + + @dataclass -class GPT2Config: - """Configuration for GPT-2 model. +class TransformerConfig: + """Unified configuration for Transformer models. + + Supports both GPT-2 and LLaMA style architectures through configuration. 
+ + GPT-2 style: + norm_type="layernorm", activation="gelu", use_rope=False - GPT-2 Small defaults: - vocab_size=50257, n_embd=768, n_layer=12, n_head=12 + LLaMA style: + norm_type="rmsnorm", activation="silu", use_rope=True """ + # Core dimensions + vocab_size: int = 32000 + hidden_size: int = 2048 + num_layers: int = 22 + num_heads: int = 32 + num_kv_heads: int | None = None # None = MHA, int = GQA/MQA + intermediate_size: int | None = None # None = 4 * hidden_size + + # Architecture choices + norm_type: Literal["rmsnorm", "layernorm"] = "rmsnorm" + activation: Literal["gelu", "silu"] = "silu" + use_rope: bool = True + causal: bool = True + + # Hyperparameters + max_position_embeddings: int = 2048 + norm_eps: float = 1e-5 + rope_theta: float = 10000.0 + + # Weight tying + tie_word_embeddings: bool = True + + def __post_init__(self): + if self.num_kv_heads is None: + self.num_kv_heads = self.num_heads + if self.intermediate_size is None: + self.intermediate_size = 4 * self.hidden_size + + @property + def head_dim(self) -> int: + return self.hidden_size // self.num_heads + + @property + def num_kv_groups(self) -> int: + """Number of query heads per KV head (for GQA).""" + return self.num_heads // self.num_kv_heads + + +# ============================================================================= +# Legacy Config Classes (for backward compatibility) +# ============================================================================= + + +@dataclass +class GPT2Config: + """Configuration for GPT-2 model (legacy, use TransformerConfig).""" + vocab_size: int = 50257 n_embd: int = 768 n_layer: int = 12 @@ -36,427 +171,774 @@ class GPT2Config: @property def n_inner(self) -> int: - """Inner dimension of MLP (4 * n_embd).""" return 4 * self.n_embd + def to_transformer_config(self) -> TransformerConfig: + """Convert to unified TransformerConfig.""" + return TransformerConfig( + vocab_size=self.vocab_size, + hidden_size=self.n_embd, + num_layers=self.n_layer, + num_heads=self.n_head, + num_kv_heads=self.n_head, # MHA + intermediate_size=self.n_inner, + norm_type="layernorm", + activation="gelu", + use_rope=False, + causal=True, + max_position_embeddings=self.n_positions, + norm_eps=self.layer_norm_eps, + ) + + +@dataclass +class LlamaConfig: + """Configuration for Llama model (legacy, use TransformerConfig).""" + + vocab_size: int = 32000 + hidden_size: int = 2048 + intermediate_size: int = 5632 + num_hidden_layers: int = 22 + num_attention_heads: int = 32 + num_key_value_heads: int = 4 + max_position_embeddings: int = 2048 + rms_norm_eps: float = 1e-5 + rope_theta: float = 10000.0 + + @property + def head_dim(self) -> int: + return self.hidden_size // self.num_attention_heads + + def to_transformer_config(self) -> TransformerConfig: + """Convert to unified TransformerConfig.""" + return TransformerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + num_kv_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + norm_type="rmsnorm", + activation="silu", + use_rope=True, + causal=True, + max_position_embeddings=self.max_position_embeddings, + norm_eps=self.rms_norm_eps, + rope_theta=self.rope_theta, + ) + + +# ============================================================================= +# Common Building Blocks +# ============================================================================= + class Linear: """Linear layer: y = xW^T + b Weights are stored as [out_features, in_features] 
(PyTorch convention). - Forward pass uses GPU transpose and bias_add_inplace for efficiency. """ - def __init__( - self, - weight: GPUArray, - bias: GPUArray | None = None, - ): - """Initialize Linear layer. - - Args: - weight: Weight matrix [out_features, in_features] - bias: Optional bias vector [out_features] - """ + def __init__(self, weight: GPUArray, bias: GPUArray | None = None): if weight.ndim != 2: raise ValueError(f"weight must be 2D, got {weight.ndim}D") - self.weight = weight # [out_features, in_features] + self.weight = weight self.bias = bias self.out_features = weight.shape[0] self.in_features = weight.shape[1] - # Pre-transpose weight for efficient forward pass self._weight_t: GPUArray | None = None def __call__(self, x: GPUArray) -> GPUArray: - """Forward pass: y = xW^T + b - - Args: - x: Input tensor [batch, in_features] - - Returns: - Output tensor [batch, out_features] - """ if x.ndim != 2: raise ValueError(f"input must be 2D [batch, in_features], got {x.ndim}D") if x.shape[1] != self.in_features: - raise ValueError(f"input features {x.shape[1]} doesn't match weight {self.in_features}") + raise ValueError(f"input features {x.shape[1]} != weight {self.in_features}") - # Lazy transpose - compute once and cache - # weight: [out_features, in_features] -> weight_t: [in_features, out_features] if self._weight_t is None: self._weight_t = transpose(self.weight) - # y = x @ weight_t: [batch, in_features] @ [in_features, out_features] = [batch, out_features] y = matmul(x, self._weight_t) - # Add bias in-place on GPU if present if self.bias is not None: bias_add_inplace(y, self.bias) return y -class MLP: - """MLP block for GPT-2. - - Structure: fc1 -> gelu -> fc2 - fc1: [n_embd] -> [n_inner] - fc2: [n_inner] -> [n_embd] - """ +class Norm: + """Unified normalization layer supporting RMSNorm and LayerNorm.""" def __init__( self, - c_fc_weight: GPUArray, - c_fc_bias: GPUArray | None, - c_proj_weight: GPUArray, - c_proj_bias: GPUArray | None, + weight: GPUArray, + bias: GPUArray | None = None, + norm_type: Literal["rmsnorm", "layernorm"] = "rmsnorm", + eps: float = 1e-5, ): - """Initialize MLP block. 
- - Args: - c_fc_weight: First linear weight [n_inner, n_embd] - c_fc_bias: First linear bias [n_inner] - c_proj_weight: Second linear weight [n_embd, n_inner] - c_proj_bias: Second linear bias [n_embd] - """ - self.c_fc = Linear(c_fc_weight, c_fc_bias) - self.c_proj = Linear(c_proj_weight, c_proj_bias) + self.weight = weight + self.bias = bias + self.norm_type = norm_type + self.eps = eps def __call__(self, x: GPUArray) -> GPUArray: - """Forward pass: fc1 -> gelu -> fc2 + if self.norm_type == "rmsnorm": + return rmsnorm(x, self.weight, self.eps) + else: + if self.bias is None: + raise ValueError("LayerNorm requires bias") + return layernorm(x, self.weight, self.bias, self.eps) - Args: - x: Input tensor [batch, n_embd] - Returns: - Output tensor [batch, n_embd] - """ - h = self.c_fc(x) - h = gelu(h) - h = self.c_proj(h) - return h +# ============================================================================= +# RoPE (Rotary Position Embedding) +# ============================================================================= -class LayerNorm: - """Layer normalization with learnable parameters.""" +def precompute_freqs_cis( + head_dim: int, max_seq_len: int, theta: float = 10000.0 +) -> tuple[np.ndarray, np.ndarray]: + """Precompute rotary embedding cos/sin tables.""" + freqs = 1.0 / (theta ** (np.arange(0, head_dim, 2, dtype=np.float32) / head_dim)) + t = np.arange(max_seq_len, dtype=np.float32) + freqs = np.outer(t, freqs) + cos = np.cos(freqs) + sin = np.sin(freqs) + cos = np.concatenate([cos, cos], axis=-1) + sin = np.concatenate([sin, sin], axis=-1) + return cos, sin - def __init__( - self, - weight: GPUArray, - bias: GPUArray, - eps: float = 1e-5, - ): - """Initialize LayerNorm. - Args: - weight: Scale parameter (gamma) [features] - bias: Shift parameter (beta) [features] - eps: Epsilon for numerical stability - """ - self.weight = weight - self.bias = bias - self.eps = eps +def apply_rotary_pos_emb_numpy( + q: np.ndarray, k: np.ndarray, cos: np.ndarray, sin: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + """Apply rotary position embeddings to Q and K (numpy version).""" - def __call__(self, x: GPUArray) -> GPUArray: - """Forward pass. + def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return np.concatenate([-x2, x1], axis=-1) - Args: - x: Input tensor [batch, features] + cos = cos[:, np.newaxis, :] + sin = sin[:, np.newaxis, :] - Returns: - Normalized tensor [batch, features] - """ - return layernorm(x, self.weight, self.bias, self.eps) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed -class CausalSelfAttention: - """Causal self-attention for GPT-2. +# ============================================================================= +# Unified Attention +# ============================================================================= - Structure: - - c_attn: [n_embd] -> [3*n_embd] (Q, K, V projection) - - Split into n_head heads - - Q @ K^T / sqrt(d_k) with causal mask - - Softmax - - Attention @ V - - c_proj: [n_embd] -> [n_embd] + +class Attention: + """Unified attention with Hybrid CPU/GPU execution. 
+ + Supports: + - Multi-Head Attention (MHA): num_kv_heads == num_heads + - Grouped Query Attention (GQA): num_kv_heads < num_heads + - RoPE: enabled via config.use_rope + - Hybrid execution: CPU for seq_len=1, GPU for longer sequences """ def __init__( self, - c_attn_weight: GPUArray, - c_attn_bias: GPUArray | None, - c_proj_weight: GPUArray, - c_proj_bias: GPUArray | None, - n_head: int, - n_embd: int, + q_proj: GPUArray, + k_proj: GPUArray, + v_proj: GPUArray, + o_proj: GPUArray, + config: TransformerConfig, + q_bias: GPUArray | None = None, + k_bias: GPUArray | None = None, + v_bias: GPUArray | None = None, + o_bias: GPUArray | None = None, ): - """Initialize CausalSelfAttention. - - Args: - c_attn_weight: QKV projection weight [3*n_embd, n_embd] - c_attn_bias: QKV projection bias [3*n_embd] - c_proj_weight: Output projection weight [n_embd, n_embd] - c_proj_bias: Output projection bias [n_embd] - n_head: Number of attention heads - n_embd: Embedding dimension - """ - self.c_attn = Linear(c_attn_weight, c_attn_bias) - self.c_proj = Linear(c_proj_weight, c_proj_bias) - self.n_head = n_head - self.n_embd = n_embd - self.head_dim = n_embd // n_head + self.q_proj = Linear(q_proj, q_bias) + self.k_proj = Linear(k_proj, k_bias) + self.v_proj = Linear(v_proj, v_bias) + self.o_proj = Linear(o_proj, o_bias) - def __call__(self, x: GPUArray) -> GPUArray: - """Forward pass with causal self-attention. + self.config = config + self.head_dim = config.head_dim + self.num_heads = config.num_heads + self.num_kv_heads = config.num_kv_heads + self.num_kv_groups = config.num_kv_groups + + # Precompute RoPE if enabled + if config.use_rope: + self._cos, self._sin = precompute_freqs_cis( + self.head_dim, config.max_position_embeddings, config.rope_theta + ) + else: + self._cos, self._sin = None, None + + def __call__( + self, + x: GPUArray, + position_ids: list[int] | None = None, + past_kv: tuple | None = None, + use_cache: bool = False, + ) -> tuple[GPUArray, tuple | None]: + """Forward pass with hybrid CPU/GPU attention. 
Args: - x: Input tensor [seq_len, n_embd] + x: Input tensor [seq_len, hidden_size] + position_ids: Position IDs for RoPE (auto-generated if None) + past_kv: Tuple of (past_k, past_v) numpy arrays + use_cache: Whether to return KV cache Returns: - Output tensor [seq_len, n_embd] + Tuple of (output, present_kv) """ - import numpy as np + seq_len = x.shape[0] + + if position_ids is None: + position_ids = list(range(seq_len)) + + # Hybrid routing: CPU for seq_len=1, GPU for prefill + if seq_len > 1: + return self._forward_gpu(x, position_ids, past_kv, use_cache) + else: + return self._forward_cpu(x, position_ids, past_kv, use_cache) + def _forward_gpu( + self, + x: GPUArray, + position_ids: list[int], + past_kv: tuple | None, + use_cache: bool, + ) -> tuple[GPUArray, tuple | None]: + """GPU path for long sequences (prefill).""" seq_len = x.shape[0] - # QKV projection: [seq_len, n_embd] -> [seq_len, 3*n_embd] - qkv = self.c_attn(x) - qkv_np = qkv.to_numpy() + # Project Q, K, V + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + + # Reshape for multi-head + q = reshape_copy(q, (seq_len, self.num_heads, self.head_dim)) + k = reshape_copy(k, (seq_len, self.num_kv_heads, self.head_dim)) + v = reshape_copy(v, (seq_len, self.num_kv_heads, self.head_dim)) + + # Apply RoPE on GPU + if self.config.use_rope: + cos = from_numpy(self._cos[position_ids].astype(np.float32)) + sin = from_numpy(self._sin[position_ids].astype(np.float32)) + rope_inplace(q, k, cos, sin) + + # Convert to numpy for KV cache + k_np = k.to_numpy() + v_np = v.to_numpy() + + # Concatenate with past KV + if past_kv is not None: + past_k, past_v = past_kv + k_np = np.concatenate([past_k, k_np], axis=0) + v_np = np.concatenate([past_v, v_np], axis=0) - # Split into Q, K, V: each [seq_len, n_embd] - q_np = qkv_np[:, :self.n_embd] - k_np = qkv_np[:, self.n_embd:2*self.n_embd] - v_np = qkv_np[:, 2*self.n_embd:] + present_kv = (k_np.copy(), v_np.copy()) if use_cache else None - # Reshape for multi-head: [seq_len, n_head, head_dim] - q_np = q_np.reshape(seq_len, self.n_head, self.head_dim) - k_np = k_np.reshape(seq_len, self.n_head, self.head_dim) - v_np = v_np.reshape(seq_len, self.n_head, self.head_dim) + # Expand for GQA + if self.num_kv_groups > 1: + k_expanded = np.repeat(k_np, self.num_kv_groups, axis=1) + v_expanded = np.repeat(v_np, self.num_kv_groups, axis=1) + else: + k_expanded = k_np + v_expanded = v_np - # Transpose to [n_head, seq_len, head_dim] for batched attention - q_np = q_np.transpose(1, 0, 2) - k_np = k_np.transpose(1, 0, 2) - v_np = v_np.transpose(1, 0, 2) + # GPU SDPA + q_t = transpose_3d_021(q) + k_t = from_numpy(k_expanded.transpose(1, 0, 2).astype(np.float32)) + v_t = from_numpy(v_expanded.transpose(1, 0, 2).astype(np.float32)) - # Compute attention scores: [n_head, seq_len, seq_len] + attn_output = sdpa_causal(q_t, k_t, v_t) + + # Reshape output + attn_output = transpose_3d_021(attn_output) + attn_output = reshape_copy(attn_output, (seq_len, self.num_heads * self.head_dim)) + + return self.o_proj(attn_output), present_kv + + def _forward_cpu( + self, + x: GPUArray, + position_ids: list[int], + past_kv: tuple | None, + use_cache: bool, + ) -> tuple[GPUArray, tuple | None]: + """CPU path for seq_len=1 (decode) - minimal kernel overhead.""" + seq_len = x.shape[0] + + # Project Q, K, V (GPU matmul, then transfer) + q = self.q_proj(x).to_numpy() + k = self.k_proj(x).to_numpy() + v = self.v_proj(x).to_numpy() + + # Reshape for multi-head + q = q.reshape(seq_len, self.num_heads, self.head_dim) + k = 
k.reshape(seq_len, self.num_kv_heads, self.head_dim) + v = v.reshape(seq_len, self.num_kv_heads, self.head_dim) + + # Apply RoPE (CPU) + if self.config.use_rope: + cos = self._cos[position_ids] + sin = self._sin[position_ids] + q, k = apply_rotary_pos_emb_numpy(q, k, cos, sin) + + # Concatenate with past KV + if past_kv is not None: + past_k, past_v = past_kv + k = np.concatenate([past_k, k], axis=0) + v = np.concatenate([past_v, v], axis=0) + + present_kv = (k.copy(), v.copy()) if use_cache else None + + # Expand for GQA + if self.num_kv_groups > 1: + k_expanded = np.repeat(k, self.num_kv_groups, axis=1) + v_expanded = np.repeat(v, self.num_kv_groups, axis=1) + else: + k_expanded = k + v_expanded = v + + # CPU attention + q = q.transpose(1, 0, 2) + k_expanded = k_expanded.transpose(1, 0, 2) + v_expanded = v_expanded.transpose(1, 0, 2) + + q_len = q.shape[1] + kv_len = k_expanded.shape[1] scale = 1.0 / np.sqrt(self.head_dim) - # Q @ K^T: [n_head, seq_len, head_dim] @ [n_head, head_dim, seq_len] - attn_scores = np.matmul(q_np, k_np.transpose(0, 2, 1)) * scale - # Apply causal mask (lower triangular) - causal_mask = np.triu(np.ones((seq_len, seq_len)), k=1).astype(bool) - attn_scores[:, causal_mask] = -1e9 + attn_scores = np.matmul(q, k_expanded.transpose(0, 2, 1)) * scale - # Softmax over last dimension - attn_scores_max = attn_scores.max(axis=-1, keepdims=True) - attn_exp = np.exp(attn_scores - attn_scores_max) - attn_weights = attn_exp / attn_exp.sum(axis=-1, keepdims=True) + # Causal mask + if self.config.causal: + causal_mask = np.zeros((q_len, kv_len), dtype=bool) + for i in range(q_len): + start_mask = kv_len - q_len + i + 1 + if start_mask < kv_len: + causal_mask[i, start_mask:] = True + attn_scores[:, causal_mask] = -1e9 - # Attention @ V: [n_head, seq_len, head_dim] - attn_output = np.matmul(attn_weights, v_np) + # Softmax + attn_max = attn_scores.max(axis=-1, keepdims=True) + attn_exp = np.exp(attn_scores - attn_max) + attn_weights = attn_exp / attn_exp.sum(axis=-1, keepdims=True) - # Transpose back: [seq_len, n_head, head_dim] + # Attention output + attn_output = np.matmul(attn_weights, v_expanded) attn_output = attn_output.transpose(1, 0, 2) + attn_output = attn_output.reshape(seq_len, self.num_heads * self.head_dim) - # Reshape to [seq_len, n_embd] - attn_output = attn_output.reshape(seq_len, self.n_embd) - - # Output projection + # Output projection (GPU) out = from_numpy(attn_output.astype(np.float32)) - out = self.c_proj(out) + return self.o_proj(out), present_kv + + +# ============================================================================= +# Unified MLP +# ============================================================================= + - return out +class MLP: + """Unified MLP supporting GELU and SwiGLU activations. 
+ + GELU (GPT-2 style): + fc1 -> GELU -> fc2 + + SwiGLU (LLaMA style): + gate_proj -> SiLU -> * up_proj -> down_proj + """ + + def __init__( + self, + config: TransformerConfig, + # GELU path weights + fc1_weight: GPUArray | None = None, + fc1_bias: GPUArray | None = None, + fc2_weight: GPUArray | None = None, + fc2_bias: GPUArray | None = None, + # SwiGLU path weights + gate_proj: GPUArray | None = None, + up_proj: GPUArray | None = None, + down_proj: GPUArray | None = None, + ): + self.config = config + self.activation = config.activation + + if config.activation == "gelu": + if fc1_weight is None or fc2_weight is None: + raise ValueError("GELU MLP requires fc1_weight and fc2_weight") + self.fc1 = Linear(fc1_weight, fc1_bias) + self.fc2 = Linear(fc2_weight, fc2_bias) + else: # silu (SwiGLU) + if gate_proj is None or up_proj is None or down_proj is None: + raise ValueError("SwiGLU MLP requires gate_proj, up_proj, down_proj") + self.gate_proj = Linear(gate_proj) + self.up_proj = Linear(up_proj) + self.down_proj = Linear(down_proj) + + def __call__(self, x: GPUArray) -> GPUArray: + if self.activation == "gelu": + # GELU path: fc1 -> GELU -> fc2 + h = self.fc1(x) + h = gelu(h) + return self.fc2(h) + else: + # SwiGLU path: gate_proj -> SiLU -> * up_proj -> down_proj + gate = silu(self.gate_proj(x)) + up = self.up_proj(x) + return self.down_proj(mul(gate, up)) + + +# ============================================================================= +# Unified TransformerBlock +# ============================================================================= class TransformerBlock: - """Full transformer block with attention and MLP. + """Unified transformer block. - Structure: ln_1 -> attention -> residual -> ln_2 -> mlp -> residual + Structure: + Norm -> Attention -> Residual + Norm -> MLP -> Residual """ def __init__( self, - ln_1_weight: GPUArray, - ln_1_bias: GPUArray, - attn: CausalSelfAttention | None, - ln_2_weight: GPUArray, - ln_2_bias: GPUArray, + attn_norm: Norm, + attn: Attention, + mlp_norm: Norm, mlp: MLP, - eps: float = 1e-5, ): - """Initialize TransformerBlock. 
- - Args: - ln_1_weight: First LayerNorm weight [n_embd] - ln_1_bias: First LayerNorm bias [n_embd] - attn: CausalSelfAttention module (None for MLP-only mode) - ln_2_weight: Second LayerNorm weight [n_embd] - ln_2_bias: Second LayerNorm bias [n_embd] - mlp: MLP block - eps: LayerNorm epsilon - """ - self.ln_1 = LayerNorm(ln_1_weight, ln_1_bias, eps) + self.attn_norm = attn_norm self.attn = attn - self.ln_2 = LayerNorm(ln_2_weight, ln_2_bias, eps) + self.mlp_norm = mlp_norm self.mlp = mlp - def __call__(self, x: GPUArray) -> GPUArray: - """Forward pass: ln_1 -> attn -> residual -> ln_2 -> mlp -> residual + def __call__( + self, + x: GPUArray, + position_ids: list[int] | None = None, + past_kv: tuple | None = None, + use_cache: bool = False, + ) -> tuple[GPUArray, tuple | None]: + # Attention block + residual = x + x = self.attn_norm(x) + attn_out, present_kv = self.attn(x, position_ids, past_kv, use_cache) + x = add(residual, attn_out) - Args: - x: Input tensor [seq_len, n_embd] + # MLP block + residual = x + x = self.mlp_norm(x) + x = self.mlp(x) + x = add(residual, x) - Returns: - Output tensor [seq_len, n_embd] - """ - # Attention block (if available) - if self.attn is not None: - h = self.ln_1(x) - h = self.attn(h) - x = add(x, h) + return x, present_kv - # MLP block - h = self.ln_2(x) - h = self.mlp(h) - return add(x, h) +# ============================================================================= +# Unified CausalTransformerModel +# ============================================================================= -class GPT2Model: - """GPT-2 model with full transformer blocks. - Structure: - - Token embedding - - Position embedding - - Transformer blocks (attention + MLP) - - Final LayerNorm - - LM head (tied to embedding) +class CausalTransformerModel: + """Unified causal transformer model. + + Supports GPT-2 and LLaMA architectures through configuration. """ def __init__( self, - config: GPT2Config, - wte: GPUArray, # Token embedding [vocab_size, n_embd] - wpe: GPUArray, # Position embedding [n_positions, n_embd] + config: TransformerConfig, + embed_tokens: GPUArray, blocks: list[TransformerBlock], - ln_f_weight: GPUArray, - ln_f_bias: GPUArray, + final_norm: Norm, + lm_head: GPUArray | None = None, + position_embed: GPUArray | None = None, # For GPT-2 style ): - """Initialize GPT-2 model. - - Args: - config: Model configuration - wte: Token embedding weights [vocab_size, n_embd] - wpe: Position embedding weights [n_positions, n_embd] - blocks: List of transformer blocks - ln_f_weight: Final LayerNorm weight - ln_f_bias: Final LayerNorm bias - """ self.config = config - self.wte = wte - self.wpe = wpe + self.embed_tokens = embed_tokens self.blocks = blocks - self.ln_f = LayerNorm(ln_f_weight, ln_f_bias, config.layer_norm_eps) + self.final_norm = final_norm + self.lm_head = lm_head + self.position_embed = position_embed - def __call__(self, input_ids: list[int], position_ids: list[int] | None = None) -> GPUArray: + def __call__( + self, + input_ids: list[int], + position_ids: list[int] | None = None, + past_key_values: list[tuple] | None = None, + use_cache: bool = False, + ) -> tuple[GPUArray, list[tuple] | None]: """Forward pass. 
Args: input_ids: Token IDs [seq_len] - position_ids: Optional position IDs [seq_len] + position_ids: Position IDs (auto-generated if None) + past_key_values: List of (k, v) tuples per layer + use_cache: Whether to return KV cache Returns: - Hidden states [seq_len, n_embd] + Tuple of (hidden_states, present_key_values) """ - import numpy as np - seq_len = len(input_ids) if position_ids is None: - position_ids = list(range(seq_len)) - - # Get embeddings by indexing (CPU for MVP) - wte_np = self.wte.to_numpy() - wpe_np = self.wpe.to_numpy() - - # Token embeddings: select rows from wte - token_embeds = wte_np[input_ids] # [seq_len, n_embd] - - # Position embeddings: select rows from wpe - pos_embeds = wpe_np[position_ids] # [seq_len, n_embd] - - # Combine embeddings - hidden = from_numpy((token_embeds + pos_embeds).astype(np.float32)) - - # Apply transformer blocks - for block in self.blocks: - hidden = block(hidden) - - # Final LayerNorm - hidden = self.ln_f(hidden) - - return hidden - - def lm_head(self, hidden: GPUArray) -> GPUArray: - """Compute logits from hidden states. + if past_key_values is not None and past_key_values[0] is not None: + past_len = past_key_values[0][0].shape[0] + position_ids = list(range(past_len, past_len + seq_len)) + else: + position_ids = list(range(seq_len)) + + # Token embeddings + embed_np = self.embed_tokens.to_numpy() + hidden = embed_np[input_ids].astype(np.float32) + + # Add position embeddings (GPT-2 style) + if self.position_embed is not None: + pos_embed_np = self.position_embed.to_numpy() + hidden = hidden + pos_embed_np[position_ids] + + hidden = from_numpy(hidden) + + # Transformer blocks + present_key_values = [] + for i, block in enumerate(self.blocks): + past_kv = past_key_values[i] if past_key_values else None + hidden, present_kv = block(hidden, position_ids, past_kv, use_cache) + present_key_values.append(present_kv) + + # Final norm + hidden = self.final_norm(hidden) + + if use_cache: + return hidden, present_key_values + return hidden, None + + def get_logits(self, hidden: GPUArray) -> GPUArray: + """Compute logits from hidden states.""" + hidden_np = hidden.to_numpy() - Args: - hidden: Hidden states [seq_len, n_embd] + if self.lm_head is not None: + lm_head_np = self.lm_head.to_numpy() + else: + # Tied embeddings + lm_head_np = self.embed_tokens.to_numpy() - Returns: - Logits [seq_len, vocab_size] - """ - # LM head is tied to embedding weights - # logits = hidden @ wte.T - wte_np = self.wte.to_numpy() - hidden_np = hidden.to_numpy() - logits = hidden_np @ wte_np.T - return from_numpy(logits.astype(hidden_np.dtype)) + logits = hidden_np @ lm_head_np.T + return from_numpy(logits.astype(np.float32)) def generate( self, input_ids: list[int], max_new_tokens: int = 20, temperature: float = 1.0, + top_k: int = 50, + top_p: float = 0.9, + eos_token_id: int | None = None, + use_cache: bool = True, ) -> list[int]: """Generate tokens autoregressively. 
Args: input_ids: Initial token IDs - max_new_tokens: Maximum number of new tokens to generate - temperature: Sampling temperature (1.0 = greedy argmax) + max_new_tokens: Maximum new tokens to generate + temperature: Sampling temperature + top_k: Top-k filtering + top_p: Nucleus sampling threshold + eos_token_id: Stop at this token + use_cache: Use KV cache Returns: List of all token IDs (input + generated) """ - import numpy as np - tokens = list(input_ids) + past_key_values = None + + if use_cache: + # Prefill + hidden, past_key_values = self(tokens, use_cache=True) + logits = self.get_logits(hidden) + last_logits = logits.to_numpy()[-1] + next_token = sample_token(last_logits, temperature, top_k, top_p) + tokens.append(next_token) + + if eos_token_id is not None and next_token == eos_token_id: + return tokens - for _ in range(max_new_tokens): - # Truncate to max context length - context = tokens[-self.config.n_positions :] + # Decode + for _ in range(max_new_tokens - 1): + hidden, past_key_values = self( + [next_token], past_key_values=past_key_values, use_cache=True + ) + logits = self.get_logits(hidden) + last_logits = logits.to_numpy()[-1] + next_token = sample_token(last_logits, temperature, top_k, top_p) + tokens.append(next_token) + + if eos_token_id is not None and next_token == eos_token_id: + break + else: + for _ in range(max_new_tokens): + hidden, _ = self(tokens, use_cache=False) + logits = self.get_logits(hidden) + last_logits = logits.to_numpy()[-1] + next_token = sample_token(last_logits, temperature, top_k, top_p) + tokens.append(next_token) + + if eos_token_id is not None and next_token == eos_token_id: + break - # Forward pass - hidden = self(context) + return tokens - # Get logits for last position - logits = self.lm_head(hidden) - logits_np = logits.to_numpy() - last_logits = logits_np[-1] # [vocab_size] - # Apply temperature - if temperature != 1.0: - last_logits = last_logits / temperature +# ============================================================================= +# Legacy Aliases (for backward compatibility) +# ============================================================================= - # Greedy decoding (argmax) - next_token = int(np.argmax(last_logits)) - tokens.append(next_token) - # Stop at EOS (50256 for GPT-2) - if next_token == 50256: - break +# RMSNorm alias +class RMSNorm(Norm): + """RMSNorm layer (legacy alias).""" + + def __init__(self, weight: GPUArray, eps: float = 1e-5): + super().__init__(weight, None, "rmsnorm", eps) + + +# LayerNorm alias +class LayerNorm(Norm): + """LayerNorm layer (legacy alias).""" + + def __init__(self, weight: GPUArray, bias: GPUArray, eps: float = 1e-5): + super().__init__(weight, bias, "layernorm", eps) + + +# Legacy LlamaAttention alias +LlamaAttention = Attention + + +# Legacy LlamaMLP +class LlamaMLP(MLP): + """LLaMA MLP (legacy alias).""" + + def __init__( + self, + gate_proj: GPUArray, + up_proj: GPUArray, + down_proj: GPUArray, + ): + # Create minimal config for SwiGLU + config = TransformerConfig(activation="silu") + super().__init__(config, gate_proj=gate_proj, up_proj=up_proj, down_proj=down_proj) + + +# Legacy LlamaBlock alias +LlamaBlock = TransformerBlock + + +# Legacy LlamaModel alias +class LlamaModel(CausalTransformerModel): + """LLaMA model (legacy alias).""" + + pass + + +# Legacy GPT2Model alias +class GPT2Model(CausalTransformerModel): + """GPT-2 model (legacy alias).""" + + def lm_head(self, hidden: GPUArray) -> GPUArray: + """Legacy lm_head method.""" + return self.get_logits(hidden) + + +# 
============================================================================= +# Legacy Attention Classes (for backward compatibility) +# ============================================================================= + + +class CausalSelfAttention(Attention): + """GPT-2 style causal self-attention (legacy alias).""" + + def __init__( + self, + c_attn_weight: GPUArray, + c_attn_bias: GPUArray | None, + c_proj_weight: GPUArray, + c_proj_bias: GPUArray | None, + n_head: int, + n_embd: int, + ): + # GPT-2 uses combined QKV projection + # Split weights for unified Attention class + # c_attn: [3*n_embd, n_embd] -> Q, K, V each [n_embd, n_embd] + c_attn_np = c_attn_weight.to_numpy() + q_weight = from_numpy(c_attn_np[:n_embd].copy()) + k_weight = from_numpy(c_attn_np[n_embd : 2 * n_embd].copy()) + v_weight = from_numpy(c_attn_np[2 * n_embd :].copy()) + + q_bias, k_bias, v_bias = None, None, None + if c_attn_bias is not None: + c_attn_bias_np = c_attn_bias.to_numpy() + q_bias = from_numpy(c_attn_bias_np[:n_embd].copy()) + k_bias = from_numpy(c_attn_bias_np[n_embd : 2 * n_embd].copy()) + v_bias = from_numpy(c_attn_bias_np[2 * n_embd :].copy()) + + config = TransformerConfig( + hidden_size=n_embd, + num_heads=n_head, + num_kv_heads=n_head, # MHA + norm_type="layernorm", + activation="gelu", + use_rope=False, + causal=True, + ) + + super().__init__( + q_weight, + k_weight, + v_weight, + c_proj_weight, + config, + q_bias, + k_bias, + v_bias, + c_proj_bias, + ) + + +# ============================================================================= +# Legacy MLP Class (for backward compatibility) +# ============================================================================= + + +class _LegacyMLP(MLP): + """GPT-2 style MLP (legacy).""" + + def __init__( + self, + c_fc_weight: GPUArray, + c_fc_bias: GPUArray | None, + c_proj_weight: GPUArray, + c_proj_bias: GPUArray | None, + ): + config = TransformerConfig(activation="gelu") + super().__init__( + config, + fc1_weight=c_fc_weight, + fc1_bias=c_fc_bias, + fc2_weight=c_proj_weight, + fc2_bias=c_proj_bias, + ) - return tokens + +# ============================================================================= +# Safetensors Loaders +# ============================================================================= def load_gpt2_from_safetensors( @@ -467,101 +949,208 @@ def load_gpt2_from_safetensors( """Load GPT-2 model from safetensors file. 
Args: - model_path: Path to model.safetensors file + model_path: Path to model.safetensors config: Model configuration (defaults to GPT-2 small) - load_attention: Whether to load attention weights (default: True) + load_attention: Whether to load attention weights Returns: - GPT2Model instance + GPT2Model instance (CausalTransformerModel) """ from pygpukit.llm import SafeTensorsFile if config is None: config = GPT2Config() + transformer_config = config.to_transformer_config() st = SafeTensorsFile(model_path) - # Helper to load tensor - def load_tensor(name: str) -> GPUArray: + def load_tensor(name: str, do_transpose: bool = False) -> GPUArray: data = st.tensor_bytes(name) info = st.tensor_info(name) - - import numpy as np - - # Determine numpy dtype - dtype_map = { - 0: np.float32, # Float32 - 1: np.float16, # Float16 - 2: np.float32, # BFloat16 -> convert to float32 for now - 3: np.float64, # Float64 - } + dtype_map = {0: np.float32, 1: np.float16, 2: np.float32, 3: np.float64} np_dtype = dtype_map.get(info.dtype, np.float32) - - # Create numpy array from bytes arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) - return from_numpy(arr.copy()) + if do_transpose and arr.ndim == 2: + arr = arr.T + return from_numpy(arr.copy().astype(np.float32)) - def try_load_tensor(name: str) -> GPUArray | None: - """Try to load tensor, return None if not found.""" + def try_load(name: str, do_transpose: bool = False) -> GPUArray | None: if name in st.tensor_names: - return load_tensor(name) + return load_tensor(name, do_transpose) return None - # Load embeddings + # Embeddings wte = load_tensor("wte.weight") wpe = load_tensor("wpe.weight") - # Load blocks + # Blocks blocks = [] for i in range(config.n_layer): prefix = f"h.{i}." - # Check if MLP weights exist - mlp_c_fc_w_name = f"{prefix}mlp.c_fc.weight" - if mlp_c_fc_w_name not in st.tensor_names: - # Skip blocks without MLP (shouldn't happen for GPT-2) - continue - - # LayerNorm 1 (before attention) + # Attention norm ln_1_w = load_tensor(f"{prefix}ln_1.weight") ln_1_b = load_tensor(f"{prefix}ln_1.bias") + attn_norm = Norm(ln_1_w, ln_1_b, "layernorm", config.layer_norm_eps) - # Attention (optional) + # Attention attn = None if load_attention: - attn_c_attn_w_name = f"{prefix}attn.c_attn.weight" - if attn_c_attn_w_name in st.tensor_names: - attn_c_attn_w = load_tensor(attn_c_attn_w_name) - attn_c_attn_b = try_load_tensor(f"{prefix}attn.c_attn.bias") - attn_c_proj_w = load_tensor(f"{prefix}attn.c_proj.weight") - attn_c_proj_b = try_load_tensor(f"{prefix}attn.c_proj.bias") - - attn = CausalSelfAttention( - attn_c_attn_w, attn_c_attn_b, - attn_c_proj_w, attn_c_proj_b, - config.n_head, config.n_embd - ) - - # LayerNorm 2 (before MLP) + c_attn_w = load_tensor(f"{prefix}attn.c_attn.weight", do_transpose=True) + c_attn_b = try_load(f"{prefix}attn.c_attn.bias") + c_proj_w = load_tensor(f"{prefix}attn.c_proj.weight", do_transpose=True) + c_proj_b = try_load(f"{prefix}attn.c_proj.bias") + attn = CausalSelfAttention( + c_attn_w, c_attn_b, c_proj_w, c_proj_b, config.n_head, config.n_embd + ) + + # MLP norm ln_2_w = load_tensor(f"{prefix}ln_2.weight") ln_2_b = load_tensor(f"{prefix}ln_2.bias") + mlp_norm = Norm(ln_2_w, ln_2_b, "layernorm", config.layer_norm_eps) # MLP - mlp_c_fc_w = load_tensor(f"{prefix}mlp.c_fc.weight") - mlp_c_fc_b = try_load_tensor(f"{prefix}mlp.c_fc.bias") - mlp_c_proj_w = load_tensor(f"{prefix}mlp.c_proj.weight") - mlp_c_proj_b = try_load_tensor(f"{prefix}mlp.c_proj.bias") - - mlp = MLP(mlp_c_fc_w, mlp_c_fc_b, mlp_c_proj_w, 
mlp_c_proj_b) - block = TransformerBlock( - ln_1_w, ln_1_b, attn, - ln_2_w, ln_2_b, mlp, - config.layer_norm_eps + c_fc_w = load_tensor(f"{prefix}mlp.c_fc.weight", do_transpose=True) + c_fc_b = try_load(f"{prefix}mlp.c_fc.bias") + c_proj_w = load_tensor(f"{prefix}mlp.c_proj.weight", do_transpose=True) + c_proj_b = try_load(f"{prefix}mlp.c_proj.bias") + mlp = MLP( + transformer_config, + fc1_weight=c_fc_w, + fc1_bias=c_fc_b, + fc2_weight=c_proj_w, + fc2_bias=c_proj_b, ) - blocks.append(block) - # Final LayerNorm + if attn is not None: + block = TransformerBlock(attn_norm, attn, mlp_norm, mlp) + blocks.append(block) + + # Final norm ln_f_w = load_tensor("ln_f.weight") ln_f_b = load_tensor("ln_f.bias") + final_norm = Norm(ln_f_w, ln_f_b, "layernorm", config.layer_norm_eps) + + return GPT2Model(transformer_config, wte, blocks, final_norm, None, wpe) + + +def load_llama_from_safetensors( + model_path: str, + config: LlamaConfig | None = None, +) -> LlamaModel: + """Load Llama model from safetensors file. + + Args: + model_path: Path to model.safetensors + config: Model configuration (auto-detected if None) + + Returns: + LlamaModel instance (CausalTransformerModel) + """ + from pygpukit.llm import SafeTensorsFile + + st = SafeTensorsFile(model_path) + + # Auto-detect config + if config is None: + embed_info = st.tensor_info("model.embed_tokens.weight") + vocab_size = embed_info.shape[0] + hidden_size = embed_info.shape[1] + + num_layers = 0 + while f"model.layers.{num_layers}.self_attn.q_proj.weight" in st.tensor_names: + num_layers += 1 + + q_info = st.tensor_info("model.layers.0.self_attn.q_proj.weight") + k_info = st.tensor_info("model.layers.0.self_attn.k_proj.weight") + gate_info = st.tensor_info("model.layers.0.mlp.gate_proj.weight") + + head_dim = 64 + num_heads = q_info.shape[0] // head_dim + num_kv_heads = k_info.shape[0] // head_dim + + config = LlamaConfig( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=gate_info.shape[0], + num_hidden_layers=num_layers, + num_attention_heads=num_heads, + num_key_value_heads=num_kv_heads, + ) + + transformer_config = config.to_transformer_config() + + def load_tensor(name: str) -> GPUArray: + data = st.tensor_bytes(name) + info = st.tensor_info(name) + if info.dtype == 2: # BFloat16 + arr = np.frombuffer(data, dtype=np.uint16).reshape(info.shape) + arr_f32 = np.empty(arr.shape, dtype=np.float32) + arr_f32.view(np.uint32)[:] = arr.astype(np.uint32) << 16 + return from_numpy(arr_f32) + else: + dtype_map = {0: np.float32, 1: np.float16, 3: np.float64} + np_dtype = dtype_map.get(info.dtype, np.float32) + arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) + return from_numpy(arr.copy().astype(np.float32)) + + # Embeddings + embed_tokens = load_tensor("model.embed_tokens.weight") + + # Blocks + blocks = [] + for i in range(config.num_hidden_layers): + prefix = f"model.layers.{i}." 
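+        # Each decoder layer contributes: RMSNorm -> self-attention (q/k/v/o
+        # projections with RoPE) -> RMSNorm -> SwiGLU MLP (gate/up/down projections).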
+ + # Attention norm + attn_norm = Norm( + load_tensor(f"{prefix}input_layernorm.weight"), + None, + "rmsnorm", + config.rms_norm_eps, + ) + + # Attention + attn = Attention( + load_tensor(f"{prefix}self_attn.q_proj.weight"), + load_tensor(f"{prefix}self_attn.k_proj.weight"), + load_tensor(f"{prefix}self_attn.v_proj.weight"), + load_tensor(f"{prefix}self_attn.o_proj.weight"), + transformer_config, + ) + + # MLP norm + mlp_norm = Norm( + load_tensor(f"{prefix}post_attention_layernorm.weight"), + None, + "rmsnorm", + config.rms_norm_eps, + ) + + # MLP + mlp = MLP( + transformer_config, + gate_proj=load_tensor(f"{prefix}mlp.gate_proj.weight"), + up_proj=load_tensor(f"{prefix}mlp.up_proj.weight"), + down_proj=load_tensor(f"{prefix}mlp.down_proj.weight"), + ) + + block = TransformerBlock(attn_norm, attn, mlp_norm, mlp) + blocks.append(block) + + # Final norm + final_norm = Norm(load_tensor("model.norm.weight"), None, "rmsnorm", config.rms_norm_eps) + + # LM head + lm_head = None + if "lm_head.weight" in st.tensor_names: + lm_head = load_tensor("lm_head.weight") + + return LlamaModel(transformer_config, embed_tokens, blocks, final_norm, lm_head) + + +# ============================================================================= +# Legacy apply_rotary_pos_emb (for backward compatibility) +# ============================================================================= - return GPT2Model(config, wte, wpe, blocks, ln_f_w, ln_f_b) +apply_rotary_pos_emb = apply_rotary_pos_emb_numpy diff --git a/src/pygpukit/ops/__init__.py b/src/pygpukit/ops/__init__.py index 6fd6de4..b08b2a9 100644 --- a/src/pygpukit/ops/__init__.py +++ b/src/pygpukit/ops/__init__.py @@ -3,6 +3,7 @@ from pygpukit.ops.basic import ( add, bias_add_inplace, + concat_axis0, div, exp, gelu, @@ -14,10 +15,17 @@ mean, mul, relu, + repeat_interleave_axis1, + reshape_copy, + rmsnorm, + rope_inplace, + sdpa_causal, + silu, softmax, sub, sum, transpose, + transpose_3d_021, ) __all__ = [ @@ -29,8 +37,10 @@ "log", "relu", "gelu", + "silu", "softmax", "layernorm", + "rmsnorm", "matmul", "sum", "mean", @@ -38,4 +48,10 @@ "transpose", "bias_add_inplace", "linear_bias_gelu", + "rope_inplace", + "sdpa_causal", + "concat_axis0", + "repeat_interleave_axis1", + "transpose_3d_021", + "reshape_copy", ] diff --git a/src/pygpukit/ops/basic.py b/src/pygpukit/ops/basic.py index c82483a..c0d59c2 100644 --- a/src/pygpukit/ops/basic.py +++ b/src/pygpukit/ops/basic.py @@ -824,6 +824,82 @@ def _bias_add_inplace_native(output: GPUArray, bias: GPUArray) -> None: # ============================================================================ +def rmsnorm( + input: GPUArray, + gamma: GPUArray, + eps: float = 1e-5, +) -> GPUArray: + """RMS Normalization (Root Mean Square Normalization). + + Computes: x / sqrt(mean(x^2) + eps) * gamma + + Simpler than LayerNorm (no mean subtraction, no beta). + Used in Llama and other modern LLMs. + + Args: + input: Input array of shape [batch, features]. + gamma: Scale parameter of shape [features]. + eps: Small epsilon for numerical stability. + + Returns: + A new GPUArray containing the normalized output. + + Raises: + ValueError: If shapes or dtypes don't match. 
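+
+    Example (illustrative; arbitrary shapes):
+        >>> x = from_numpy(np.random.randn(4, 8).astype(np.float32))
+        >>> g = from_numpy(np.ones(8, dtype=np.float32))
+        >>> y = rmsnorm(x, g)  # shape (4, 8), same dtype as x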
+ """ + _validate_float_dtype(input, "rmsnorm") + + if input.ndim != 2: + raise ValueError(f"rmsnorm expects 2D input [batch, features], got {input.ndim}D") + if gamma.ndim != 1: + raise ValueError("rmsnorm expects 1D gamma") + if input.dtype != gamma.dtype: + raise ValueError("rmsnorm: all inputs must have same dtype") + + features = input.shape[1] + if gamma.shape[0] != features: + raise ValueError(f"rmsnorm: gamma size {gamma.shape[0]} must match features {features}") + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _rmsnorm_native(input, gamma, eps) + else: + return _rmsnorm_cpu(input, gamma, eps) + + +def _rmsnorm_cpu( + input: GPUArray, + gamma: GPUArray, + eps: float, +) -> GPUArray: + """CPU implementation of rmsnorm.""" + x = input.to_numpy() + g = gamma.to_numpy() + + # RMS = sqrt(mean(x^2) + eps) + rms = np.sqrt(np.mean(x**2, axis=1, keepdims=True) + eps) + + # Normalize and scale + result = (x / rms) * g + return from_numpy(result) + + +def _rmsnorm_native( + input: GPUArray, + gamma: GPUArray, + eps: float, +) -> GPUArray: + """Native C++ CUDA implementation of rmsnorm (zero-copy).""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + input_native = input._get_native() + gamma_native = gamma._get_native() + c_native = native.rmsnorm(input_native, gamma_native, eps) + return GPUArray._wrap_native(c_native) + + def linear_bias_gelu( input: GPUArray, weight: GPUArray, @@ -924,3 +1000,457 @@ def _linear_bias_gelu_native( bias_native = bias._get_native() c_native = native.linear_bias_gelu(input_native, weight_native, bias_native) return GPUArray._wrap_native(c_native) + + +# ============================================================================ +# Additional Neural Network Operations +# ============================================================================ + + +def silu(a: GPUArray) -> GPUArray: + """SiLU (Swish) activation: y = x * sigmoid(x). + + Used in Llama and other modern LLMs as the activation in MLP layers. + + Args: + a: Input array. + + Returns: + A new GPUArray containing the SiLU-activated values. + + Raises: + ValueError: If dtype is not a float type. + """ + _validate_float_dtype(a, "silu") + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _silu_native(a) + else: + return _silu_cpu(a) + + +def _silu_cpu(a: GPUArray) -> GPUArray: + """CPU implementation of SiLU.""" + x = a.to_numpy() + # SiLU = x * sigmoid(x) = x / (1 + exp(-x)) + result = x / (1.0 + np.exp(-x)) + return from_numpy(result) + + +def _silu_native(a: GPUArray) -> GPUArray: + """Native C++ CUDA implementation of SiLU (zero-copy).""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + a_native = a._get_native() + c_native = native.silu(a_native) + return GPUArray._wrap_native(c_native) + + +def sdpa_causal( + Q: GPUArray, + K: GPUArray, + V: GPUArray, + scale: float = 0.0, +) -> GPUArray: + """Scaled Dot-Product Attention with causal mask. + + Computes attention with automatic causal masking for autoregressive + sequence generation. This is the core attention operation used in + transformer models. + + Algorithm: + scores = Q @ K^T / scale + scores = apply_causal_mask(scores) + weights = softmax(scores) + output = weights @ V + + Args: + Q: Query tensor of shape [n_heads, q_len, head_dim]. + K: Key tensor of shape [n_heads, kv_len, head_dim]. + V: Value tensor of shape [n_heads, kv_len, head_dim]. 
+ scale: Scaling factor (typically 1/sqrt(head_dim)). + If <= 0, computed automatically from head_dim. + + Returns: + Output tensor of shape [n_heads, q_len, head_dim]. + + Raises: + ValueError: If shapes or dtypes don't match. + + Note: + For KV cache usage during inference, kv_len >= q_len. + The causal mask ensures query at position i can only attend + to key positions 0 to (kv_len - q_len + i). + """ + _validate_float_dtype(Q, "sdpa_causal") + + if Q.ndim != 3 or K.ndim != 3 or V.ndim != 3: + raise ValueError("sdpa_causal expects 3D inputs [n_heads, seq_len, head_dim]") + if Q.dtype != K.dtype or Q.dtype != V.dtype: + raise ValueError("sdpa_causal: Q, K, V must have same dtype") + + n_heads, q_len, head_dim = Q.shape + kv_len = K.shape[1] + + if K.shape[0] != n_heads or V.shape[0] != n_heads: + raise ValueError("sdpa_causal: n_heads mismatch") + if K.shape[2] != head_dim or V.shape[2] != head_dim: + raise ValueError("sdpa_causal: head_dim mismatch") + if K.shape[1] != V.shape[1]: + raise ValueError("sdpa_causal: K and V seq_len mismatch") + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _sdpa_causal_native(Q, K, V, scale) + else: + return _sdpa_causal_cpu(Q, K, V, scale) + + +def _sdpa_causal_cpu( + Q: GPUArray, + K: GPUArray, + V: GPUArray, + scale: float, +) -> GPUArray: + """CPU implementation of SDPA with causal mask.""" + q = Q.to_numpy() + k = K.to_numpy() + v = V.to_numpy() + + n_heads, q_len, head_dim = q.shape + kv_len = k.shape[1] + + if scale <= 0: + scale = 1.0 / np.sqrt(head_dim) + + # scores: [n_heads, q_len, kv_len] + scores = np.matmul(q, k.transpose(0, 2, 1)) * scale + + # Create causal mask + causal_offset = kv_len - q_len + for i in range(q_len): + max_attend = causal_offset + i + 1 + if max_attend < kv_len: + scores[:, i, max_attend:] = -np.inf + + # Softmax over last dimension + scores_max = scores.max(axis=-1, keepdims=True) + exp_scores = np.exp(scores - scores_max) + weights = exp_scores / exp_scores.sum(axis=-1, keepdims=True) + + # output: [n_heads, q_len, head_dim] + output = np.matmul(weights, v) + + return from_numpy(output.astype(q.dtype)) + + +def _sdpa_causal_native( + Q: GPUArray, + K: GPUArray, + V: GPUArray, + scale: float, +) -> GPUArray: + """Native C++ CUDA implementation of SDPA with causal mask.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + q_native = Q._get_native() + k_native = K._get_native() + v_native = V._get_native() + c_native = native.sdpa_causal(q_native, k_native, v_native, scale) + return GPUArray._wrap_native(c_native) + + +def rope_inplace( + q: GPUArray, + k: GPUArray, + cos: GPUArray, + sin: GPUArray, +) -> None: + """Apply Rotary Position Embedding (RoPE) to Q and K tensors in-place. + + Args: + q: Query tensor of shape [seq_len, n_heads_q, head_dim] (modified in-place). + k: Key tensor of shape [seq_len, n_heads_k, head_dim] (modified in-place). + cos: Precomputed cosine of shape [seq_len, head_dim]. + sin: Precomputed sine of shape [seq_len, head_dim]. + + Note: + This operation modifies q and k in-place. + Works with GQA (n_heads_k can be different from n_heads_q). 
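+
+    Example (illustrative; with cos=1 and sin=0 the rotation is the identity):
+        >>> q = from_numpy(np.random.randn(4, 8, 64).astype(np.float32))
+        >>> k = from_numpy(np.random.randn(4, 2, 64).astype(np.float32))  # GQA: 2 KV heads
+        >>> cos = from_numpy(np.ones((4, 64), dtype=np.float32))
+        >>> sin = from_numpy(np.zeros((4, 64), dtype=np.float32))
+        >>> rope_inplace(q, k, cos, sin)  # q and k are modified in place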
+ """ + _validate_float_dtype(q, "rope_inplace") + + if q.ndim != 3 or k.ndim != 3: + raise ValueError("rope_inplace expects 3D q, k [seq_len, n_heads, head_dim]") + if cos.ndim != 2 or sin.ndim != 2: + raise ValueError("rope_inplace expects 2D cos, sin [seq_len, head_dim]") + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + _rope_inplace_native(q, k, cos, sin) + else: + _rope_inplace_cpu(q, k, cos, sin) + + +def _rope_inplace_cpu( + q: GPUArray, + k: GPUArray, + cos: GPUArray, + sin: GPUArray, +) -> None: + """CPU implementation of rope_inplace.""" + import numpy as np + + q_np = q.to_numpy() + k_np = k.to_numpy() + cos_np = cos.to_numpy() + sin_np = sin.to_numpy() + + seq_len, n_heads_q, head_dim = q_np.shape + n_heads_k = k_np.shape[1] + half_dim = head_dim // 2 + + # Apply RoPE to Q + for s in range(seq_len): + c = cos_np[s, :half_dim] + sn = sin_np[s, :half_dim] + for h in range(n_heads_q): + q0 = q_np[s, h, :half_dim].copy() + q1 = q_np[s, h, half_dim:].copy() + q_np[s, h, :half_dim] = q0 * c - q1 * sn + q_np[s, h, half_dim:] = q1 * c + q0 * sn + + # Apply RoPE to K + for s in range(seq_len): + c = cos_np[s, :half_dim] + sn = sin_np[s, :half_dim] + for h in range(n_heads_k): + k0 = k_np[s, h, :half_dim].copy() + k1 = k_np[s, h, half_dim:].copy() + k_np[s, h, :half_dim] = k0 * c - k1 * sn + k_np[s, h, half_dim:] = k1 * c + k0 * sn + + # Update the GPUArray data in-place + q._data = from_numpy(q_np)._data + k._data = from_numpy(k_np)._data + + +def _rope_inplace_native( + q: GPUArray, + k: GPUArray, + cos: GPUArray, + sin: GPUArray, +) -> None: + """Native C++ CUDA implementation of rope_inplace.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + q_native = q._get_native() + k_native = k._get_native() + cos_native = cos._get_native() + sin_native = sin._get_native() + native.rope_inplace(q_native, k_native, cos_native, sin_native) + + +# ============================================================================ +# Tensor Manipulation Operations +# ============================================================================ + + +def concat_axis0(a: GPUArray, b: GPUArray) -> GPUArray: + """Concatenate two tensors along axis 0. + + Args: + a: First tensor of shape [dim0_a, ...]. + b: Second tensor of shape [dim0_b, ...]. + + Returns: + Concatenated tensor of shape [dim0_a + dim0_b, ...]. + + Raises: + ValueError: If shapes don't match along non-concatenation axes. 
+ """ + _validate_same_dtype(a, b, "concat_axis0") + + if a.ndim != b.ndim: + raise ValueError(f"concat_axis0: dimension mismatch ({a.ndim}D vs {b.ndim}D)") + + for i in range(1, a.ndim): + if a.shape[i] != b.shape[i]: + raise ValueError( + f"concat_axis0: shape mismatch at axis {i} ({a.shape[i]} vs {b.shape[i]})" + ) + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _concat_axis0_native(a, b) + else: + return _concat_axis0_cpu(a, b) + + +def _concat_axis0_cpu(a: GPUArray, b: GPUArray) -> GPUArray: + """CPU implementation of concat_axis0.""" + a_np = a.to_numpy() + b_np = b.to_numpy() + result = np.concatenate([a_np, b_np], axis=0) + return from_numpy(result) + + +def _concat_axis0_native(a: GPUArray, b: GPUArray) -> GPUArray: + """Native C++ CUDA implementation of concat_axis0.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + a_native = a._get_native() + b_native = b._get_native() + c_native = native.concat_axis0(a_native, b_native) + return GPUArray._wrap_native(c_native) + + +def repeat_interleave_axis1(input: GPUArray, repeats: int) -> GPUArray: + """Repeat tensor elements along axis 1 (interleaved). + + For GQA: expands [n_heads_kv, seq_len, head_dim] to [n_heads, seq_len, head_dim] + by repeating each KV head `repeats` times. + + Args: + input: Input tensor of shape [dim0, dim1, dim2]. + repeats: Number of times to repeat each element along axis 1. + + Returns: + Tensor of shape [dim0, dim1 * repeats, dim2]. + """ + _validate_float_dtype(input, "repeat_interleave_axis1") + + if input.ndim != 3: + raise ValueError( + f"repeat_interleave_axis1 expects 3D input [d0, d1, d2], got {input.ndim}D" + ) + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _repeat_interleave_axis1_native(input, repeats) + else: + return _repeat_interleave_axis1_cpu(input, repeats) + + +def _repeat_interleave_axis1_cpu(input: GPUArray, repeats: int) -> GPUArray: + """CPU implementation of repeat_interleave_axis1.""" + x = input.to_numpy() + # np.repeat with axis=1 gives interleaved repeat + result = np.repeat(x, repeats, axis=1) + return from_numpy(result) + + +def _repeat_interleave_axis1_native(input: GPUArray, repeats: int) -> GPUArray: + """Native C++ CUDA implementation of repeat_interleave_axis1.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + input_native = input._get_native() + c_native = native.repeat_interleave_axis1(input_native, repeats) + return GPUArray._wrap_native(c_native) + + +def transpose_3d_021(input: GPUArray) -> GPUArray: + """Transpose 3D tensor: [d0, d1, d2] -> [d1, d0, d2]. + + Swaps axes 0 and 1 while keeping axis 2 in place. + Useful for converting [seq_len, n_heads, head_dim] to [n_heads, seq_len, head_dim]. + + Args: + input: 3D tensor to transpose. + + Returns: + Transposed tensor with axes 0 and 1 swapped. 
+ """ + _validate_float_dtype(input, "transpose_3d_021") + + if input.ndim != 3: + raise ValueError(f"transpose_3d_021 expects 3D input, got {input.ndim}D") + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _transpose_3d_021_native(input) + else: + return _transpose_3d_021_cpu(input) + + +def _transpose_3d_021_cpu(input: GPUArray) -> GPUArray: + """CPU implementation of transpose_3d_021.""" + x = input.to_numpy() + result = np.transpose(x, (1, 0, 2)).copy() + return from_numpy(result) + + +def _transpose_3d_021_native(input: GPUArray) -> GPUArray: + """Native C++ CUDA implementation of transpose_3d_021.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + input_native = input._get_native() + c_native = native.transpose_3d_021(input_native) + return GPUArray._wrap_native(c_native) + + +def reshape_copy(input: GPUArray, new_shape: tuple[int, ...]) -> GPUArray: + """Reshape tensor with copy (ensures contiguous output). + + Args: + input: Input tensor to reshape. + new_shape: Target shape (total elements must match). + + Returns: + Reshaped tensor with new shape. + + Raises: + ValueError: If total element count doesn't match. + """ + _validate_float_dtype(input, "reshape_copy") + + # Verify total size + input_size = 1 + for dim in input.shape: + input_size *= dim + + output_size = 1 + for dim in new_shape: + output_size *= dim + + if input_size != output_size: + raise ValueError( + f"reshape_copy: total size mismatch ({input_size} vs {output_size})" + ) + + backend = get_backend() + + if isinstance(backend, NativeBackend) and backend.is_available(): + return _reshape_copy_native(input, new_shape) + else: + return _reshape_copy_cpu(input, new_shape) + + +def _reshape_copy_cpu(input: GPUArray, new_shape: tuple[int, ...]) -> GPUArray: + """CPU implementation of reshape_copy.""" + x = input.to_numpy() + result = x.reshape(new_shape).copy() + return from_numpy(result) + + +def _reshape_copy_native(input: GPUArray, new_shape: tuple[int, ...]) -> GPUArray: + """Native C++ CUDA implementation of reshape_copy.""" + from pygpukit.core.backend import get_native_module + + native = get_native_module() + input_native = input._get_native() + c_native = native.reshape_copy(input_native, list(new_shape)) + return GPUArray._wrap_native(c_native) From a5bd739fd6698899debed8b9167f78b25a3c26e8 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 12:09:38 +0900 Subject: [PATCH 04/14] feat(llm): Add ShardedSafeTensorsFile and Qwen3 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ShardedSafeTensorsFile: lazy-load sharded safetensors models - load_safetensors() auto-detects .index.json - Opens shards on-demand, not all at once - QK Norm support in Attention class - For Qwen3 style models with Q/K normalization - Reshape 3D->2D for norm, then back to 3D - Qwen3Config and load_qwen3_from_safetensors() - head_dim=128, rope_theta=1e6 - Auto-detect config from tensor shapes Note: Qwen3-8B requires ~32GB VRAM at FP32, needs FP16 support 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/llm/__init__.py | 182 +++++++++++++++++++++++++++--- src/pygpukit/llm/model.py | 207 +++++++++++++++++++++++++++++++++++ 2 files changed, 372 insertions(+), 17 deletions(-) diff --git a/src/pygpukit/llm/__init__.py b/src/pygpukit/llm/__init__.py index afce946..f90f04a 100644 --- a/src/pygpukit/llm/__init__.py +++ 
b/src/pygpukit/llm/__init__.py @@ -209,16 +209,157 @@ def __repr__(self) -> str: return f"SafeTensorsFile(num_tensors={self.num_tensors}, file_size={self.file_size})" -def load_safetensors(path: str) -> SafeTensorsFile: - """Load a safetensors file. +class ShardedSafeTensorsFile: + """Sharded SafeTensors file loader. + + Handles models split across multiple .safetensors files with an index.json. + Lazily opens shards on demand to minimize memory usage. + + Example: + >>> st = ShardedSafeTensorsFile("model.safetensors.index.json") + >>> print(st.tensor_names[:5]) + ['lm_head.weight', 'model.embed_tokens.weight', ...] + >>> info = st.tensor_info('model.embed_tokens.weight') + >>> data = st.tensor_bytes('model.embed_tokens.weight') + """ + + def __init__(self, index_json_path: str): + """Open a sharded safetensors model. + + Args: + index_json_path: Path to model.safetensors.index.json + """ + import json + from pathlib import Path + + self._index_path = Path(index_json_path) + self._base_dir = self._index_path.parent + + with open(index_json_path, encoding="utf-8") as f: + index = json.load(f) + + # weight_map: { tensor_name: shard_filename } + self._weight_map: dict[str, str] = index.get("weight_map", {}) + self._metadata = index.get("metadata", {}) + + # Lazy-loaded shard files + self._shards: dict[str, SafeTensorsFile] = {} + + # Unique shard files + self._shard_files = list(set(self._weight_map.values())) + + def _get_shard(self, shard_file: str) -> SafeTensorsFile: + """Lazily open a shard file.""" + if shard_file not in self._shards: + shard_path = self._base_dir / shard_file + self._shards[shard_file] = SafeTensorsFile(str(shard_path)) + return self._shards[shard_file] + + @property + def tensor_names(self) -> list[str]: + """Get list of all tensor names across all shards.""" + return list(self._weight_map.keys()) + + @property + def file_size(self) -> int: + """Total file size across all shards (lazy, opens all shards).""" + total = 0 + for shard_file in self._shard_files: + total += self._get_shard(shard_file).file_size + return total + + @property + def num_tensors(self) -> int: + """Number of tensors across all shards.""" + return len(self._weight_map) + + def tensor_info(self, name: str) -> TensorInfo: + """Get metadata for a tensor by name. + + Args: + name: Tensor name + + Returns: + TensorInfo with dtype, shape, offset, and size + + Raises: + KeyError: If tensor name not found + """ + if name not in self._weight_map: + raise KeyError(f"Tensor '{name}' not found") + shard_file = self._weight_map[name] + return self._get_shard(shard_file).tensor_info(name) + + def tensor_bytes(self, name: str) -> bytes: + """Get raw tensor data as bytes. + + Args: + name: Tensor name + + Returns: + Raw bytes of the tensor data + + Raises: + KeyError: If tensor name not found + """ + if name not in self._weight_map: + raise KeyError(f"Tensor '{name}' not found") + shard_file = self._weight_map[name] + return self._get_shard(shard_file).tensor_bytes(name) + + def tensor_as_f32(self, name: str): + """Get tensor data as numpy float32 array. 
+ + Args: + name: Tensor name + + Returns: + 1D numpy array of float32 values + + Raises: + KeyError: If tensor name not found + ValueError: If tensor dtype is not Float32 + """ + if name not in self._weight_map: + raise KeyError(f"Tensor '{name}' not found") + shard_file = self._weight_map[name] + return self._get_shard(shard_file).tensor_as_f32(name) + + def __len__(self) -> int: + return self.num_tensors + + def __contains__(self, name: str) -> bool: + return name in self._weight_map + + def __repr__(self) -> str: + return ( + f"ShardedSafeTensorsFile(num_tensors={self.num_tensors}, " + f"num_shards={len(self._shard_files)})" + ) + + +def load_safetensors(path: str) -> SafeTensorsFile | ShardedSafeTensorsFile: + """Load a safetensors file (single or sharded). + + Automatically detects sharded models by .index.json extension. Args: - path: Path to the .safetensors file + path: Path to .safetensors file or .safetensors.index.json Returns: - SafeTensorsFile object for accessing tensor data + SafeTensorsFile or ShardedSafeTensorsFile for accessing tensor data + + Example: + # Single file + st = load_safetensors("model.safetensors") + + # Sharded model + st = load_safetensors("model.safetensors.index.json") """ - return SafeTensorsFile(path) + if path.endswith(".index.json"): + return ShardedSafeTensorsFile(path) + else: + return SafeTensorsFile(path) class Tokenizer: @@ -330,28 +471,31 @@ def __repr__(self) -> str: from pygpukit.llm.model import ( # noqa: E402 - # Unified Transformer classes - TransformerConfig, - CausalTransformerModel, - Attention, MLP, - Norm, - TransformerBlock, - Linear, + Attention, + CausalSelfAttention, + CausalTransformerModel, # Legacy GPT-2 aliases GPT2Config, GPT2Model, - CausalSelfAttention, LayerNorm, - load_gpt2_from_safetensors, - # Legacy Llama aliases - LlamaConfig, - LlamaModel, + Linear, LlamaAttention, LlamaBlock, + # Legacy Llama aliases + LlamaConfig, LlamaMLP, + LlamaModel, + Norm, + # Qwen3 + Qwen3Config, RMSNorm, + TransformerBlock, + # Unified Transformer classes + TransformerConfig, + load_gpt2_from_safetensors, load_llama_from_safetensors, + load_qwen3_from_safetensors, ) __all__ = [ @@ -359,6 +503,7 @@ def __repr__(self) -> str: "Dtype", "TensorInfo", "SafeTensorsFile", + "ShardedSafeTensorsFile", "load_safetensors", # Tokenizer "Tokenizer", @@ -384,4 +529,7 @@ def __repr__(self) -> str: "LlamaMLP", "RMSNorm", "load_llama_from_safetensors", + # Qwen3 + "Qwen3Config", + "load_qwen3_from_safetensors", ] diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py index 99b2367..4d71fee 100644 --- a/src/pygpukit/llm/model.py +++ b/src/pygpukit/llm/model.py @@ -338,6 +338,7 @@ class Attention: - Multi-Head Attention (MHA): num_kv_heads == num_heads - Grouped Query Attention (GQA): num_kv_heads < num_heads - RoPE: enabled via config.use_rope + - QK Norm: optional normalization of Q and K (Qwen3 style) - Hybrid execution: CPU for seq_len=1, GPU for longer sequences """ @@ -352,12 +353,18 @@ def __init__( k_bias: GPUArray | None = None, v_bias: GPUArray | None = None, o_bias: GPUArray | None = None, + q_norm: Norm | None = None, + k_norm: Norm | None = None, ): self.q_proj = Linear(q_proj, q_bias) self.k_proj = Linear(k_proj, k_bias) self.v_proj = Linear(v_proj, v_bias) self.o_proj = Linear(o_proj, o_bias) + # QK Norm (Qwen3 style) + self.q_norm = q_norm + self.k_norm = k_norm + self.config = config self.head_dim = config.head_dim self.num_heads = config.num_heads @@ -421,6 +428,19 @@ def _forward_gpu( k = reshape_copy(k, (seq_len, 
self.num_kv_heads, self.head_dim)) v = reshape_copy(v, (seq_len, self.num_kv_heads, self.head_dim)) + # QK Norm (Qwen3 style) - applied per head before RoPE + # Reshape to 2D for norm, then back to 3D + if self.q_norm is not None: + q_shape = (seq_len, self.num_heads, self.head_dim) + q_2d = reshape_copy(q, (seq_len * self.num_heads, self.head_dim)) + q_2d = self.q_norm(q_2d) + q = reshape_copy(q_2d, q_shape) + if self.k_norm is not None: + k_shape = (seq_len, self.num_kv_heads, self.head_dim) + k_2d = reshape_copy(k, (seq_len * self.num_kv_heads, self.head_dim)) + k_2d = self.k_norm(k_2d) + k = reshape_copy(k_2d, k_shape) + # Apply RoPE on GPU if self.config.use_rope: cos = from_numpy(self._cos[position_ids].astype(np.float32)) @@ -480,6 +500,19 @@ def _forward_cpu( k = k.reshape(seq_len, self.num_kv_heads, self.head_dim) v = v.reshape(seq_len, self.num_kv_heads, self.head_dim) + # QK Norm (Qwen3 style) - applied per head before RoPE + # Reshape to 2D for norm, then back to 3D + if self.q_norm is not None: + q_shape = q.shape + q_2d = q.reshape(seq_len * self.num_heads, self.head_dim) + q_2d = self.q_norm(from_numpy(q_2d.astype(np.float32))).to_numpy() + q = q_2d.reshape(q_shape) + if self.k_norm is not None: + k_shape = k.shape + k_2d = k.reshape(seq_len * self.num_kv_heads, self.head_dim) + k_2d = self.k_norm(from_numpy(k_2d.astype(np.float32))).to_numpy() + k = k_2d.reshape(k_shape) + # Apply RoPE (CPU) if self.config.use_rope: cos = self._cos[position_ids] @@ -1149,6 +1182,180 @@ def load_tensor(name: str) -> GPUArray: return LlamaModel(transformer_config, embed_tokens, blocks, final_norm, lm_head) +# ============================================================================= +# Qwen3 Configuration and Loader +# ============================================================================= + + +@dataclass +class Qwen3Config: + """Configuration for Qwen3 model.""" + + vocab_size: int = 151936 + hidden_size: int = 4096 + intermediate_size: int = 12288 + num_hidden_layers: int = 36 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + head_dim: int = 128 # Qwen3 uses 128, not hidden_size // num_heads + max_position_embeddings: int = 40960 + rms_norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + + def to_transformer_config(self) -> TransformerConfig: + """Convert to unified TransformerConfig.""" + return TransformerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + num_kv_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + norm_type="rmsnorm", + activation="silu", + use_rope=True, + causal=True, + max_position_embeddings=self.max_position_embeddings, + norm_eps=self.rms_norm_eps, + rope_theta=self.rope_theta, + ) + + +def load_qwen3_from_safetensors( + model_path: str, + config: Qwen3Config | None = None, +) -> CausalTransformerModel: + """Load Qwen3 model from safetensors file (single or sharded). 
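+
+    Accepts either a single .safetensors file or a sharded checkpoint's
+    model.safetensors.index.json; load_safetensors() selects the reader automatically.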
+ + Args: + model_path: Path to model.safetensors or model.safetensors.index.json + config: Model configuration (auto-detected if None) + + Returns: + CausalTransformerModel instance + """ + from pygpukit.llm import load_safetensors + + st = load_safetensors(model_path) + + # Auto-detect config + if config is None: + embed_info = st.tensor_info("model.embed_tokens.weight") + vocab_size = embed_info.shape[0] + hidden_size = embed_info.shape[1] + + num_layers = 0 + while f"model.layers.{num_layers}.self_attn.q_proj.weight" in st.tensor_names: + num_layers += 1 + + q_info = st.tensor_info("model.layers.0.self_attn.q_proj.weight") + k_info = st.tensor_info("model.layers.0.self_attn.k_proj.weight") + gate_info = st.tensor_info("model.layers.0.mlp.gate_proj.weight") + + # Qwen3 uses explicit head_dim from q_norm shape + q_norm_info = st.tensor_info("model.layers.0.self_attn.q_norm.weight") + head_dim = q_norm_info.shape[0] + + num_heads = q_info.shape[0] // head_dim + num_kv_heads = k_info.shape[0] // head_dim + + config = Qwen3Config( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=gate_info.shape[0], + num_hidden_layers=num_layers, + num_attention_heads=num_heads, + num_key_value_heads=num_kv_heads, + head_dim=head_dim, + ) + + transformer_config = config.to_transformer_config() + + def load_tensor(name: str) -> GPUArray: + data = st.tensor_bytes(name) + info = st.tensor_info(name) + if info.dtype == 2: # BFloat16 + arr = np.frombuffer(data, dtype=np.uint16).reshape(info.shape) + arr_f32 = np.empty(arr.shape, dtype=np.float32) + arr_f32.view(np.uint32)[:] = arr.astype(np.uint32) << 16 + return from_numpy(arr_f32) + else: + dtype_map = {0: np.float32, 1: np.float16, 3: np.float64} + np_dtype = dtype_map.get(info.dtype, np.float32) + arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) + return from_numpy(arr.copy().astype(np.float32)) + + # Embeddings + embed_tokens = load_tensor("model.embed_tokens.weight") + + # Blocks + blocks = [] + for i in range(config.num_hidden_layers): + prefix = f"model.layers.{i}." 
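+        # Qwen3 layers mirror the LLaMA layout (RMSNorm, SwiGLU, RoPE) but add
+        # per-head q_norm / k_norm weights that are applied to Q and K before RoPE.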
+ + # Attention norm + attn_norm = Norm( + load_tensor(f"{prefix}input_layernorm.weight"), + None, + "rmsnorm", + config.rms_norm_eps, + ) + + # QK Norm (Qwen3 specific) + q_norm = Norm( + load_tensor(f"{prefix}self_attn.q_norm.weight"), + None, + "rmsnorm", + config.rms_norm_eps, + ) + k_norm = Norm( + load_tensor(f"{prefix}self_attn.k_norm.weight"), + None, + "rmsnorm", + config.rms_norm_eps, + ) + + # Attention with QK Norm + attn = Attention( + load_tensor(f"{prefix}self_attn.q_proj.weight"), + load_tensor(f"{prefix}self_attn.k_proj.weight"), + load_tensor(f"{prefix}self_attn.v_proj.weight"), + load_tensor(f"{prefix}self_attn.o_proj.weight"), + transformer_config, + q_norm=q_norm, + k_norm=k_norm, + ) + + # MLP norm + mlp_norm = Norm( + load_tensor(f"{prefix}post_attention_layernorm.weight"), + None, + "rmsnorm", + config.rms_norm_eps, + ) + + # MLP + mlp = MLP( + transformer_config, + gate_proj=load_tensor(f"{prefix}mlp.gate_proj.weight"), + up_proj=load_tensor(f"{prefix}mlp.up_proj.weight"), + down_proj=load_tensor(f"{prefix}mlp.down_proj.weight"), + ) + + block = TransformerBlock(attn_norm, attn, mlp_norm, mlp) + blocks.append(block) + + # Final norm + final_norm = Norm(load_tensor("model.norm.weight"), None, "rmsnorm", config.rms_norm_eps) + + # LM head + lm_head = None + if "lm_head.weight" in st.tensor_names: + lm_head = load_tensor("lm_head.weight") + + return CausalTransformerModel(transformer_config, embed_tokens, blocks, final_norm, lm_head) + + # ============================================================================= # Legacy apply_rotary_pos_emb (for backward compatibility) # ============================================================================= From 74b83244673e83023e39d46d3d6285402b04f46a Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 12:31:54 +0900 Subject: [PATCH 05/14] feat(llm): Add FP16 inference support for Qwen3-8B MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add dtype parameter to load_qwen3_from_safetensors (float32/float16) - Fix RoPE to handle FP16: convert to FP32 for computation, back to FP16 - Fix SDPA dtype preservation for KV cache - Fix _forward_cpu QK Norm dtype preservation - Fix o_proj output dtype in _forward_cpu - Add FP16 fallback for reshape_copy (native only supports FP32) - Add FP16 fallback for transpose_3d_021 (native only supports FP32) - Fix unused variable kv_len in sdpa_causal Tested: Qwen3-8B (16.4GB) FP16 inference fits in 24GB VRAM - Forward pass: 2297ms for 1 token - Generation: 16 tokens in 73.9s (0.2 tok/s with CPU fallbacks) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/llm/model.py | 54 ++++++++++++++++++++++++++------------- src/pygpukit/ops/basic.py | 18 ++++++++----- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py index 4d71fee..feacca1 100644 --- a/src/pygpukit/llm/model.py +++ b/src/pygpukit/llm/model.py @@ -441,11 +441,20 @@ def _forward_gpu( k_2d = self.k_norm(k_2d) k = reshape_copy(k_2d, k_shape) - # Apply RoPE on GPU + # Apply RoPE on GPU (requires FP32) if self.config.use_rope: cos = from_numpy(self._cos[position_ids].astype(np.float32)) sin = from_numpy(self._sin[position_ids].astype(np.float32)) - rope_inplace(q, k, cos, sin) + # RoPE only supports FP32, convert if needed + orig_dtype = q.dtype + if orig_dtype != "float32": + q_f32 = from_numpy(q.to_numpy().astype(np.float32)) + k_f32 = 
from_numpy(k.to_numpy().astype(np.float32)) + rope_inplace(q_f32, k_f32, cos, sin) + q = from_numpy(q_f32.to_numpy().astype(np.float16)) + k = from_numpy(k_f32.to_numpy().astype(np.float16)) + else: + rope_inplace(q, k, cos, sin) # Convert to numpy for KV cache k_np = k.to_numpy() @@ -467,10 +476,11 @@ def _forward_gpu( k_expanded = k_np v_expanded = v_np - # GPU SDPA + # GPU SDPA (use same dtype as q) q_t = transpose_3d_021(q) - k_t = from_numpy(k_expanded.transpose(1, 0, 2).astype(np.float32)) - v_t = from_numpy(v_expanded.transpose(1, 0, 2).astype(np.float32)) + kv_dtype = k_np.dtype # Preserve dtype from KV cache + k_t = from_numpy(k_expanded.transpose(1, 0, 2).astype(kv_dtype)) + v_t = from_numpy(v_expanded.transpose(1, 0, 2).astype(kv_dtype)) attn_output = sdpa_causal(q_t, k_t, v_t) @@ -501,17 +511,19 @@ def _forward_cpu( v = v.reshape(seq_len, self.num_kv_heads, self.head_dim) # QK Norm (Qwen3 style) - applied per head before RoPE - # Reshape to 2D for norm, then back to 3D + # Reshape to 2D for norm, then back to 3D (preserve dtype) if self.q_norm is not None: q_shape = q.shape + q_dtype = q.dtype q_2d = q.reshape(seq_len * self.num_heads, self.head_dim) - q_2d = self.q_norm(from_numpy(q_2d.astype(np.float32))).to_numpy() - q = q_2d.reshape(q_shape) + q_2d = self.q_norm(from_numpy(q_2d)).to_numpy() + q = q_2d.reshape(q_shape).astype(q_dtype) if self.k_norm is not None: k_shape = k.shape + k_dtype = k.dtype k_2d = k.reshape(seq_len * self.num_kv_heads, self.head_dim) - k_2d = self.k_norm(from_numpy(k_2d.astype(np.float32))).to_numpy() - k = k_2d.reshape(k_shape) + k_2d = self.k_norm(from_numpy(k_2d)).to_numpy() + k = k_2d.reshape(k_shape).astype(k_dtype) # Apply RoPE (CPU) if self.config.use_rope: @@ -565,8 +577,10 @@ def _forward_cpu( attn_output = attn_output.transpose(1, 0, 2) attn_output = attn_output.reshape(seq_len, self.num_heads * self.head_dim) - # Output projection (GPU) - out = from_numpy(attn_output.astype(np.float32)) + # Output projection (GPU) - use same dtype as weights + weight_dtype = str(self.o_proj.weight.dtype) + out_dtype = np.float16 if weight_dtype == "float16" else np.float32 + out = from_numpy(attn_output.astype(out_dtype)) return self.o_proj(out), present_kv @@ -727,16 +741,16 @@ def __call__( else: position_ids = list(range(seq_len)) - # Token embeddings + # Token embeddings (preserve dtype) embed_np = self.embed_tokens.to_numpy() - hidden = embed_np[input_ids].astype(np.float32) + hidden = embed_np[input_ids] # Add position embeddings (GPT-2 style) if self.position_embed is not None: pos_embed_np = self.position_embed.to_numpy() hidden = hidden + pos_embed_np[position_ids] - hidden = from_numpy(hidden) + hidden = from_numpy(hidden.astype(embed_np.dtype)) # Transformer blocks present_key_values = [] @@ -1224,12 +1238,14 @@ def to_transformer_config(self) -> TransformerConfig: def load_qwen3_from_safetensors( model_path: str, config: Qwen3Config | None = None, + dtype: str = "float32", ) -> CausalTransformerModel: """Load Qwen3 model from safetensors file (single or sharded). 
Args: model_path: Path to model.safetensors or model.safetensors.index.json config: Model configuration (auto-detected if None) + dtype: Weight dtype ("float32" or "float16") Returns: CausalTransformerModel instance @@ -1237,6 +1253,7 @@ def load_qwen3_from_safetensors( from pygpukit.llm import load_safetensors st = load_safetensors(model_path) + target_dtype = np.float16 if dtype == "float16" else np.float32 # Auto-detect config if config is None: @@ -1274,16 +1291,17 @@ def load_qwen3_from_safetensors( def load_tensor(name: str) -> GPUArray: data = st.tensor_bytes(name) info = st.tensor_info(name) - if info.dtype == 2: # BFloat16 + if info.dtype == 2: # BFloat16 -> target dtype arr = np.frombuffer(data, dtype=np.uint16).reshape(info.shape) + # BF16 to FP32 first, then cast to target arr_f32 = np.empty(arr.shape, dtype=np.float32) arr_f32.view(np.uint32)[:] = arr.astype(np.uint32) << 16 - return from_numpy(arr_f32) + return from_numpy(arr_f32.astype(target_dtype)) else: dtype_map = {0: np.float32, 1: np.float16, 3: np.float64} np_dtype = dtype_map.get(info.dtype, np.float32) arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) - return from_numpy(arr.copy().astype(np.float32)) + return from_numpy(arr.copy().astype(target_dtype)) # Embeddings embed_tokens = load_tensor("model.embed_tokens.weight") diff --git a/src/pygpukit/ops/basic.py b/src/pygpukit/ops/basic.py index c0d59c2..c3da192 100644 --- a/src/pygpukit/ops/basic.py +++ b/src/pygpukit/ops/basic.py @@ -1093,7 +1093,6 @@ def sdpa_causal( raise ValueError("sdpa_causal: Q, K, V must have same dtype") n_heads, q_len, head_dim = Q.shape - kv_len = K.shape[1] if K.shape[0] != n_heads or V.shape[0] != n_heads: raise ValueError("sdpa_causal: n_heads mismatch") @@ -1205,7 +1204,6 @@ def _rope_inplace_cpu( sin: GPUArray, ) -> None: """CPU implementation of rope_inplace.""" - import numpy as np q_np = q.to_numpy() k_np = k.to_numpy() @@ -1379,8 +1377,12 @@ def transpose_3d_021(input: GPUArray) -> GPUArray: backend = get_backend() + # Native transpose_3d_021 only supports float32, fall back to CPU for other dtypes if isinstance(backend, NativeBackend) and backend.is_available(): - return _transpose_3d_021_native(input) + if str(input.dtype) == "float32": + return _transpose_3d_021_native(input) + else: + return _transpose_3d_021_cpu(input) else: return _transpose_3d_021_cpu(input) @@ -1427,14 +1429,16 @@ def reshape_copy(input: GPUArray, new_shape: tuple[int, ...]) -> GPUArray: output_size *= dim if input_size != output_size: - raise ValueError( - f"reshape_copy: total size mismatch ({input_size} vs {output_size})" - ) + raise ValueError(f"reshape_copy: total size mismatch ({input_size} vs {output_size})") backend = get_backend() + # Native reshape_copy only supports float32, fall back to CPU for other dtypes if isinstance(backend, NativeBackend) and backend.is_available(): - return _reshape_copy_native(input, new_shape) + if str(input.dtype) == "float32": + return _reshape_copy_native(input, new_shape) + else: + return _reshape_copy_cpu(input, new_shape) else: return _reshape_copy_cpu(input, new_shape) From d22e8ee7b8af17a0037427c3b9f62e318fd073e5 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 12:41:17 +0900 Subject: [PATCH 06/14] feat(ops): Add native FP16/BF16 kernels for transpose_3d_021 and reshape_copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add transpose_021_f16_kernel and transpose_021_bf16_kernel - Add copy_f16_kernel and copy_bf16_kernel for reshape - Update 
nn.cu dispatch to use new FP16/BF16 kernels - Update Python ops/basic.py to route FP16/BF16 to native kernels Previously these operations fell back to CPU for FP16, causing slow inference. Now they run natively on GPU. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- native/ops/nn/nn.cu | 62 +++++++++++++++++++++++++------- native/ops/nn/nn_kernels.cuh | 68 ++++++++++++++++++++++++++++++++++++ src/pygpukit/ops/basic.py | 10 +++--- 3 files changed, 124 insertions(+), 16 deletions(-) diff --git a/native/ops/nn/nn.cu b/native/ops/nn/nn.cu index 9d3f74e..472bc23 100644 --- a/native/ops/nn/nn.cu +++ b/native/ops/nn/nn.cu @@ -745,8 +745,9 @@ GPUArray repeat_interleave_axis1(const GPUArray& input, size_t repeats) { // Transpose 3D tensor: [d0, d1, d2] -> [d1, d0, d2] GPUArray transpose_3d_021(const GPUArray& input) { - if (input.dtype() != DataType::Float32) { - throw std::runtime_error("transpose_3d_021: only float32 supported"); + if (input.dtype() != DataType::Float32 && input.dtype() != DataType::Float16 && + input.dtype() != DataType::BFloat16) { + throw std::runtime_error("transpose_3d_021: only float32/float16/bfloat16 supported"); } if (input.ndim() != 3) { throw std::runtime_error("transpose_3d_021: expects 3D tensor"); @@ -764,10 +765,28 @@ GPUArray transpose_3d_021(const GPUArray& input) { const int block_size = 256; const int grid_size = (total + block_size - 1) / block_size; - nn::transpose_021_f32_kernel<<>>( - static_cast(input.data()), - static_cast(result.data()), - dim0, dim1, dim2); + switch (input.dtype()) { + case DataType::Float32: + nn::transpose_021_f32_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + dim0, dim1, dim2); + break; + case DataType::Float16: + nn::transpose_021_f16_kernel<<>>( + static_cast(input.data()), + static_cast<__half*>(result.data()), + dim0, dim1, dim2); + break; + case DataType::BFloat16: + nn::transpose_021_bf16_kernel<<>>( + static_cast(input.data()), + static_cast<__nv_bfloat16*>(result.data()), + dim0, dim1, dim2); + break; + default: + break; + } sync_and_check("transpose_3d_021 kernel failed"); return result; @@ -775,8 +794,9 @@ GPUArray transpose_3d_021(const GPUArray& input) { // Reshape with copy (creates contiguous tensor with new shape) GPUArray reshape_copy(const GPUArray& input, const std::vector& new_shape) { - if (input.dtype() != DataType::Float32) { - throw std::runtime_error("reshape_copy: only float32 supported"); + if (input.dtype() != DataType::Float32 && input.dtype() != DataType::Float16 && + input.dtype() != DataType::BFloat16) { + throw std::runtime_error("reshape_copy: only float32/float16/bfloat16 supported"); } // Verify total size matches @@ -795,10 +815,28 @@ GPUArray reshape_copy(const GPUArray& input, const std::vector& new_shap const int block_size = 256; const int grid_size = (input_size + block_size - 1) / block_size; - nn::copy_f32_kernel<<>>( - static_cast(input.data()), - static_cast(result.data()), - input_size); + switch (input.dtype()) { + case DataType::Float32: + nn::copy_f32_kernel<<>>( + static_cast(input.data()), + static_cast(result.data()), + input_size); + break; + case DataType::Float16: + nn::copy_f16_kernel<<>>( + static_cast(input.data()), + static_cast<__half*>(result.data()), + input_size); + break; + case DataType::BFloat16: + nn::copy_bf16_kernel<<>>( + static_cast(input.data()), + static_cast<__nv_bfloat16*>(result.data()), + input_size); + break; + default: + break; + } sync_and_check("reshape_copy kernel failed"); 
return result; diff --git a/native/ops/nn/nn_kernels.cuh b/native/ops/nn/nn_kernels.cuh index 3c25b7e..92414bd 100644 --- a/native/ops/nn/nn_kernels.cuh +++ b/native/ops/nn/nn_kernels.cuh @@ -1219,6 +1219,50 @@ __global__ void transpose_021_f32_kernel( } } +// Transpose 3D FP16: [d0, d1, d2] -> [d1, d0, d2] +__global__ void transpose_021_f16_kernel( + const __half* __restrict__ src, + __half* __restrict__ dst, + size_t dim0, + size_t dim1, + size_t dim2 +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = dim0 * dim1 * dim2; + + if (idx < total) { + size_t d2 = idx % dim2; + size_t remaining = idx / dim2; + size_t d1 = remaining % dim1; + size_t d0 = remaining / dim1; + + size_t dst_idx = d1 * dim0 * dim2 + d0 * dim2 + d2; + dst[dst_idx] = src[idx]; + } +} + +// Transpose 3D BF16: [d0, d1, d2] -> [d1, d0, d2] +__global__ void transpose_021_bf16_kernel( + const __nv_bfloat16* __restrict__ src, + __nv_bfloat16* __restrict__ dst, + size_t dim0, + size_t dim1, + size_t dim2 +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = dim0 * dim1 * dim2; + + if (idx < total) { + size_t d2 = idx % dim2; + size_t remaining = idx / dim2; + size_t d1 = remaining % dim1; + size_t d0 = remaining / dim1; + + size_t dst_idx = d1 * dim0 * dim2 + d0 * dim2 + d2; + dst[dst_idx] = src[idx]; + } +} + // Reshape with copy (ensures contiguous output) // Simply copies data - reshape is handled by changing shape metadata __global__ void copy_f32_kernel( @@ -1232,6 +1276,30 @@ __global__ void copy_f32_kernel( } } +// FP16 copy kernel +__global__ void copy_f16_kernel( + const __half* __restrict__ src, + __half* __restrict__ dst, + size_t n +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + dst[idx] = src[idx]; + } +} + +// BF16 copy kernel +__global__ void copy_bf16_kernel( + const __nv_bfloat16* __restrict__ src, + __nv_bfloat16* __restrict__ dst, + size_t n +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + dst[idx] = src[idx]; + } +} + // ============================================================================ // RoPE (Rotary Position Embedding) // ============================================================================ diff --git a/src/pygpukit/ops/basic.py b/src/pygpukit/ops/basic.py index c3da192..c397eda 100644 --- a/src/pygpukit/ops/basic.py +++ b/src/pygpukit/ops/basic.py @@ -1377,9 +1377,10 @@ def transpose_3d_021(input: GPUArray) -> GPUArray: backend = get_backend() - # Native transpose_3d_021 only supports float32, fall back to CPU for other dtypes + # Native transpose_3d_021 supports float32/float16/bfloat16 if isinstance(backend, NativeBackend) and backend.is_available(): - if str(input.dtype) == "float32": + dtype_str = str(input.dtype) + if dtype_str in ("float32", "float16", "bfloat16"): return _transpose_3d_021_native(input) else: return _transpose_3d_021_cpu(input) @@ -1433,9 +1434,10 @@ def reshape_copy(input: GPUArray, new_shape: tuple[int, ...]) -> GPUArray: backend = get_backend() - # Native reshape_copy only supports float32, fall back to CPU for other dtypes + # Native reshape_copy supports float32/float16/bfloat16 if isinstance(backend, NativeBackend) and backend.is_available(): - if str(input.dtype) == "float32": + dtype_str = str(input.dtype) + if dtype_str in ("float32", "float16", "bfloat16"): return _reshape_copy_native(input, new_shape) else: return _reshape_copy_cpu(input, new_shape) From dcd041ba5736d1b0c70d6fbf634d69b7d98b351e Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 
Dec 2025 12:52:20 +0900 Subject: [PATCH 07/14] refactor(llm): Add ModelSpec abstraction for model-specific differences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces ModelSpec data structure to unify model differences: - ModelSpec: frozen dataclass with weight patterns and arch flags - GPT2_SPEC: LayerNorm, GELU, combined QKV, position embeddings - LLAMA_SPEC: RMSNorm, SiLU, RoPE, GQA - QWEN3_SPEC: RMSNorm, SiLU, RoPE, GQA, QK Norm New generic loader: - load_model_from_safetensors(): auto-detects model type - detect_model_spec(): detects from tensor names - MODEL_SPECS registry for model type lookup Existing loaders preserved unchanged for backward compatibility: - load_gpt2_from_safetensors() - load_llama_from_safetensors() - load_qwen3_from_safetensors() This is a structural refactor only - no behavior changes. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/llm/__init__.py | 16 ++ src/pygpukit/llm/model.py | 522 ++++++++++++++++++++++++++++++++++- 2 files changed, 535 insertions(+), 3 deletions(-) diff --git a/src/pygpukit/llm/__init__.py b/src/pygpukit/llm/__init__.py index f90f04a..de4d8f2 100644 --- a/src/pygpukit/llm/__init__.py +++ b/src/pygpukit/llm/__init__.py @@ -471,7 +471,11 @@ def __repr__(self) -> str: from pygpukit.llm.model import ( # noqa: E402 + GPT2_SPEC, + LLAMA_SPEC, MLP, + MODEL_SPECS, + QWEN3_SPEC, Attention, CausalSelfAttention, CausalTransformerModel, @@ -486,6 +490,8 @@ def __repr__(self) -> str: LlamaConfig, LlamaMLP, LlamaModel, + # ModelSpec abstraction (v0.2.9) + ModelSpec, Norm, # Qwen3 Qwen3Config, @@ -493,8 +499,10 @@ def __repr__(self) -> str: TransformerBlock, # Unified Transformer classes TransformerConfig, + detect_model_spec, load_gpt2_from_safetensors, load_llama_from_safetensors, + load_model_from_safetensors, load_qwen3_from_safetensors, ) @@ -515,6 +523,14 @@ def __repr__(self) -> str: "Norm", "TransformerBlock", "Linear", + # ModelSpec abstraction (v0.2.9) + "ModelSpec", + "GPT2_SPEC", + "LLAMA_SPEC", + "QWEN3_SPEC", + "MODEL_SPECS", + "detect_model_spec", + "load_model_from_safetensors", # Legacy GPT-2 aliases "GPT2Config", "GPT2Model", diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py index feacca1..2be32fc 100644 --- a/src/pygpukit/llm/model.py +++ b/src/pygpukit/llm/model.py @@ -1,13 +1,14 @@ """Unified Transformer implementation for PyGPUkit. -Provides a common Transformer abstraction that supports both GPT-2 and LLaMA -architectures through configuration differences only. +Provides a common Transformer abstraction that supports GPT-2, LLaMA, and Qwen3 +architectures through ModelSpec configuration. Key features: +- ModelSpec abstraction for model-specific differences - Hybrid Attention: CPU for seq_len=1 (decode), GPU for prefill - GPU-native operations: RMSNorm, LayerNorm, SDPA, SiLU, GELU, RoPE - Unified TransformerConfig for all model variants -- Backward-compatible loaders for GPT-2 and LLaMA safetensors +- Generic loader with automatic model detection """ from __future__ import annotations @@ -39,6 +40,251 @@ pass +# ============================================================================= +# ModelSpec - Data-only abstraction for model-specific differences +# ============================================================================= + + +@dataclass(frozen=True) +class ModelSpec: + """Model specification defining architecture-specific configurations. 
+ + This is a data-only structure with no methods or behavior. + All model-specific differences are expressed as configuration values. + """ + + # Model identifier + name: str + + # Weight name patterns (HF name patterns for tensor lookup) + # These are format strings with {layer} placeholder + embed_tokens: str + position_embed: str | None # None if using RoPE + lm_head: str | None # None if tied embeddings + final_norm: str + final_norm_bias: str | None + + # Per-layer weight patterns + attn_norm: str + attn_norm_bias: str | None + q_proj: str + k_proj: str + v_proj: str + o_proj: str + q_bias: str | None + k_bias: str | None + v_bias: str | None + o_bias: str | None + q_norm: str | None # QK Norm (Qwen3) + k_norm: str | None + + mlp_norm: str + mlp_norm_bias: str | None + + # MLP weights (GELU style) + fc1: str | None + fc1_bias: str | None + fc2: str | None + fc2_bias: str | None + + # MLP weights (SwiGLU style) + gate_proj: str | None + up_proj: str | None + down_proj: str | None + + # Architecture flags + norm_type: Literal["rmsnorm", "layernorm"] + activation: Literal["gelu", "silu"] + use_rope: bool + use_qk_norm: bool + use_position_embed: bool # GPT-2 style absolute position embeddings + qkv_combined: bool # GPT-2 uses combined QKV projection + weight_transpose: bool # GPT-2 weights need transpose + + # Default hyperparameters + default_norm_eps: float = 1e-5 + default_rope_theta: float = 10000.0 + + # Config class name for detection + hf_model_type: str = "" + + +# ============================================================================= +# Concrete Model Specs +# ============================================================================= + + +GPT2_SPEC = ModelSpec( + name="gpt2", + # Embeddings + embed_tokens="wte.weight", + position_embed="wpe.weight", + lm_head=None, # Tied to embed_tokens + final_norm="ln_f.weight", + final_norm_bias="ln_f.bias", + # Attention (combined QKV) + attn_norm="h.{layer}.ln_1.weight", + attn_norm_bias="h.{layer}.ln_1.bias", + q_proj="h.{layer}.attn.c_attn.weight", # Combined QKV + k_proj="h.{layer}.attn.c_attn.weight", # Same tensor, split at load + v_proj="h.{layer}.attn.c_attn.weight", + o_proj="h.{layer}.attn.c_proj.weight", + q_bias="h.{layer}.attn.c_attn.bias", + k_bias="h.{layer}.attn.c_attn.bias", + v_bias="h.{layer}.attn.c_attn.bias", + o_bias="h.{layer}.attn.c_proj.bias", + q_norm=None, + k_norm=None, + # MLP (GELU) + mlp_norm="h.{layer}.ln_2.weight", + mlp_norm_bias="h.{layer}.ln_2.bias", + fc1="h.{layer}.mlp.c_fc.weight", + fc1_bias="h.{layer}.mlp.c_fc.bias", + fc2="h.{layer}.mlp.c_proj.weight", + fc2_bias="h.{layer}.mlp.c_proj.bias", + gate_proj=None, + up_proj=None, + down_proj=None, + # Architecture + norm_type="layernorm", + activation="gelu", + use_rope=False, + use_qk_norm=False, + use_position_embed=True, + qkv_combined=True, + weight_transpose=True, + default_norm_eps=1e-5, + default_rope_theta=10000.0, + hf_model_type="gpt2", +) + + +LLAMA_SPEC = ModelSpec( + name="llama", + # Embeddings + embed_tokens="model.embed_tokens.weight", + position_embed=None, + lm_head="lm_head.weight", + final_norm="model.norm.weight", + final_norm_bias=None, + # Attention + attn_norm="model.layers.{layer}.input_layernorm.weight", + attn_norm_bias=None, + q_proj="model.layers.{layer}.self_attn.q_proj.weight", + k_proj="model.layers.{layer}.self_attn.k_proj.weight", + v_proj="model.layers.{layer}.self_attn.v_proj.weight", + o_proj="model.layers.{layer}.self_attn.o_proj.weight", + q_bias=None, + k_bias=None, + v_bias=None, + o_bias=None, + 
q_norm=None, + k_norm=None, + # MLP (SwiGLU) + mlp_norm="model.layers.{layer}.post_attention_layernorm.weight", + mlp_norm_bias=None, + fc1=None, + fc1_bias=None, + fc2=None, + fc2_bias=None, + gate_proj="model.layers.{layer}.mlp.gate_proj.weight", + up_proj="model.layers.{layer}.mlp.up_proj.weight", + down_proj="model.layers.{layer}.mlp.down_proj.weight", + # Architecture + norm_type="rmsnorm", + activation="silu", + use_rope=True, + use_qk_norm=False, + use_position_embed=False, + qkv_combined=False, + weight_transpose=False, + default_norm_eps=1e-5, + default_rope_theta=10000.0, + hf_model_type="llama", +) + + +QWEN3_SPEC = ModelSpec( + name="qwen3", + # Embeddings + embed_tokens="model.embed_tokens.weight", + position_embed=None, + lm_head="lm_head.weight", + final_norm="model.norm.weight", + final_norm_bias=None, + # Attention + attn_norm="model.layers.{layer}.input_layernorm.weight", + attn_norm_bias=None, + q_proj="model.layers.{layer}.self_attn.q_proj.weight", + k_proj="model.layers.{layer}.self_attn.k_proj.weight", + v_proj="model.layers.{layer}.self_attn.v_proj.weight", + o_proj="model.layers.{layer}.self_attn.o_proj.weight", + q_bias=None, + k_bias=None, + v_bias=None, + o_bias=None, + q_norm="model.layers.{layer}.self_attn.q_norm.weight", + k_norm="model.layers.{layer}.self_attn.k_norm.weight", + # MLP (SwiGLU) + mlp_norm="model.layers.{layer}.post_attention_layernorm.weight", + mlp_norm_bias=None, + fc1=None, + fc1_bias=None, + fc2=None, + fc2_bias=None, + gate_proj="model.layers.{layer}.mlp.gate_proj.weight", + up_proj="model.layers.{layer}.mlp.up_proj.weight", + down_proj="model.layers.{layer}.mlp.down_proj.weight", + # Architecture + norm_type="rmsnorm", + activation="silu", + use_rope=True, + use_qk_norm=True, + use_position_embed=False, + qkv_combined=False, + weight_transpose=False, + default_norm_eps=1e-6, + default_rope_theta=1000000.0, + hf_model_type="qwen3", +) + + +# Registry for model detection +MODEL_SPECS: dict[str, ModelSpec] = { + "gpt2": GPT2_SPEC, + "llama": LLAMA_SPEC, + "qwen3": QWEN3_SPEC, + "qwen2": LLAMA_SPEC, # Qwen2 uses same structure as LLaMA +} + + +def detect_model_spec(tensor_names: list[str]) -> ModelSpec: + """Detect model type from tensor names. + + Args: + tensor_names: List of tensor names from safetensors file + + Returns: + ModelSpec for the detected model type + + Raises: + ValueError: If model type cannot be detected + """ + # Check for Qwen3-specific QK norm + if any("q_norm" in name for name in tensor_names): + return QWEN3_SPEC + # Check for LLaMA-style structure + if "model.embed_tokens.weight" in tensor_names: + return LLAMA_SPEC + # Check for GPT-2 structure + if "wte.weight" in tensor_names: + return GPT2_SPEC + + raise ValueError( + f"Cannot detect model type from tensor names. 
First 10 names: {tensor_names[:10]}" + ) + + # ============================================================================= # Common Sampling Functions # ============================================================================= @@ -1379,3 +1625,273 @@ def load_tensor(name: str) -> GPUArray: # ============================================================================= apply_rotary_pos_emb = apply_rotary_pos_emb_numpy + + +# ============================================================================= +# Generic Model Loader using ModelSpec +# ============================================================================= + + +def load_model_from_safetensors( + model_path: str, + dtype: str = "float32", + spec: ModelSpec | None = None, +) -> CausalTransformerModel: + """Load model from safetensors file using ModelSpec abstraction. + + Automatically detects model type (GPT-2, LLaMA, Qwen3) from tensor names + and loads using the appropriate ModelSpec configuration. + + Args: + model_path: Path to model.safetensors or model.safetensors.index.json + dtype: Weight dtype ("float32" or "float16") + spec: Optional ModelSpec to use (auto-detected if None) + + Returns: + CausalTransformerModel instance + + Example: + # Auto-detect model type + model = load_model_from_safetensors("/path/to/model.safetensors") + + # Explicit model type + model = load_model_from_safetensors("/path/to/model.safetensors", spec=LLAMA_SPEC) + """ + from pygpukit.llm import load_safetensors + + st = load_safetensors(model_path) + target_dtype = np.float16 if dtype == "float16" else np.float32 + + # Detect model type if not specified + if spec is None: + spec = detect_model_spec(st.tensor_names) + + # Helper to load tensor with dtype conversion + def load_tensor(name: str, do_transpose: bool = False) -> GPUArray: + data = st.tensor_bytes(name) + info = st.tensor_info(name) + if info.dtype == 2: # BFloat16 + arr = np.frombuffer(data, dtype=np.uint16).reshape(info.shape) + arr_f32 = np.empty(arr.shape, dtype=np.float32) + arr_f32.view(np.uint32)[:] = arr.astype(np.uint32) << 16 + arr = arr_f32 + else: + dtype_map = {0: np.float32, 1: np.float16, 3: np.float64} + np_dtype = dtype_map.get(info.dtype, np.float32) + arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape).copy() + if do_transpose and arr.ndim == 2: + arr = arr.T + return from_numpy(arr.astype(target_dtype)) + + def try_load(name: str | None, do_transpose: bool = False) -> GPUArray | None: + if name is None or name not in st.tensor_names: + return None + return load_tensor(name, do_transpose) + + def layer_name(pattern: str | None, layer: int) -> str | None: + if pattern is None: + return None + return pattern.format(layer=layer) + + def required_name(pattern: str, layer: int) -> str: + """Get layer name for a required pattern (never None).""" + return pattern.format(layer=layer) + + # Auto-detect config from tensor shapes + embed_info = st.tensor_info(spec.embed_tokens) + vocab_size = embed_info.shape[0] + hidden_size = embed_info.shape[1] + + # Count layers + num_layers = 0 + while required_name(spec.q_proj, num_layers) in st.tensor_names: + num_layers += 1 + + # Detect num_heads and num_kv_heads from projection shapes + q_info = st.tensor_info(required_name(spec.q_proj, 0)) + head_dim = 64 # Default + + # Try to get head_dim from q_norm if present (Qwen3) + if spec.use_qk_norm and spec.q_norm is not None: + q_norm_name = required_name(spec.q_norm, 0) + if q_norm_name in st.tensor_names: + q_norm_info = st.tensor_info(q_norm_name) + head_dim = 
q_norm_info.shape[0] + + num_heads = q_info.shape[0] // head_dim + + # For GQA models, detect num_kv_heads + num_kv_heads = num_heads + if not spec.qkv_combined: + k_info = st.tensor_info(required_name(spec.k_proj, 0)) + num_kv_heads = k_info.shape[0] // head_dim + + # Detect intermediate_size + intermediate_size = 4 * hidden_size + if spec.activation == "silu" and spec.gate_proj is not None: + gate_info = st.tensor_info(required_name(spec.gate_proj, 0)) + intermediate_size = gate_info.shape[0] + elif spec.activation == "gelu" and spec.fc1 is not None: + fc1_info = st.tensor_info(required_name(spec.fc1, 0)) + intermediate_size = fc1_info.shape[0] + + # Build TransformerConfig + transformer_config = TransformerConfig( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_layers=num_layers, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + intermediate_size=intermediate_size, + norm_type=spec.norm_type, + activation=spec.activation, + use_rope=spec.use_rope, + causal=True, + norm_eps=spec.default_norm_eps, + rope_theta=spec.default_rope_theta, + ) + + # Load embeddings + embed_tokens = load_tensor(spec.embed_tokens) + position_embed = try_load(spec.position_embed) if spec.use_position_embed else None + + # Load blocks + blocks = [] + for layer_idx in range(num_layers): + # Attention norm (required) + attn_norm_weight = load_tensor(required_name(spec.attn_norm, layer_idx)) + attn_norm_bias = try_load(layer_name(spec.attn_norm_bias, layer_idx)) + attn_norm = Norm(attn_norm_weight, attn_norm_bias, spec.norm_type, spec.default_norm_eps) + + # QK Norm (Qwen3, optional) + q_norm_layer = None + k_norm_layer = None + if spec.use_qk_norm: + q_norm_weight = try_load(layer_name(spec.q_norm, layer_idx)) + k_norm_weight = try_load(layer_name(spec.k_norm, layer_idx)) + if q_norm_weight is not None: + q_norm_layer = Norm(q_norm_weight, None, spec.norm_type, spec.default_norm_eps) + if k_norm_weight is not None: + k_norm_layer = Norm(k_norm_weight, None, spec.norm_type, spec.default_norm_eps) + + # Attention projections + if spec.qkv_combined: + # GPT-2 style: combined QKV tensor needs to be split + c_attn_weight = load_tensor( + required_name(spec.q_proj, layer_idx), do_transpose=spec.weight_transpose + ) + c_attn_bias = try_load(layer_name(spec.q_bias, layer_idx)) + + # Split combined QKV + c_attn_np = c_attn_weight.to_numpy() + q_weight = from_numpy(c_attn_np[:hidden_size].copy().astype(target_dtype)) + k_weight = from_numpy( + c_attn_np[hidden_size : 2 * hidden_size].copy().astype(target_dtype) + ) + v_weight = from_numpy(c_attn_np[2 * hidden_size :].copy().astype(target_dtype)) + + q_bias, k_bias, v_bias = None, None, None + if c_attn_bias is not None: + c_attn_bias_np = c_attn_bias.to_numpy() + q_bias = from_numpy(c_attn_bias_np[:hidden_size].copy().astype(target_dtype)) + k_bias = from_numpy( + c_attn_bias_np[hidden_size : 2 * hidden_size].copy().astype(target_dtype) + ) + v_bias = from_numpy(c_attn_bias_np[2 * hidden_size :].copy().astype(target_dtype)) + + o_weight = load_tensor( + required_name(spec.o_proj, layer_idx), do_transpose=spec.weight_transpose + ) + o_bias = try_load(layer_name(spec.o_bias, layer_idx)) + + attn = Attention( + q_weight, + k_weight, + v_weight, + o_weight, + transformer_config, + q_bias, + k_bias, + v_bias, + o_bias, + q_norm_layer, + k_norm_layer, + ) + else: + # Separate Q, K, V projections (LLaMA/Qwen3 style) + q_weight = load_tensor(required_name(spec.q_proj, layer_idx)) + k_weight = load_tensor(required_name(spec.k_proj, layer_idx)) + v_weight = 
load_tensor(required_name(spec.v_proj, layer_idx)) + o_weight = load_tensor(required_name(spec.o_proj, layer_idx)) + + q_bias = try_load(layer_name(spec.q_bias, layer_idx)) + k_bias = try_load(layer_name(spec.k_bias, layer_idx)) + v_bias = try_load(layer_name(spec.v_bias, layer_idx)) + o_bias = try_load(layer_name(spec.o_bias, layer_idx)) + + attn = Attention( + q_weight, + k_weight, + v_weight, + o_weight, + transformer_config, + q_bias, + k_bias, + v_bias, + o_bias, + q_norm_layer, + k_norm_layer, + ) + + # MLP norm (required) + mlp_norm_weight = load_tensor(required_name(spec.mlp_norm, layer_idx)) + mlp_norm_bias = try_load(layer_name(spec.mlp_norm_bias, layer_idx)) + mlp_norm = Norm(mlp_norm_weight, mlp_norm_bias, spec.norm_type, spec.default_norm_eps) + + # MLP + if spec.activation == "gelu" and spec.fc1 is not None and spec.fc2 is not None: + fc1_weight = load_tensor( + required_name(spec.fc1, layer_idx), do_transpose=spec.weight_transpose + ) + fc1_bias = try_load(layer_name(spec.fc1_bias, layer_idx)) + fc2_weight = load_tensor( + required_name(spec.fc2, layer_idx), do_transpose=spec.weight_transpose + ) + fc2_bias = try_load(layer_name(spec.fc2_bias, layer_idx)) + mlp = MLP( + transformer_config, + fc1_weight=fc1_weight, + fc1_bias=fc1_bias, + fc2_weight=fc2_weight, + fc2_bias=fc2_bias, + ) + elif spec.gate_proj is not None and spec.up_proj is not None and spec.down_proj is not None: + # SwiGLU + gate_proj = load_tensor(required_name(spec.gate_proj, layer_idx)) + up_proj = load_tensor(required_name(spec.up_proj, layer_idx)) + down_proj = load_tensor(required_name(spec.down_proj, layer_idx)) + mlp = MLP( + transformer_config, + gate_proj=gate_proj, + up_proj=up_proj, + down_proj=down_proj, + ) + else: + raise ValueError(f"ModelSpec {spec.name} has invalid MLP configuration") + + block = TransformerBlock(attn_norm, attn, mlp_norm, mlp) + blocks.append(block) + + # Final norm + final_norm_weight = load_tensor(spec.final_norm) + final_norm_bias = try_load(spec.final_norm_bias) + final_norm = Norm(final_norm_weight, final_norm_bias, spec.norm_type, spec.default_norm_eps) + + # LM head + lm_head = None + if spec.lm_head is not None and spec.lm_head in st.tensor_names: + lm_head = load_tensor(spec.lm_head) + + return CausalTransformerModel( + transformer_config, embed_tokens, blocks, final_norm, lm_head, position_embed + ) From 3ddd416705af125339b46251dc7000d3002e995e Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 12:55:41 +0900 Subject: [PATCH 08/14] refactor(llm): convert model loaders to thin wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert load_gpt2_from_safetensors, load_llama_from_safetensors, and load_qwen3_from_safetensors to thin wrappers that delegate to the generic load_model_from_safetensors function with appropriate ModelSpec. 
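In condensed form, the wrapper pattern looks like this (a sketch distilled from the diff below; the Qwen3 loader delegates directly, while the GPT-2/LLaMA wrappers additionally re-wrap the result for their legacy return types):

```python
# Sketch of the thin-wrapper pattern introduced by this patch (see diff below).
def load_qwen3_from_safetensors(
    model_path: str,
    config: Qwen3Config | None = None,  # Ignored, auto-detected
    dtype: str = "float32",
) -> CausalTransformerModel:
    # All detection, config inference, and weight loading happens in the generic loader.
    return load_model_from_safetensors(model_path, dtype=dtype, spec=QWEN3_SPEC)
```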
This completes the ModelSpec abstraction refactor: - All three loaders now use load_model_from_safetensors internally - Backward compatibility preserved via legacy model class wrappers - Config parameters are now ignored (auto-detected from tensor shapes) - ~150 lines of duplicated code removed 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/llm/model.py | 344 ++++---------------------------------- 1 file changed, 36 insertions(+), 308 deletions(-) diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py index 2be32fc..e095764 100644 --- a/src/pygpukit/llm/model.py +++ b/src/pygpukit/llm/model.py @@ -1230,216 +1230,64 @@ def __init__( # ============================================================================= -# Safetensors Loaders +# Safetensors Loaders (thin wrappers over load_model_from_safetensors) # ============================================================================= def load_gpt2_from_safetensors( model_path: str, - config: GPT2Config | None = None, - load_attention: bool = True, + config: GPT2Config | None = None, # Ignored, auto-detected + dtype: str = "float32", ) -> GPT2Model: """Load GPT-2 model from safetensors file. Args: model_path: Path to model.safetensors - config: Model configuration (defaults to GPT-2 small) - load_attention: Whether to load attention weights + config: Ignored (auto-detected from tensor shapes) + dtype: Weight dtype ("float32" or "float16") Returns: GPT2Model instance (CausalTransformerModel) """ - from pygpukit.llm import SafeTensorsFile - - if config is None: - config = GPT2Config() - - transformer_config = config.to_transformer_config() - st = SafeTensorsFile(model_path) - - def load_tensor(name: str, do_transpose: bool = False) -> GPUArray: - data = st.tensor_bytes(name) - info = st.tensor_info(name) - dtype_map = {0: np.float32, 1: np.float16, 2: np.float32, 3: np.float64} - np_dtype = dtype_map.get(info.dtype, np.float32) - arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) - if do_transpose and arr.ndim == 2: - arr = arr.T - return from_numpy(arr.copy().astype(np.float32)) - - def try_load(name: str, do_transpose: bool = False) -> GPUArray | None: - if name in st.tensor_names: - return load_tensor(name, do_transpose) - return None - - # Embeddings - wte = load_tensor("wte.weight") - wpe = load_tensor("wpe.weight") - - # Blocks - blocks = [] - for i in range(config.n_layer): - prefix = f"h.{i}." 
- - # Attention norm - ln_1_w = load_tensor(f"{prefix}ln_1.weight") - ln_1_b = load_tensor(f"{prefix}ln_1.bias") - attn_norm = Norm(ln_1_w, ln_1_b, "layernorm", config.layer_norm_eps) - - # Attention - attn = None - if load_attention: - c_attn_w = load_tensor(f"{prefix}attn.c_attn.weight", do_transpose=True) - c_attn_b = try_load(f"{prefix}attn.c_attn.bias") - c_proj_w = load_tensor(f"{prefix}attn.c_proj.weight", do_transpose=True) - c_proj_b = try_load(f"{prefix}attn.c_proj.bias") - attn = CausalSelfAttention( - c_attn_w, c_attn_b, c_proj_w, c_proj_b, config.n_head, config.n_embd - ) - - # MLP norm - ln_2_w = load_tensor(f"{prefix}ln_2.weight") - ln_2_b = load_tensor(f"{prefix}ln_2.bias") - mlp_norm = Norm(ln_2_w, ln_2_b, "layernorm", config.layer_norm_eps) - - # MLP - c_fc_w = load_tensor(f"{prefix}mlp.c_fc.weight", do_transpose=True) - c_fc_b = try_load(f"{prefix}mlp.c_fc.bias") - c_proj_w = load_tensor(f"{prefix}mlp.c_proj.weight", do_transpose=True) - c_proj_b = try_load(f"{prefix}mlp.c_proj.bias") - mlp = MLP( - transformer_config, - fc1_weight=c_fc_w, - fc1_bias=c_fc_b, - fc2_weight=c_proj_w, - fc2_bias=c_proj_b, - ) - - if attn is not None: - block = TransformerBlock(attn_norm, attn, mlp_norm, mlp) - blocks.append(block) - - # Final norm - ln_f_w = load_tensor("ln_f.weight") - ln_f_b = load_tensor("ln_f.bias") - final_norm = Norm(ln_f_w, ln_f_b, "layernorm", config.layer_norm_eps) - - return GPT2Model(transformer_config, wte, blocks, final_norm, None, wpe) + # Note: config parameter is ignored; config is auto-detected + model = load_model_from_safetensors(model_path, dtype=dtype, spec=GPT2_SPEC) + # Wrap as GPT2Model for backward compatibility + return GPT2Model( + model.config, + model.embed_tokens, + model.blocks, + model.final_norm, + model.lm_head, + model.position_embed, + ) def load_llama_from_safetensors( model_path: str, - config: LlamaConfig | None = None, + config: LlamaConfig | None = None, # Ignored, auto-detected + dtype: str = "float32", ) -> LlamaModel: """Load Llama model from safetensors file. 
Args: model_path: Path to model.safetensors - config: Model configuration (auto-detected if None) + config: Ignored (auto-detected from tensor shapes) + dtype: Weight dtype ("float32" or "float16") Returns: LlamaModel instance (CausalTransformerModel) """ - from pygpukit.llm import SafeTensorsFile - - st = SafeTensorsFile(model_path) - - # Auto-detect config - if config is None: - embed_info = st.tensor_info("model.embed_tokens.weight") - vocab_size = embed_info.shape[0] - hidden_size = embed_info.shape[1] - - num_layers = 0 - while f"model.layers.{num_layers}.self_attn.q_proj.weight" in st.tensor_names: - num_layers += 1 - - q_info = st.tensor_info("model.layers.0.self_attn.q_proj.weight") - k_info = st.tensor_info("model.layers.0.self_attn.k_proj.weight") - gate_info = st.tensor_info("model.layers.0.mlp.gate_proj.weight") - - head_dim = 64 - num_heads = q_info.shape[0] // head_dim - num_kv_heads = k_info.shape[0] // head_dim - - config = LlamaConfig( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=gate_info.shape[0], - num_hidden_layers=num_layers, - num_attention_heads=num_heads, - num_key_value_heads=num_kv_heads, - ) - - transformer_config = config.to_transformer_config() - - def load_tensor(name: str) -> GPUArray: - data = st.tensor_bytes(name) - info = st.tensor_info(name) - if info.dtype == 2: # BFloat16 - arr = np.frombuffer(data, dtype=np.uint16).reshape(info.shape) - arr_f32 = np.empty(arr.shape, dtype=np.float32) - arr_f32.view(np.uint32)[:] = arr.astype(np.uint32) << 16 - return from_numpy(arr_f32) - else: - dtype_map = {0: np.float32, 1: np.float16, 3: np.float64} - np_dtype = dtype_map.get(info.dtype, np.float32) - arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) - return from_numpy(arr.copy().astype(np.float32)) - - # Embeddings - embed_tokens = load_tensor("model.embed_tokens.weight") - - # Blocks - blocks = [] - for i in range(config.num_hidden_layers): - prefix = f"model.layers.{i}." 
- - # Attention norm - attn_norm = Norm( - load_tensor(f"{prefix}input_layernorm.weight"), - None, - "rmsnorm", - config.rms_norm_eps, - ) - - # Attention - attn = Attention( - load_tensor(f"{prefix}self_attn.q_proj.weight"), - load_tensor(f"{prefix}self_attn.k_proj.weight"), - load_tensor(f"{prefix}self_attn.v_proj.weight"), - load_tensor(f"{prefix}self_attn.o_proj.weight"), - transformer_config, - ) - - # MLP norm - mlp_norm = Norm( - load_tensor(f"{prefix}post_attention_layernorm.weight"), - None, - "rmsnorm", - config.rms_norm_eps, - ) - - # MLP - mlp = MLP( - transformer_config, - gate_proj=load_tensor(f"{prefix}mlp.gate_proj.weight"), - up_proj=load_tensor(f"{prefix}mlp.up_proj.weight"), - down_proj=load_tensor(f"{prefix}mlp.down_proj.weight"), - ) - - block = TransformerBlock(attn_norm, attn, mlp_norm, mlp) - blocks.append(block) - - # Final norm - final_norm = Norm(load_tensor("model.norm.weight"), None, "rmsnorm", config.rms_norm_eps) - - # LM head - lm_head = None - if "lm_head.weight" in st.tensor_names: - lm_head = load_tensor("lm_head.weight") - - return LlamaModel(transformer_config, embed_tokens, blocks, final_norm, lm_head) + # Note: config parameter is ignored; config is auto-detected + model = load_model_from_safetensors(model_path, dtype=dtype, spec=LLAMA_SPEC) + # Wrap as LlamaModel for backward compatibility + return LlamaModel( + model.config, + model.embed_tokens, + model.blocks, + model.final_norm, + model.lm_head, + model.position_embed, + ) # ============================================================================= @@ -1483,141 +1331,21 @@ def to_transformer_config(self) -> TransformerConfig: def load_qwen3_from_safetensors( model_path: str, - config: Qwen3Config | None = None, + config: Qwen3Config | None = None, # Ignored, auto-detected dtype: str = "float32", ) -> CausalTransformerModel: - """Load Qwen3 model from safetensors file (single or sharded). + """Load Qwen3 model from safetensors file. 
Args: model_path: Path to model.safetensors or model.safetensors.index.json - config: Model configuration (auto-detected if None) + config: Ignored (auto-detected from tensor shapes) dtype: Weight dtype ("float32" or "float16") Returns: CausalTransformerModel instance """ - from pygpukit.llm import load_safetensors - - st = load_safetensors(model_path) - target_dtype = np.float16 if dtype == "float16" else np.float32 - - # Auto-detect config - if config is None: - embed_info = st.tensor_info("model.embed_tokens.weight") - vocab_size = embed_info.shape[0] - hidden_size = embed_info.shape[1] - - num_layers = 0 - while f"model.layers.{num_layers}.self_attn.q_proj.weight" in st.tensor_names: - num_layers += 1 - - q_info = st.tensor_info("model.layers.0.self_attn.q_proj.weight") - k_info = st.tensor_info("model.layers.0.self_attn.k_proj.weight") - gate_info = st.tensor_info("model.layers.0.mlp.gate_proj.weight") - - # Qwen3 uses explicit head_dim from q_norm shape - q_norm_info = st.tensor_info("model.layers.0.self_attn.q_norm.weight") - head_dim = q_norm_info.shape[0] - - num_heads = q_info.shape[0] // head_dim - num_kv_heads = k_info.shape[0] // head_dim - - config = Qwen3Config( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=gate_info.shape[0], - num_hidden_layers=num_layers, - num_attention_heads=num_heads, - num_key_value_heads=num_kv_heads, - head_dim=head_dim, - ) - - transformer_config = config.to_transformer_config() - - def load_tensor(name: str) -> GPUArray: - data = st.tensor_bytes(name) - info = st.tensor_info(name) - if info.dtype == 2: # BFloat16 -> target dtype - arr = np.frombuffer(data, dtype=np.uint16).reshape(info.shape) - # BF16 to FP32 first, then cast to target - arr_f32 = np.empty(arr.shape, dtype=np.float32) - arr_f32.view(np.uint32)[:] = arr.astype(np.uint32) << 16 - return from_numpy(arr_f32.astype(target_dtype)) - else: - dtype_map = {0: np.float32, 1: np.float16, 3: np.float64} - np_dtype = dtype_map.get(info.dtype, np.float32) - arr = np.frombuffer(data, dtype=np_dtype).reshape(info.shape) - return from_numpy(arr.copy().astype(target_dtype)) - - # Embeddings - embed_tokens = load_tensor("model.embed_tokens.weight") - - # Blocks - blocks = [] - for i in range(config.num_hidden_layers): - prefix = f"model.layers.{i}." 
- - # Attention norm - attn_norm = Norm( - load_tensor(f"{prefix}input_layernorm.weight"), - None, - "rmsnorm", - config.rms_norm_eps, - ) - - # QK Norm (Qwen3 specific) - q_norm = Norm( - load_tensor(f"{prefix}self_attn.q_norm.weight"), - None, - "rmsnorm", - config.rms_norm_eps, - ) - k_norm = Norm( - load_tensor(f"{prefix}self_attn.k_norm.weight"), - None, - "rmsnorm", - config.rms_norm_eps, - ) - - # Attention with QK Norm - attn = Attention( - load_tensor(f"{prefix}self_attn.q_proj.weight"), - load_tensor(f"{prefix}self_attn.k_proj.weight"), - load_tensor(f"{prefix}self_attn.v_proj.weight"), - load_tensor(f"{prefix}self_attn.o_proj.weight"), - transformer_config, - q_norm=q_norm, - k_norm=k_norm, - ) - - # MLP norm - mlp_norm = Norm( - load_tensor(f"{prefix}post_attention_layernorm.weight"), - None, - "rmsnorm", - config.rms_norm_eps, - ) - - # MLP - mlp = MLP( - transformer_config, - gate_proj=load_tensor(f"{prefix}mlp.gate_proj.weight"), - up_proj=load_tensor(f"{prefix}mlp.up_proj.weight"), - down_proj=load_tensor(f"{prefix}mlp.down_proj.weight"), - ) - - block = TransformerBlock(attn_norm, attn, mlp_norm, mlp) - blocks.append(block) - - # Final norm - final_norm = Norm(load_tensor("model.norm.weight"), None, "rmsnorm", config.rms_norm_eps) - - # LM head - lm_head = None - if "lm_head.weight" in st.tensor_names: - lm_head = load_tensor("lm_head.weight") - - return CausalTransformerModel(transformer_config, embed_tokens, blocks, final_norm, lm_head) + # Note: config parameter is ignored; config is auto-detected + return load_model_from_safetensors(model_path, dtype=dtype, spec=QWEN3_SPEC) # ============================================================================= From fc1beee0c4b536da1c4f439367f006cf3036e1b8 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 13:04:02 +0900 Subject: [PATCH 09/14] refactor(llm): unify all models into CausalTransformerModel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Final ModelSpec cleanup for v0.2.9: - CausalTransformerModel is now the ONLY runtime model - Add `spec` attribute to store ModelSpec used for loading - GPT2Model, LlamaModel are now simple type aliases - RMSNorm, LayerNorm, etc. 
are simple aliases to Norm - Remove all legacy wrapper classes with constructors - Simplify loaders to direct load_model_from_safetensors calls - Remove redundant config parameters from loaders Code reduction: 188 deletions, 48 insertions (-140 net lines) All model-specific behavior is now controlled via: - model.spec.use_rope - model.spec.use_qk_norm - model.spec.activation - model.spec.norm_type 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/llm/__init__.py | 35 +++--- src/pygpukit/llm/model.py | 201 ++++++----------------------------- 2 files changed, 48 insertions(+), 188 deletions(-) diff --git a/src/pygpukit/llm/__init__.py b/src/pygpukit/llm/__init__.py index de4d8f2..457a5bb 100644 --- a/src/pygpukit/llm/__init__.py +++ b/src/pygpukit/llm/__init__.py @@ -476,32 +476,33 @@ def __repr__(self) -> str: MLP, MODEL_SPECS, QWEN3_SPEC, + # Components Attention, CausalSelfAttention, + # Core model CausalTransformerModel, - # Legacy GPT-2 aliases + # Legacy config classes (for reference) GPT2Config, + # Type aliases (GPT2Model = LlamaModel = CausalTransformerModel) GPT2Model, LayerNorm, Linear, LlamaAttention, LlamaBlock, - # Legacy Llama aliases LlamaConfig, LlamaMLP, LlamaModel, - # ModelSpec abstraction (v0.2.9) + # ModelSpec (v0.2.9) ModelSpec, Norm, - # Qwen3 Qwen3Config, RMSNorm, TransformerBlock, - # Unified Transformer classes TransformerConfig, detect_model_spec, load_gpt2_from_safetensors, load_llama_from_safetensors, + # Loaders load_model_from_safetensors, load_qwen3_from_safetensors, ) @@ -515,37 +516,37 @@ def __repr__(self) -> str: "load_safetensors", # Tokenizer "Tokenizer", - # Unified Transformer (v0.2.9) - "TransformerConfig", + # Core Transformer (v0.2.9) "CausalTransformerModel", + "TransformerConfig", "Attention", "MLP", "Norm", "TransformerBlock", "Linear", - # ModelSpec abstraction (v0.2.9) + # ModelSpec (v0.2.9) "ModelSpec", "GPT2_SPEC", "LLAMA_SPEC", "QWEN3_SPEC", "MODEL_SPECS", "detect_model_spec", + # Loaders "load_model_from_safetensors", - # Legacy GPT-2 aliases + "load_gpt2_from_safetensors", + "load_llama_from_safetensors", + "load_qwen3_from_safetensors", + # Legacy config classes "GPT2Config", + "LlamaConfig", + "Qwen3Config", + # Type aliases (all point to unified types) "GPT2Model", + "LlamaModel", "CausalSelfAttention", "LayerNorm", - "load_gpt2_from_safetensors", - # Legacy Llama aliases - "LlamaConfig", - "LlamaModel", "LlamaAttention", "LlamaBlock", "LlamaMLP", "RMSNorm", - "load_llama_from_safetensors", - # Qwen3 - "Qwen3Config", - "load_qwen3_from_safetensors", ] diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py index e095764..668f584 100644 --- a/src/pygpukit/llm/model.py +++ b/src/pygpukit/llm/model.py @@ -941,7 +941,8 @@ def __call__( class CausalTransformerModel: """Unified causal transformer model. - Supports GPT-2 and LLaMA architectures through configuration. + The single runtime model for all architectures (GPT-2, LLaMA, Qwen3). + Model-specific behavior is controlled by the spec attribute. 
""" def __init__( @@ -952,13 +953,15 @@ def __init__( final_norm: Norm, lm_head: GPUArray | None = None, position_embed: GPUArray | None = None, # For GPT-2 style + spec: ModelSpec | None = None, ): self.config = config self.embed_tokens = embed_tokens self.blocks = blocks self.final_norm = final_norm - self.lm_head = lm_head + self._lm_head = lm_head self.position_embed = position_embed + self.spec = spec def __call__( self, @@ -1012,12 +1015,17 @@ def __call__( return hidden, present_key_values return hidden, None + @property + def lm_head(self) -> GPUArray | None: + """LM head weights (for backward compatibility).""" + return self._lm_head + def get_logits(self, hidden: GPUArray) -> GPUArray: """Compute logits from hidden states.""" hidden_np = hidden.to_numpy() - if self.lm_head is not None: - lm_head_np = self.lm_head.to_numpy() + if self._lm_head is not None: + lm_head_np = self._lm_head.to_numpy() else: # Tied embeddings lm_head_np = self.embed_tokens.to_numpy() @@ -1090,204 +1098,58 @@ def generate( # ============================================================================= -# Legacy Aliases (for backward compatibility) +# Type Aliases # ============================================================================= +# GPT2Model and LlamaModel are now simple aliases for CausalTransformerModel. +# All models use CausalTransformerModel as the single runtime type. +GPT2Model = CausalTransformerModel +LlamaModel = CausalTransformerModel -# RMSNorm alias -class RMSNorm(Norm): - """RMSNorm layer (legacy alias).""" - - def __init__(self, weight: GPUArray, eps: float = 1e-5): - super().__init__(weight, None, "rmsnorm", eps) - - -# LayerNorm alias -class LayerNorm(Norm): - """LayerNorm layer (legacy alias).""" - - def __init__(self, weight: GPUArray, bias: GPUArray, eps: float = 1e-5): - super().__init__(weight, bias, "layernorm", eps) - - -# Legacy LlamaAttention alias +# Legacy component aliases +RMSNorm = Norm # Use Norm with norm_type="rmsnorm" +LayerNorm = Norm # Use Norm with norm_type="layernorm" LlamaAttention = Attention - - -# Legacy LlamaMLP -class LlamaMLP(MLP): - """LLaMA MLP (legacy alias).""" - - def __init__( - self, - gate_proj: GPUArray, - up_proj: GPUArray, - down_proj: GPUArray, - ): - # Create minimal config for SwiGLU - config = TransformerConfig(activation="silu") - super().__init__(config, gate_proj=gate_proj, up_proj=up_proj, down_proj=down_proj) - - -# Legacy LlamaBlock alias +LlamaMLP = MLP LlamaBlock = TransformerBlock - - -# Legacy LlamaModel alias -class LlamaModel(CausalTransformerModel): - """LLaMA model (legacy alias).""" - - pass - - -# Legacy GPT2Model alias -class GPT2Model(CausalTransformerModel): - """GPT-2 model (legacy alias).""" - - def lm_head(self, hidden: GPUArray) -> GPUArray: - """Legacy lm_head method.""" - return self.get_logits(hidden) +CausalSelfAttention = Attention # ============================================================================= -# Legacy Attention Classes (for backward compatibility) -# ============================================================================= - - -class CausalSelfAttention(Attention): - """GPT-2 style causal self-attention (legacy alias).""" - - def __init__( - self, - c_attn_weight: GPUArray, - c_attn_bias: GPUArray | None, - c_proj_weight: GPUArray, - c_proj_bias: GPUArray | None, - n_head: int, - n_embd: int, - ): - # GPT-2 uses combined QKV projection - # Split weights for unified Attention class - # c_attn: [3*n_embd, n_embd] -> Q, K, V each [n_embd, n_embd] - c_attn_np = c_attn_weight.to_numpy() 
- q_weight = from_numpy(c_attn_np[:n_embd].copy()) - k_weight = from_numpy(c_attn_np[n_embd : 2 * n_embd].copy()) - v_weight = from_numpy(c_attn_np[2 * n_embd :].copy()) - - q_bias, k_bias, v_bias = None, None, None - if c_attn_bias is not None: - c_attn_bias_np = c_attn_bias.to_numpy() - q_bias = from_numpy(c_attn_bias_np[:n_embd].copy()) - k_bias = from_numpy(c_attn_bias_np[n_embd : 2 * n_embd].copy()) - v_bias = from_numpy(c_attn_bias_np[2 * n_embd :].copy()) - - config = TransformerConfig( - hidden_size=n_embd, - num_heads=n_head, - num_kv_heads=n_head, # MHA - norm_type="layernorm", - activation="gelu", - use_rope=False, - causal=True, - ) - - super().__init__( - q_weight, - k_weight, - v_weight, - c_proj_weight, - config, - q_bias, - k_bias, - v_bias, - c_proj_bias, - ) - - -# ============================================================================= -# Legacy MLP Class (for backward compatibility) -# ============================================================================= - - -class _LegacyMLP(MLP): - """GPT-2 style MLP (legacy).""" - - def __init__( - self, - c_fc_weight: GPUArray, - c_fc_bias: GPUArray | None, - c_proj_weight: GPUArray, - c_proj_bias: GPUArray | None, - ): - config = TransformerConfig(activation="gelu") - super().__init__( - config, - fc1_weight=c_fc_weight, - fc1_bias=c_fc_bias, - fc2_weight=c_proj_weight, - fc2_bias=c_proj_bias, - ) - - -# ============================================================================= -# Safetensors Loaders (thin wrappers over load_model_from_safetensors) +# Safetensors Loaders # ============================================================================= def load_gpt2_from_safetensors( model_path: str, - config: GPT2Config | None = None, # Ignored, auto-detected dtype: str = "float32", -) -> GPT2Model: +) -> CausalTransformerModel: """Load GPT-2 model from safetensors file. Args: model_path: Path to model.safetensors - config: Ignored (auto-detected from tensor shapes) dtype: Weight dtype ("float32" or "float16") Returns: - GPT2Model instance (CausalTransformerModel) + CausalTransformerModel instance """ - # Note: config parameter is ignored; config is auto-detected - model = load_model_from_safetensors(model_path, dtype=dtype, spec=GPT2_SPEC) - # Wrap as GPT2Model for backward compatibility - return GPT2Model( - model.config, - model.embed_tokens, - model.blocks, - model.final_norm, - model.lm_head, - model.position_embed, - ) + return load_model_from_safetensors(model_path, dtype=dtype, spec=GPT2_SPEC) def load_llama_from_safetensors( model_path: str, - config: LlamaConfig | None = None, # Ignored, auto-detected dtype: str = "float32", -) -> LlamaModel: +) -> CausalTransformerModel: """Load Llama model from safetensors file. 
Args: model_path: Path to model.safetensors - config: Ignored (auto-detected from tensor shapes) dtype: Weight dtype ("float32" or "float16") Returns: - LlamaModel instance (CausalTransformerModel) + CausalTransformerModel instance """ - # Note: config parameter is ignored; config is auto-detected - model = load_model_from_safetensors(model_path, dtype=dtype, spec=LLAMA_SPEC) - # Wrap as LlamaModel for backward compatibility - return LlamaModel( - model.config, - model.embed_tokens, - model.blocks, - model.final_norm, - model.lm_head, - model.position_embed, - ) + return load_model_from_safetensors(model_path, dtype=dtype, spec=LLAMA_SPEC) # ============================================================================= @@ -1331,20 +1193,17 @@ def to_transformer_config(self) -> TransformerConfig: def load_qwen3_from_safetensors( model_path: str, - config: Qwen3Config | None = None, # Ignored, auto-detected dtype: str = "float32", ) -> CausalTransformerModel: """Load Qwen3 model from safetensors file. Args: model_path: Path to model.safetensors or model.safetensors.index.json - config: Ignored (auto-detected from tensor shapes) dtype: Weight dtype ("float32" or "float16") Returns: CausalTransformerModel instance """ - # Note: config parameter is ignored; config is auto-detected return load_model_from_safetensors(model_path, dtype=dtype, spec=QWEN3_SPEC) @@ -1621,5 +1480,5 @@ def required_name(pattern: str, layer: int) -> str: lm_head = load_tensor(spec.lm_head) return CausalTransformerModel( - transformer_config, embed_tokens, blocks, final_norm, lm_head, position_embed + transformer_config, embed_tokens, blocks, final_norm, lm_head, position_embed, spec ) From 750dd029b6bf8e55c765270a74a436f053145059 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 13:07:15 +0900 Subject: [PATCH 10/14] test(llm): add unified interface tests for v0.2.9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive tests for the ModelSpec refactor: - Type aliases (GPT2Model = LlamaModel = CausalTransformerModel) - Component aliases (RMSNorm = Norm, CausalSelfAttention = Attention) - ModelSpec instances and registry - Automatic model detection from tensor names - Simplified loader signatures (no config parameter) - CausalTransformerModel spec attribute - All expected exports from pygpukit.llm 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- tests/test_llm_unified.py | 232 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 tests/test_llm_unified.py diff --git a/tests/test_llm_unified.py b/tests/test_llm_unified.py new file mode 100644 index 0000000..bce0317 --- /dev/null +++ b/tests/test_llm_unified.py @@ -0,0 +1,232 @@ +"""Test unified LLM interface (v0.2.9 ModelSpec refactor).""" + +import pytest + + +def test_type_aliases(): + """Test that GPT2Model and LlamaModel are aliases for CausalTransformerModel.""" + from pygpukit.llm import ( + CausalTransformerModel, + GPT2Model, + LlamaModel, + ) + + # All should be the exact same class + assert GPT2Model is CausalTransformerModel + assert LlamaModel is CausalTransformerModel + + +def test_component_aliases(): + """Test that component aliases point to unified classes.""" + from pygpukit.llm import ( + Attention, + CausalSelfAttention, + LlamaAttention, + LlamaBlock, + LlamaMLP, + MLP, + Norm, + RMSNorm, + TransformerBlock, + ) + + # Attention aliases + assert CausalSelfAttention is Attention + assert LlamaAttention is 
Attention + + # MLP aliases + assert LlamaMLP is MLP + + # Block aliases + assert LlamaBlock is TransformerBlock + + # Norm aliases (RMSNorm = Norm now) + assert RMSNorm is Norm + + +def test_model_specs_exist(): + """Test that ModelSpec instances are defined correctly.""" + from pygpukit.llm import ( + GPT2_SPEC, + LLAMA_SPEC, + MODEL_SPECS, + QWEN3_SPEC, + ModelSpec, + ) + + # All specs should be ModelSpec instances + assert isinstance(GPT2_SPEC, ModelSpec) + assert isinstance(LLAMA_SPEC, ModelSpec) + assert isinstance(QWEN3_SPEC, ModelSpec) + + # Check names + assert GPT2_SPEC.name == "gpt2" + assert LLAMA_SPEC.name == "llama" + assert QWEN3_SPEC.name == "qwen3" + + # Check architecture flags + assert GPT2_SPEC.norm_type == "layernorm" + assert GPT2_SPEC.activation == "gelu" + assert GPT2_SPEC.use_rope is False + assert GPT2_SPEC.use_position_embed is True + + assert LLAMA_SPEC.norm_type == "rmsnorm" + assert LLAMA_SPEC.activation == "silu" + assert LLAMA_SPEC.use_rope is True + assert LLAMA_SPEC.use_qk_norm is False + + assert QWEN3_SPEC.norm_type == "rmsnorm" + assert QWEN3_SPEC.activation == "silu" + assert QWEN3_SPEC.use_rope is True + assert QWEN3_SPEC.use_qk_norm is True + assert QWEN3_SPEC.default_norm_eps == 1e-6 + assert QWEN3_SPEC.default_rope_theta == 1000000.0 + + # Check MODEL_SPECS registry + assert MODEL_SPECS["gpt2"] is GPT2_SPEC + assert MODEL_SPECS["llama"] is LLAMA_SPEC + assert MODEL_SPECS["qwen3"] is QWEN3_SPEC + assert MODEL_SPECS["qwen2"] is LLAMA_SPEC # Qwen2 uses LLaMA structure + + +def test_detect_model_spec(): + """Test automatic model detection from tensor names.""" + from pygpukit.llm import ( + GPT2_SPEC, + LLAMA_SPEC, + QWEN3_SPEC, + detect_model_spec, + ) + + # GPT-2 detection + gpt2_tensors = ["wte.weight", "wpe.weight", "h.0.ln_1.weight"] + assert detect_model_spec(gpt2_tensors) is GPT2_SPEC + + # LLaMA detection + llama_tensors = ["model.embed_tokens.weight", "model.layers.0.self_attn.q_proj.weight"] + assert detect_model_spec(llama_tensors) is LLAMA_SPEC + + # Qwen3 detection (has q_norm) + qwen3_tensors = [ + "model.embed_tokens.weight", + "model.layers.0.self_attn.q_norm.weight", + "model.layers.0.self_attn.q_proj.weight", + ] + assert detect_model_spec(qwen3_tensors) is QWEN3_SPEC + + +def test_detect_model_spec_unknown(): + """Test that unknown tensor names raise ValueError.""" + from pygpukit.llm import detect_model_spec + + with pytest.raises(ValueError, match="Cannot detect model type"): + detect_model_spec(["unknown.tensor.weight"]) + + +def test_transformer_config(): + """Test TransformerConfig creation and properties.""" + from pygpukit.llm import TransformerConfig + + # Test default values + config = TransformerConfig() + assert config.num_kv_heads == config.num_heads # MHA default + assert config.intermediate_size == 4 * config.hidden_size + + # Test GQA config + gqa_config = TransformerConfig( + hidden_size=4096, + num_heads=32, + num_kv_heads=8, + ) + assert gqa_config.head_dim == 128 # 4096 / 32 + assert gqa_config.num_kv_groups == 4 # 32 / 8 + + +def test_loader_signatures(): + """Test that loader functions have correct signatures.""" + import inspect + + from pygpukit.llm import ( + load_gpt2_from_safetensors, + load_llama_from_safetensors, + load_model_from_safetensors, + load_qwen3_from_safetensors, + ) + + # Check load_model_from_safetensors signature + sig = inspect.signature(load_model_from_safetensors) + params = list(sig.parameters.keys()) + assert "model_path" in params + assert "dtype" in params + assert "spec" in params + + 
# Check simplified loader signatures (no config parameter) + for loader in [ + load_gpt2_from_safetensors, + load_llama_from_safetensors, + load_qwen3_from_safetensors, + ]: + sig = inspect.signature(loader) + params = list(sig.parameters.keys()) + assert "model_path" in params + assert "dtype" in params + # Should NOT have config parameter anymore + assert "config" not in params + + +def test_causal_transformer_model_has_spec(): + """Test that CausalTransformerModel accepts spec parameter.""" + import inspect + + from pygpukit.llm import CausalTransformerModel + + sig = inspect.signature(CausalTransformerModel.__init__) + params = list(sig.parameters.keys()) + assert "spec" in params + + +def test_exports(): + """Test that all expected symbols are exported from pygpukit.llm.""" + from pygpukit import llm + + # Core classes + assert hasattr(llm, "CausalTransformerModel") + assert hasattr(llm, "TransformerConfig") + assert hasattr(llm, "Attention") + assert hasattr(llm, "MLP") + assert hasattr(llm, "Norm") + assert hasattr(llm, "TransformerBlock") + assert hasattr(llm, "Linear") + + # ModelSpec + assert hasattr(llm, "ModelSpec") + assert hasattr(llm, "GPT2_SPEC") + assert hasattr(llm, "LLAMA_SPEC") + assert hasattr(llm, "QWEN3_SPEC") + assert hasattr(llm, "MODEL_SPECS") + assert hasattr(llm, "detect_model_spec") + + # Loaders + assert hasattr(llm, "load_model_from_safetensors") + assert hasattr(llm, "load_gpt2_from_safetensors") + assert hasattr(llm, "load_llama_from_safetensors") + assert hasattr(llm, "load_qwen3_from_safetensors") + + # Type aliases + assert hasattr(llm, "GPT2Model") + assert hasattr(llm, "LlamaModel") + assert hasattr(llm, "CausalSelfAttention") + assert hasattr(llm, "LlamaAttention") + assert hasattr(llm, "LlamaBlock") + assert hasattr(llm, "LlamaMLP") + assert hasattr(llm, "RMSNorm") + assert hasattr(llm, "LayerNorm") + + # Legacy configs (still exported for reference) + assert hasattr(llm, "GPT2Config") + assert hasattr(llm, "LlamaConfig") + assert hasattr(llm, "Qwen3Config") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 03e4e3ae632f233d5646e8e1462ef4d1e9d9da70 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 13:16:41 +0900 Subject: [PATCH 11/14] demo(llm): update E2E demo for unified ModelSpec interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use load_model_from_safetensors with detect_model_spec - Add --dtype argument for FP16/FP32 selection - Show ModelSpec info in detection output - Display model.spec attribute after loading 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- examples/demo_llm_e2e.py | 42 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/examples/demo_llm_e2e.py b/examples/demo_llm_e2e.py index 04d6c0b..8e5acee 100644 --- a/examples/demo_llm_e2e.py +++ b/examples/demo_llm_e2e.py @@ -98,6 +98,13 @@ def main(): action="store_true", help="Run benchmark without text generation", ) + parser.add_argument( + "--dtype", + type=str, + choices=["float32", "float16"], + default="float32", + help="Weight dtype (float32 or float16)", + ) args = parser.parse_args() print("=" * 70) @@ -120,10 +127,10 @@ def main(): try: import pygpukit as gpk from pygpukit.llm import ( - SafeTensorsFile, Tokenizer, - load_gpt2_from_safetensors, - load_llama_from_safetensors, + detect_model_spec, + load_model_from_safetensors, + load_safetensors, ) print("\nPyGPUkit loaded successfully") @@ 
-152,24 +159,20 @@ def main(): # ========================================================================= section("Detecting Model Type") - st = SafeTensorsFile(str(model_path)) + st = load_safetensors(str(model_path)) tensor_names = st.tensor_names - if args.model_type == "auto": - if "model.embed_tokens.weight" in tensor_names: - model_type = "llama" - elif "wte.weight" in tensor_names: - model_type = "gpt2" - else: - print(" Error: Could not auto-detect model type") - print(f" First 10 tensor names: {tensor_names[:10]}") - return 1 - else: - model_type = args.model_type + # Use ModelSpec-based detection + spec = detect_model_spec(tensor_names) + model_type = spec.name print(f" Model type: {model_type.upper()}") + print(f" ModelSpec: {spec.name}") print(f" Total tensors: {len(tensor_names)}") - print(f" File size: {st.file_size / 1024 / 1024:.1f} MB") + print(f" Norm type: {spec.norm_type}") + print(f" Activation: {spec.activation}") + print(f" Use RoPE: {spec.use_rope}") + print(f" Use QK Norm: {spec.use_qk_norm}") # ========================================================================= # Load Model @@ -177,10 +180,7 @@ def main(): section("Loading Model") start = time.perf_counter() - if model_type == "llama": - model = load_llama_from_safetensors(str(model_path)) - else: - model = load_gpt2_from_safetensors(str(model_path)) + model = load_model_from_safetensors(str(model_path), dtype=args.dtype, spec=spec) load_time = (time.perf_counter() - start) * 1000 config = model.config @@ -196,6 +196,8 @@ def main(): print(f" Activation: {config.activation}") print(f" Use RoPE: {config.use_rope}") print(f" Load time: {format_time(load_time)}") + print(f" model.spec: {model.spec.name if model.spec else 'None'}") + print(f" dtype: {args.dtype}") # Estimate model size params = ( From 376ccbc13f30ae9a63fc0c148365c4b50e25838d Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 13:23:02 +0900 Subject: [PATCH 12/14] docs: clarify tokenizer is experimental, update README for v0.2.9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add tokenizer policy section: PyGPUkit delegates tokenization to HuggingFace - Mark pygpukit.llm.Tokenizer as EXPERIMENTAL with detailed docstring - Update README LLM section with unified interface examples - Add v0.2.9 to roadmap (unified LLM interface, ModelSpec abstraction) - Update API stability table with CausalTransformerModel 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- README.md | 29 ++++++++++++++++++++--------- examples/demo_llm_e2e.py | 7 +++++++ src/pygpukit/llm/__init__.py | 22 +++++++++++++++++++++- tests/test_llm_unified.py | 2 +- 4 files changed, 49 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 56fdc5a..ed36625 100644 --- a/README.md +++ b/README.md @@ -84,25 +84,34 @@ Runtime SM detection with architecture-optimized kernel variants: PyGPUkit includes built-in support for loading and running LLM models. See the [LLM Guide](docs/llm.md) for detailed documentation. +**Important:** PyGPUkit's core responsibility is **GPU execution**, not tokenization. 
+- The model API expects **token IDs as input**, not raw text +- For production tokenization, use [HuggingFace tokenizers](https://github.com/huggingface/tokenizers) +- The built-in `Tokenizer` class is **experimental** and intended for demos only + ```python -from pygpukit.llm import SafeTensorsFile, Tokenizer +from pygpukit.llm import SafeTensorsFile, load_model_from_safetensors, detect_model_spec # Load safetensors (memory-mapped, zero-copy) st = SafeTensorsFile("model.safetensors") print(f"Tensors: {st.num_tensors}, Size: {st.file_size / 1e9:.2f} GB") -# Tokenizer (HuggingFace format) -tok = Tokenizer("tokenizer.json") -ids = tok.encode("Hello, world!") -text = tok.decode(ids) +# Load model with automatic architecture detection +spec = detect_model_spec(st.tensor_names) +model = load_model_from_safetensors("model.safetensors", dtype="float16", spec=spec) + +# Generate with token IDs (use HuggingFace tokenizers for production) +input_ids = [1, 2, 3, 4] # Your tokenizer's output +output_ids = model.generate(input_ids, max_new_tokens=32) ``` | Component | Description | |-----------|-------------| | `SafeTensorsFile` | Memory-mapped .safetensors loading | -| `Tokenizer` | BPE tokenizer (HuggingFace format) | -| `GPT2Model` | GPT-2 model (MLP-only MVP) | -| `Linear`, `LayerNorm`, `MLP` | Model building blocks | +| `CausalTransformerModel` | Unified model for GPT-2, LLaMA, Qwen3 | +| `load_model_from_safetensors` | Load model with auto-detection | +| `detect_model_spec` | Auto-detect model architecture | +| `Tokenizer` | **Experimental** BPE tokenizer (demos only) | --- @@ -446,6 +455,7 @@ PyGPUkit/ | **v0.2.6** | **CUTLASS backend** (31 TFLOPS TF32, 63 TFLOPS FP16/BF16), Multi-LLM concurrent execution | | **v0.2.7** | **Epilogue fusion** (linear+bias+gelu), Multi-SM kernels, API review | | **v0.2.8** | CUTLASS v4.3.3 update, auto-update workflow | +| **v0.2.9** | **Unified LLM interface** (CausalTransformerModel), ModelSpec abstraction, GPT-2/LLaMA/Qwen3 support | ### Planned @@ -473,7 +483,8 @@ All functions exported via `pygpukit.*` are part of the stable public API: | **Reductions** | `sum`, `mean`, `max` | | **Neural** | `layernorm`, `bias_add_inplace`, `linear_bias_gelu` | | **Types** | `GPUArray`, `DataType`, `float32`, `float64`, `float16`, `bfloat16` | -| **LLM** | `llm.SafeTensorsFile`, `llm.Tokenizer`, `llm.GPT2Model`, `llm.Linear` | +| **LLM** | `llm.SafeTensorsFile`, `llm.CausalTransformerModel`, `llm.load_model_from_safetensors` | +| **LLM (Experimental)** | `llm.Tokenizer` (use HuggingFace tokenizers for production) | ### Deprecation Policy APIs to be removed will emit `DeprecationWarning` for at least one minor version before removal. diff --git a/examples/demo_llm_e2e.py b/examples/demo_llm_e2e.py index 8e5acee..99be1f7 100644 --- a/examples/demo_llm_e2e.py +++ b/examples/demo_llm_e2e.py @@ -15,6 +15,13 @@ Supported models: - LLaMA 2/3 (any size in safetensors format) - GPT-2 (safetensors format) + +Note on Tokenizer: + The built-in pygpukit.llm.Tokenizer is EXPERIMENTAL and intended for demos only. + It may not work with all tokenizer.json formats (e.g., Qwen3). 
+ For production use, we recommend HuggingFace tokenizers: + pip install tokenizers + from tokenizers import Tokenizer """ from __future__ import annotations diff --git a/src/pygpukit/llm/__init__.py b/src/pygpukit/llm/__init__.py index 457a5bb..33b5c10 100644 --- a/src/pygpukit/llm/__init__.py +++ b/src/pygpukit/llm/__init__.py @@ -365,12 +365,32 @@ def load_safetensors(path: str) -> SafeTensorsFile | ShardedSafeTensorsFile: class Tokenizer: """BPE Tokenizer for GPT-2 style models. - Loads tokenizer.json format and provides basic encode/decode functionality. + **⚠️ EXPERIMENTAL: This tokenizer is intended for demos and testing only.** + + For production use, we recommend HuggingFace tokenizers: + - https://github.com/huggingface/tokenizers + - pip install tokenizers + + PyGPUkit's core responsibility is GPU execution, not tokenization. + The model API expects token IDs as input - use your preferred tokenizer + to convert text to token IDs before passing to PyGPUkit models. + + Limitations: + - Only supports a subset of HuggingFace tokenizer.json formats + - May not work with all models (e.g., Qwen3 uses unsupported format) + - No chat template support + - No special token handling beyond BOS/EOS/PAD Example: + >>> # For demos/testing only >>> tok = Tokenizer("tokenizer.json") >>> ids = tok.encode("Hello, world!") >>> text = tok.decode(ids) + + >>> # For production, use HuggingFace tokenizers: + >>> from tokenizers import Tokenizer as HFTokenizer + >>> hf_tok = HFTokenizer.from_file("tokenizer.json") + >>> ids = hf_tok.encode("Hello, world!").ids """ def __init__(self, path: str): diff --git a/tests/test_llm_unified.py b/tests/test_llm_unified.py index bce0317..27bd075 100644 --- a/tests/test_llm_unified.py +++ b/tests/test_llm_unified.py @@ -19,12 +19,12 @@ def test_type_aliases(): def test_component_aliases(): """Test that component aliases point to unified classes.""" from pygpukit.llm import ( + MLP, Attention, CausalSelfAttention, LlamaAttention, LlamaBlock, LlamaMLP, - MLP, Norm, RMSNorm, TransformerBlock, From 0676eb4d65fa18b4cebcf02b35cf68c4f26f6639 Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 13:33:56 +0900 Subject: [PATCH 13/14] docs(readme): add What's New in v0.2.9 section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Unified LLM Interface (CausalTransformerModel + ModelSpec) - Multi-architecture support: GPT-2, LLaMA 2/3, Qwen3 - Hybrid attention execution (CPU decode / GPU prefill) - New LLM operations: sdpa_causal, rope_inplace, silu, rmsnorm - Sharded model support for large models - Updated documentation table 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- README.md | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed36625..9367b21 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ |-------|-------------| | [Getting Started](docs/getting-started.md) | Installation, quick start, basic usage | | [API Reference](docs/api.md) | Complete API documentation with examples | -| [LLM Guide](docs/llm.md) | SafeTensors, Tokenizer, GPT-2 model loading | +| [LLM Guide](docs/llm.md) | SafeTensors, GPT-2/LLaMA/Qwen3 inference | | [Performance Tuning](docs/performance.md) | TF32, FP16, CUTLASS optimization | | [Scheduler Guide](docs/scheduler.md) | Multi-LLM concurrent execution | @@ -33,6 +33,65 @@ PyGPUkit aims to be the "micro-runtime for GPU computing": small, fast, 
and idea --- +## What's New in v0.2.9 + +### Unified LLM Interface +A single `CausalTransformerModel` now supports multiple architectures through the `ModelSpec` abstraction. + +| Architecture | Features | Status | +|--------------|----------|--------| +| **GPT-2** | LayerNorm, GELU, Position Embedding | ✅ Tested | +| **LLaMA 2/3** | RMSNorm, SiLU, RoPE, GQA | ✅ Tested | +| **Qwen3** | RMSNorm, SiLU, RoPE, GQA, QK-Norm | ✅ Tested | + +```python +from pygpukit.llm import load_model_from_safetensors, detect_model_spec, load_safetensors + +# Auto-detect and load any supported model +st = load_safetensors("model.safetensors") +spec = detect_model_spec(st.tensor_names) # Returns GPT2_SPEC, LLAMA_SPEC, or QWEN3_SPEC +model = load_model_from_safetensors("model.safetensors", dtype="float16", spec=spec) + +# Generate with KV-cache +output_ids = model.generate( + input_ids, + max_new_tokens=64, + temperature=0.7, + top_k=50, + top_p=0.9, + use_cache=True, # KV-cache for efficient generation +) +``` + +### Hybrid Attention Execution +Automatic CPU/GPU switching for optimal performance: + +| Phase | Backend | Reason | +|-------|---------|--------| +| **Prefill** (seq_len > 1) | GPU SDPA | Parallelizable | +| **Decode** (seq_len = 1) | CPU | Avoids kernel launch overhead | + +### New LLM Operations +| Operation | Description | +|-----------|-------------| +| `gpk.sdpa_causal(q, k, v)` | Scaled Dot-Product Attention with causal mask | +| `gpk.rope_inplace(x, freqs)` | Rotary Position Embedding (in-place) | +| `gpk.silu(x)` | SiLU/Swish activation | +| `gpk.rmsnorm(x, weight, eps)` | RMS Layer Normalization | + +### Sharded Model Support +Load large models split across multiple safetensors files: + +```python +from pygpukit.llm import load_safetensors + +# Automatically handles sharded models +st = load_safetensors("model.safetensors.index.json") # Returns ShardedSafeTensorsFile +print(f"Shards: {len(st._shard_files)}, Tensors: {st.num_tensors}") +``` + +--- + ## What's New in v0.2.7 ### CUTLASS Epilogue Fusion From 13bd676324092178d9d01e93934203cc614c432d Mon Sep 17 00:00:00 2001 From: m96-chan Date: Tue, 16 Dec 2025 13:39:08 +0900 Subject: [PATCH 14/14] fix(llm): resolve mypy type errors in model.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add assertions for num_kv_heads (set in __post_init__) - Add type annotations for _cos/_sin as Optional[ndarray] - Use hidden_np for numpy array, hidden for GPUArray - Fix return type annotations for __call__ method - Add assertions for _cos/_sin before indexing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/pygpukit/llm/model.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/pygpukit/llm/model.py b/src/pygpukit/llm/model.py index 668f584..cc33fc9 100644 --- a/src/pygpukit/llm/model.py +++ b/src/pygpukit/llm/model.py @@ -396,6 +396,7 @@ def head_dim(self) -> int: @property def num_kv_groups(self) -> int: """Number of query heads per KV head (for GQA).""" + assert self.num_kv_heads is not None # Set in __post_init__ return self.num_heads // self.num_kv_heads @@ -614,10 +615,13 @@ def __init__( self.config = config self.head_dim = config.head_dim self.num_heads = config.num_heads - self.num_kv_heads = config.num_kv_heads + assert config.num_kv_heads is not None # Set in __post_init__ + self.num_kv_heads: int = config.num_kv_heads self.num_kv_groups = config.num_kv_groups # Precompute RoPE if enabled + self._cos: 
np.ndarray | None + self._sin: np.ndarray | None if config.use_rope: self._cos, self._sin = precompute_freqs_cis( self.head_dim, config.max_position_embeddings, config.rope_theta @@ -689,6 +693,7 @@ def _forward_gpu( # Apply RoPE on GPU (requires FP32) if self.config.use_rope: + assert self._cos is not None and self._sin is not None cos = from_numpy(self._cos[position_ids].astype(np.float32)) sin = from_numpy(self._sin[position_ids].astype(np.float32)) # RoPE only supports FP32, convert if needed @@ -773,6 +778,7 @@ def _forward_cpu( # Apply RoPE (CPU) if self.config.use_rope: + assert self._cos is not None and self._sin is not None cos = self._cos[position_ids] sin = self._sin[position_ids] q, k = apply_rotary_pos_emb_numpy(q, k, cos, sin) @@ -967,9 +973,9 @@ def __call__( self, input_ids: list[int], position_ids: list[int] | None = None, - past_key_values: list[tuple] | None = None, + past_key_values: list[tuple | None] | None = None, use_cache: bool = False, - ) -> tuple[GPUArray, list[tuple] | None]: + ) -> tuple[GPUArray, list[tuple | None] | None]: """Forward pass. Args: @@ -992,14 +998,14 @@ def __call__( # Token embeddings (preserve dtype) embed_np = self.embed_tokens.to_numpy() - hidden = embed_np[input_ids] + hidden_np = embed_np[input_ids] # Add position embeddings (GPT-2 style) if self.position_embed is not None: pos_embed_np = self.position_embed.to_numpy() - hidden = hidden + pos_embed_np[position_ids] + hidden_np = hidden_np + pos_embed_np[position_ids] - hidden = from_numpy(hidden.astype(embed_np.dtype)) + hidden: GPUArray = from_numpy(hidden_np.astype(embed_np.dtype)) # Transformer blocks present_key_values = []