From 86c528ff9693a63727164389b634bd8b8c5a5780 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Mon, 22 Sep 2025 16:43:11 +0800 Subject: [PATCH 1/9] chore: housekeeping --- Cargo.lock | 29 +------ Cargo.toml | 2 +- src/adam.rs | 2 +- src/embeddings.rs | 57 ++++++++----- src/feed_forward.rs | 25 +++--- src/layer_norm.rs | 35 ++++---- src/lib.rs | 16 ++-- src/llm.rs | 140 +++++++++++++++++++------------- src/main.rs | 83 ++++++++++--------- src/output_projection.rs | 20 +++-- src/self_attention.rs | 66 +++++++-------- src/transformer.rs | 26 +++--- src/vocab.rs | 8 +- tests/adam_test.rs | 30 +++---- tests/embeddings_test.rs | 35 ++++---- tests/feed_forward_test.rs | 24 +++--- tests/llm_test.rs | 54 ++++++------ tests/output_projection_test.rs | 50 ++++++------ tests/self_attention_test.rs | 18 ++-- tests/transformer_test.rs | 12 +-- tests/vocab_test.rs | 8 +- 21 files changed, 387 insertions(+), 353 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d62639f..ed56ac6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,7 +133,7 @@ version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" dependencies = [ - "zerocopy 0.7.35", + "zerocopy", ] [[package]] @@ -156,13 +156,12 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.0" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha", "rand_core", - "zerocopy 0.8.23", ] [[package]] @@ -306,16 +305,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", - "zerocopy-derive 0.7.35", -] - -[[package]] -name = "zerocopy" -version = "0.8.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd97444d05a4328b90e75e503a34bad781f14e28a823ad3557f0750df1ebcbc6" -dependencies = [ - "zerocopy-derive 0.8.23", + "zerocopy-derive", ] [[package]] @@ -328,14 +318,3 @@ dependencies = [ "quote", "syn", ] - -[[package]] -name = "zerocopy-derive" -version = "0.8.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6352c01d0edd5db859a63e2605f4ea3183ddbd15e2c4a9e7d32184df75e4f154" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/Cargo.toml b/Cargo.toml index 6467962..08efc31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ edition = "2024" [dependencies] ndarray = "0.16.1" -rand = "0.9.0" +rand = "0.9.2" rand_distr = "0.5.0" [dev-dependencies] diff --git a/src/adam.rs b/src/adam.rs index 744f2dc..c7dd922 100644 --- a/src/adam.rs +++ b/src/adam.rs @@ -33,4 +33,4 @@ impl Adam { *params -= &(update * lr); } -} \ No newline at end of file +} diff --git a/src/embeddings.rs b/src/embeddings.rs index fd15aa4..65f5f7e 100644 --- a/src/embeddings.rs +++ b/src/embeddings.rs @@ -1,6 +1,6 @@ -use ndarray::{s, Array2}; -use rand_distr::{Normal, Distribution}; -use crate::{vocab::Vocab, llm::Layer, EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam}; +use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab}; +use ndarray::{Array2, s}; +use rand_distr::{Distribution, Normal}; pub struct Embeddings { pub token_embeddings: Array2, @@ -10,20 +10,19 @@ pub struct Embeddings { pub positional_optimizer: Adam, } -impl Default for Embeddings { +impl Default 
for Embeddings { fn default() -> Self { - Self { + Self { token_embeddings: Self::init_embeddings(Vocab::default_words().len(), EMBEDDING_DIM), positional_embeddings: Self::init_positional_embeddings(MAX_SEQ_LEN, EMBEDDING_DIM), cached_input: None, token_optimizer: Adam::new((Vocab::default_words().len(), EMBEDDING_DIM)), - positional_optimizer: Adam::new((MAX_SEQ_LEN, EMBEDDING_DIM)) + positional_optimizer: Adam::new((MAX_SEQ_LEN, EMBEDDING_DIM)), } } } impl Embeddings { - pub fn new(vocab: Vocab) -> Self { Self { token_embeddings: Self::init_embeddings(vocab.words.len(), EMBEDDING_DIM), @@ -50,26 +49,35 @@ impl Embeddings { let mut token_embeds = Array2::::zeros((token_ids.len(), embeddings.ncols())); for (i, &token_id) in token_ids.iter().enumerate() { if token_id >= embeddings.nrows() { - panic!("Token ID {} out of bounds for vocab size {}", token_id, embeddings.nrows()); + panic!( + "Token ID {} out of bounds for vocab size {}", + token_id, + embeddings.nrows() + ); } token_embeds.row_mut(i).assign(&embeddings.row(token_id)); } token_embeds } - fn get_positional_embeddings(positional_encodings: &Array2, seq_len: usize) -> Array2 { + fn get_positional_embeddings( + positional_encodings: &Array2, + seq_len: usize, + ) -> Array2 { if seq_len > positional_encodings.nrows() { - panic!("Sequence length {} exceeds maximum {}", seq_len, positional_encodings.nrows()); + panic!( + "Sequence length {} exceeds maximum {}", + seq_len, + positional_encodings.nrows() + ); } positional_encodings.slice(s![0..seq_len, ..]).to_owned() } - pub fn embed_tokens( - &self, - token_ids: &[usize] - ) -> Array2 { + pub fn embed_tokens(&self, token_ids: &[usize]) -> Array2 { let token_embeds = Self::get_token_embeddings(&self.token_embeddings, token_ids); - let position_embeds = Self::get_positional_embeddings(&self.positional_embeddings, token_ids.len()); + let position_embeds = + Self::get_positional_embeddings(&self.positional_embeddings, token_ids.len()); token_embeds + position_embeds // Element-wise sum } } @@ -79,7 +87,8 @@ impl Layer for Embeddings { "Embeddings" } - fn forward(&mut self, input: &Array2) -> Array2 { // input shape is [1, sequence_length] + fn forward(&mut self, input: &Array2) -> Array2 { + // input shape is [1, sequence_length] self.cached_input = Some(input.clone()); let token_ids: Vec = input.iter().map(|&x| x as usize).collect(); self.embed_tokens(&token_ids) // shape is [sequence_length, embedding_dim] @@ -96,16 +105,20 @@ impl Layer for Embeddings { for (i, &token_id) in token_ids.iter().enumerate() { if token_id >= self.token_embeddings.nrows() { - panic!("Token ID {} out of bounds for vocab size {}", token_id, self.token_embeddings.nrows()); + panic!( + "Token ID {} out of bounds for vocab size {}", + token_id, + self.token_embeddings.nrows() + ); } let grad_row = grads.row(i); - + // Accumulate token embedding gradients efficiently (no temp variable) { let mut token_row = token_grads.row_mut(token_id); token_row += &grad_row; } - + // Accumulate positional embedding gradients efficiently (no temp variable) { let mut pos_row = positional_grads.row_mut(i); @@ -113,8 +126,10 @@ impl Layer for Embeddings { } } - self.token_optimizer.step(&mut self.token_embeddings, &token_grads, lr); - self.positional_optimizer.step(&mut self.positional_embeddings, &positional_grads, lr); + self.token_optimizer + .step(&mut self.token_embeddings, &token_grads, lr); + self.positional_optimizer + .step(&mut self.positional_embeddings, &positional_grads, lr); // Return gradient to propagate further 
back grads.to_owned() diff --git a/src/feed_forward.rs b/src/feed_forward.rs index c141763..dd74c94 100644 --- a/src/feed_forward.rs +++ b/src/feed_forward.rs @@ -1,7 +1,7 @@ +use crate::{adam::Adam, llm::Layer}; use ndarray::Array2; use ndarray::Axis; -use rand_distr::{Normal, Distribution}; -use crate::{adam::Adam, llm::Layer}; +use rand_distr::{Distribution, Normal}; pub struct FeedForward { w1: Array2, @@ -24,15 +24,15 @@ impl FeedForward { /// Initialize a feedforward layer with random weights pub fn new(embedding_dim: usize, hidden_dim: usize) -> Self { let mut rng = rand::rng(); - + // Xavier/He initialization for w1: std = sqrt(2 / fan_in) let std_w1 = (2.0 / embedding_dim as f32).sqrt(); let normal_w1 = Normal::new(0.0, std_w1).unwrap(); - - // Xavier/He initialization for w2: std = sqrt(2 / fan_in) + + // Xavier/He initialization for w2: std = sqrt(2 / fan_in) let std_w2 = (2.0 / hidden_dim as f32).sqrt(); let normal_w2 = Normal::new(0.0, std_w2).unwrap(); - + FeedForward { w1: Array2::from_shape_fn((embedding_dim, hidden_dim), |_| normal_w1.sample(&mut rng)), b1: Array2::zeros((1, hidden_dim)), // Bias initialized to 0 @@ -73,12 +73,14 @@ impl Layer for FeedForward { // Gradient w.r.t. W1 and b1 let grad_w1 = input.t().dot(&grad_hidden_pre_activation); - let grad_b1 = grad_hidden_pre_activation.sum_axis(Axis(0)).insert_axis(Axis(0)); // Shape: [1, hidden_dim] + let grad_b1 = grad_hidden_pre_activation + .sum_axis(Axis(0)) + .insert_axis(Axis(0)); // Shape: [1, hidden_dim] // Gradient w.r.t. input (through feed-forward computation) let grad_input_feedforward = grad_hidden_pre_activation.dot(&self.w1.t()); - - // Add gradient from residual connection + + // Add gradient from residual connection // Forward: output = W2(ReLU(W1*input + b1)) + b2 + input // Backward: grad_input = grad_feedforward + grad_residual let grad_input = grad_input_feedforward + grads; @@ -93,10 +95,9 @@ impl Layer for FeedForward { } fn forward(&mut self, input: &Array2) -> Array2 { - let hidden_pre_activation = input.dot(&self.w1) + &self.b1; let hidden_post_activation = hidden_pre_activation.mapv(|x| x.max(0.0)); // ReLU - + let output = hidden_post_activation.dot(&self.w2) + &self.b2; // Cache values @@ -106,4 +107,4 @@ impl Layer for FeedForward { output + input // residual connection (no LayerNorm here) } -} \ No newline at end of file +} diff --git a/src/layer_norm.rs b/src/layer_norm.rs index 7277f7f..f326641 100644 --- a/src/layer_norm.rs +++ b/src/layer_norm.rs @@ -1,10 +1,10 @@ use crate::adam::Adam; +use crate::llm::Layer; use ndarray::Array2; use ndarray::Axis; -use crate::llm::Layer; pub struct LayerNorm { - epsilon: f32, // Small constant for stability + epsilon: f32, // Small constant for stability gamma: Array2, // Learnable scaling parameter beta: Array2, // Learnable bias parameter @@ -58,32 +58,39 @@ impl Layer for LayerNorm { let input = self.cached_input.as_ref().unwrap(); let mean = self.cached_mean.as_ref().unwrap(); let std = self.cached_std.as_ref().unwrap(); - + let normalized = (input - mean) / (std + self.epsilon); let n_features = input.shape()[1] as f32; // Number of features per token - + // Gradients w.r.t. gamma and beta let grad_gamma = (&normalized * grads).sum_axis(Axis(0)).insert_axis(Axis(0)); let grad_beta = grads.sum_axis(Axis(0)).insert_axis(Axis(0)); - + // Gradient w.r.t. 
normalized values let grad_normalized = &self.gamma * grads; - + // LayerNorm backward pass with full chain rule let grad_input = { let variance = std * std + self.epsilon; - let grad_var = (&grad_normalized * &normalized).sum_axis(Axis(1)).insert_axis(Axis(1)) * (-0.5) / variance.mapv(|x| x * x.sqrt()); - let grad_mean = grad_normalized.sum_axis(Axis(1)).insert_axis(Axis(1)) * (-1.0) / (std + self.epsilon) + &grad_var * (input - mean).sum_axis(Axis(1)).insert_axis(Axis(1)) * (-2.0) / n_features; - - &grad_normalized / (std + self.epsilon) + - &grad_var * 2.0 * (input - mean) / n_features + - &grad_mean / n_features + let grad_var = (&grad_normalized * &normalized) + .sum_axis(Axis(1)) + .insert_axis(Axis(1)) + * (-0.5) + / variance.mapv(|x| x * x.sqrt()); + let grad_mean = grad_normalized.sum_axis(Axis(1)).insert_axis(Axis(1)) * (-1.0) + / (std + self.epsilon) + + &grad_var * (input - mean).sum_axis(Axis(1)).insert_axis(Axis(1)) * (-2.0) + / n_features; + + &grad_normalized / (std + self.epsilon) + + &grad_var * 2.0 * (input - mean) / n_features + + &grad_mean / n_features }; - + // Update learnable parameters self.optimizer_gamma.step(&mut self.gamma, &grad_gamma, lr); self.optimizer_beta.step(&mut self.beta, &grad_beta, lr); - + grad_input } } diff --git a/src/lib.rs b/src/lib.rs index b9a7782..4f09a52 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,19 +1,19 @@ -pub mod llm; +pub mod adam; pub mod embeddings; -pub mod vocab; -pub mod transformer; pub mod feed_forward; -pub mod self_attention; -pub mod output_projection; -pub mod adam; pub mod layer_norm; +pub mod llm; +pub mod output_projection; +pub mod self_attention; +pub mod transformer; +pub mod vocab; // Re-export key structs for easier access -pub use vocab::Vocab; pub use embeddings::Embeddings; pub use llm::LLM; pub use llm::Layer; +pub use vocab::Vocab; // Constants pub const MAX_SEQ_LEN: usize = 40; pub const EMBEDDING_DIM: usize = 32; -pub const HIDDEN_DIM: usize = 32; \ No newline at end of file +pub const HIDDEN_DIM: usize = 32; diff --git a/src/llm.rs b/src/llm.rs index 89be934..7ddfaae 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -1,12 +1,12 @@ -use ndarray::Array1; -use ndarray::{Array2, Axis}; -use crate::transformer::TransformerBlock; -use crate::Embeddings; -use crate::Vocab; -use crate::output_projection::OutputProjection; use crate::EMBEDDING_DIM; +use crate::Embeddings; use crate::HIDDEN_DIM; use crate::MAX_SEQ_LEN; +use crate::Vocab; +use crate::output_projection::OutputProjection; +use crate::transformer::TransformerBlock; +use ndarray::Array1; +use ndarray::{Array2, Axis}; use std::cmp::Ordering; pub trait Layer { fn layer_type(&self) -> &str; @@ -16,6 +16,7 @@ pub trait Layer { fn backward(&mut self, grads: &Array2, lr: f32) -> Array2; } +#[allow(clippy::upper_case_acronyms)] pub struct LLM { pub vocab: Vocab, pub network: Vec>, @@ -38,16 +39,17 @@ impl Default for LLM { impl LLM { pub fn new(vocab: Vocab, network: Vec>) -> Self { - Self { - vocab, - network - } + Self { vocab, network } } } impl LLM { pub fn network_description(&self) -> String { - self.network.iter().map(|layer| layer.layer_type()).collect::>().join(", ") + self.network + .iter() + .map(|layer| layer.layer_type()) + .collect::>() + .join(", ") } pub fn predict(&mut self, text: &str) -> String { @@ -59,7 +61,10 @@ impl LLM { } // Convert token_ids to strings - let token_strs = output_tokens.iter().map(|t| self.vocab.decode[t].clone()).collect::>(); + let token_strs = output_tokens + .iter() + .map(|t| self.vocab.decode[t].clone()) + 
.collect::>(); token_strs.join(" ") } @@ -75,12 +80,12 @@ impl LLM { } let input_len = tokenized.len(); - + // Prevent overflow if input_len >= MAX_SEQ_LEN if input_len >= MAX_SEQ_LEN { return output_tokens; } - + for _ in 0..(MAX_SEQ_LEN - input_len) { // let tokenized_clone = tokenized.clone(); @@ -92,21 +97,25 @@ impl LLM { let token_input = Array2::from_shape_vec( (1, tokenized.len()), tokenized.iter().map(|&x| x as f32).collect(), - ).unwrap(); + ) + .unwrap(); let mut input = token_input; - + for layer in &mut self.network { input = layer.forward(&input); } let logits = input; - + // Safety check: ensure we have at least one token if logits.shape()[0] == 0 { break; } - - let last_logit = logits.row(logits.shape()[0] - 1).to_owned().insert_axis(Axis(0)); + + let last_logit = logits + .row(logits.shape()[0] - 1) + .to_owned() + .insert_axis(Axis(0)); // Softmax - convert activiations of each token to a probability distribution over the vocabulary let probs = Self::softmax(&last_logit); // 1 x vocab_size @@ -114,12 +123,14 @@ impl LLM { // Greedy Decode - Choose the highest probability token for each position let tokens = Self::greedy_decode(&probs); - let next_token = tokens[tokens.len() - 1]; + let next_token = tokens[tokens.len() - 1]; output_tokens.push(next_token); tokenized.push(next_token); - if next_token == self.vocab.encode("").unwrap() { break; } + if next_token == self.vocab.encode("").unwrap() { + break; + } } output_tokens @@ -128,13 +139,15 @@ impl LLM { pub fn train(&mut self, data: Vec<&str>, epochs: usize, lr: f32) { let tokenized_data = data .iter() - .map(|input| (self.tokenize(input))) + .map(|input| self.tokenize(input)) .collect::>>(); for epoch in 0..epochs { let mut total_loss = 0.0; for training_row in &tokenized_data { - if training_row.len() < 2 { continue; } + if training_row.len() < 2 { + continue; + } // 1. 
Slice input and targets let input_ids = &training_row[..training_row.len() - 1]; // Exclude the last token @@ -142,9 +155,11 @@ impl LLM { // Forward pass let mut input: Array2 = Array2::zeros((1, input_ids.len())); - input.row_mut(0).assign(&input_ids.iter().map(|&x| x as f32).collect::>()); + input + .row_mut(0) + .assign(&input_ids.iter().map(|&x| x as f32).collect::>()); - for layer in &mut self.network { + for layer in &mut self.network { input = layer.forward(&input); } @@ -155,10 +170,10 @@ impl LLM { // Backward pass let mut grads_output = Self::compute_gradients_step(&probs, target_ids); // this is d_L/d_output_projection - + // Apply gradient clipping BEFORE backpropagation Self::clip_gradients(&mut grads_output, 5.0); - + for layer in self.network.iter_mut().rev() { grads_output = layer.backward(&grads_output, lr); } @@ -166,17 +181,23 @@ impl LLM { let tokens = Self::greedy_decode(&probs); let next_token = tokens[tokens.len() - 1]; - if next_token == self.vocab.encode("").unwrap() { continue; } + if next_token == self.vocab.encode("").unwrap() { + continue; + } } - - println!("Epoch {}: Loss = {:.4}", epoch, total_loss / tokenized_data.len() as f32); + + println!( + "Epoch {}: Loss = {:.4}", + epoch, + total_loss / tokenized_data.len() as f32 + ); } } pub fn tokenize(&self, text: &str) -> Vec { // Split by whitespace first let mut tokens = Vec::new(); - + for word in text.split_whitespace() { // Special case for end token if word == "" { @@ -185,9 +206,9 @@ impl LLM { } continue; } - + let mut current_word = String::new(); - + for c in word.chars() { if c.is_ascii_punctuation() { // If we have a word before the punctuation, add it @@ -197,7 +218,7 @@ impl LLM { } current_word.clear(); } - + // Add the punctuation as its own token if let Some(token_id) = self.vocab.encode(&c.to_string()) { tokens.push(token_id); @@ -206,46 +227,49 @@ impl LLM { current_word.push(c); } } - + // Add any remaining word - if !current_word.is_empty() { - if let Some(token_id) = self.vocab.encode(¤t_word) { - tokens.push(token_id); - } + if !current_word.is_empty() + && let Some(token_id) = self.vocab.encode(¤t_word) + { + tokens.push(token_id); } } - + tokens } - fn softmax(logits: &Array2) -> Array2 { // logits is seq_len x vocab_size + fn softmax(logits: &Array2) -> Array2 { + // logits is seq_len x vocab_size let mut result = logits.clone(); - + // Apply softmax row-wise for mut row in result.rows_mut() { // Calculate exp for each element let max_val = row.iter().copied().fold(f32::NEG_INFINITY, f32::max); let exp_values: Vec = row.iter().map(|&x| (x - max_val).exp()).collect(); let sum_exp: f32 = exp_values.iter().sum(); - + // Normalize by sum for (i, &exp_val) in exp_values.iter().enumerate() { row[i] = exp_val / sum_exp; } } - + result } fn greedy_decode(probs: &Array2) -> Vec { - probs.map_axis(Axis(1), |row| { - row.iter() - .enumerate() - .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal)) - .map(|(index, _)| index) - .unwrap() - }).to_vec() - } + probs + .map_axis(Axis(1), |row| { + row.iter() + .enumerate() + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal)) + .map(|(index, _)| index) + .unwrap() + }) + .to_vec() + } fn cross_entropy_loss_step(probs: &Array2, target: &[usize]) -> f32 { let mut loss = 0.0; @@ -263,28 +287,28 @@ impl LLM { if probs.shape()[0] != target.len() { panic!("Probs and target must have the same number of rows"); } - + let batch_size = target.len() as f32; - + // Compute correct softmax + cross-entropy gradient: softmax - 
one_hot(target) for row_idx in 0..grads.shape()[0] { grads[[row_idx, target[row_idx]]] -= 1.0; // Convert to: p - y (where y is one-hot) } - + // Normalize by batch size for stable training grads.mapv_inplace(|x| x / batch_size); - + grads } fn clip_gradients(grads: &mut Array2, max_norm: f32) { // Calculate L2 norm of gradients let norm = grads.iter().map(|&x| x * x).sum::().sqrt(); - + // If norm exceeds max_norm, scale gradients down if norm > max_norm { let scale = max_norm / norm; grads.mapv_inplace(|x| x * scale); } } -} \ No newline at end of file +} diff --git a/src/main.rs b/src/main.rs index b08e920..15cdeba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,20 +1,20 @@ use std::io::Write; use embeddings::Embeddings; +use llm::LLM; use output_projection::OutputProjection; use transformer::TransformerBlock; -use llm::LLM; use vocab::Vocab; -mod llm; +mod adam; mod embeddings; -mod vocab; -mod transformer; mod feed_forward; -mod self_attention; -mod output_projection; -mod adam; mod layer_norm; +mod llm; +mod output_projection; +mod self_attention; +mod transformer; +mod vocab; // Use the constants from lib.rs const MAX_SEQ_LEN: usize = 80; @@ -27,10 +27,10 @@ fn main() { // Extract all unique words from training data to create vocabulary let mut vocab_set = std::collections::HashSet::new(); - + // Add end of sequence token vocab_set.insert("".to_string()); - + // Pre-training data - simple text completion patterns let pretraining_data = vec![ "The sun rises in the east and sets in the west ", @@ -72,7 +72,6 @@ fn main() { ("User: How many neurons are in the human brain? Assistant: The human brain contains about 86 billion neurons that transmit information throughout the body "), ("User: What is electricity? Assistant: Electricity is the flow of electrons through a conductor, often used to power devices "), ("User: What is climate change? Assistant: Climate change refers to long-term shifts in temperatures and weather patterns on Earth "), - ("User: How long do oak trees live? Assistant: Oak trees can live for hundreds of years and produce acorns as their fruit "), ("User: What happened to Pluto? Assistant: Pluto was reclassified from a planet to a dwarf planet in 2006 by astronomers "), ("User: How is glass made? Assistant: Glass is made by heating sand, soda ash, and limestone to very high temperatures until they melt "), @@ -83,7 +82,6 @@ fn main() { ("User: What is Jupiter like? Assistant: Jupiter is the largest planet in our solar system and has dozens of moons "), ("User: How fast does light travel? Assistant: Light travels at approximately 299,792 kilometers per second in a vacuum "), ("User: What is gold used for? Assistant: Gold is a dense, soft metal often used in jewelry and electronics due to its conductivity "), - ("User: What covers most of Earth? Assistant: Most of the earth's surface is covered by water, primarily in oceans "), ("User: How do bicycles work? Assistant: Bicycles are an efficient mode of transport that convert human energy into motion through gears "), ("User: How is chocolate made? Assistant: Chocolate is made from roasted and ground cacao seeds, often sweetened and flavored "), @@ -94,7 +92,6 @@ fn main() { ("User: What did the printing press do? Assistant: The printing press revolutionized the spread of information in the 15th century "), ("User: What is sound? Assistant: Sound is a vibration that travels through air, water, or solid materials to reach our ears "), ("User: Why is carbon important? 
Assistant: Carbon is an essential element in organic chemistry, forming the basis of all life on Earth "), - ("User: What was the Library of Alexandria? Assistant: The library of alexandria was one of the most significant libraries of the ancient world "), ("User: What do honeybees do? Assistant: Honeybees play a vital role in pollination, which supports ecosystems and agriculture "), ("User: Are electric cars better? Assistant: Electric vehicles produce less air pollution than traditional gasoline-powered cars "), @@ -105,7 +102,6 @@ fn main() { ("User: How are language models trained? Assistant: Language models are trained using vast amounts of text to learn patterns in language "), ("User: How do compasses work? Assistant: Compasses work by aligning a magnetic needle with the earth's magnetic field "), ("User: What do vaccines do? Assistant: Vaccines help the immune system recognize and fight off specific pathogens "), - // Greeting and conversational patterns ("User: Hello! Assistant: Hello! How can I help you today? "), ("User: Hi! Assistant: Hi there! What can I do for you? "), @@ -121,7 +117,7 @@ fn main() { ("User: I don't understand. Assistant: No problem! Let me try to explain it differently "), ("User: What do you think? Assistant: Based on the information available, I think it's important to consider multiple perspectives "), ]; - + // Process all training examples for vocabulary // First process pre-training data for text in &pretraining_data { @@ -144,7 +140,7 @@ fn main() { } } } - + // Then process chat training data for row in &chat_training_data { // Add words from outputs @@ -167,7 +163,7 @@ fn main() { } } } - + let mut vocab_words: Vec = vocab_set.into_iter().collect(); vocab_words.sort(); // Sort for deterministic ordering let vocab_words_refs: Vec<&str> = vocab_words.iter().map(|s| s.as_str()).collect(); @@ -178,31 +174,42 @@ fn main() { let transformer_block_3 = TransformerBlock::new(EMBEDDING_DIM, HIDDEN_DIM); let output_projection = OutputProjection::new(EMBEDDING_DIM, vocab.words.len()); let embeddings = Embeddings::new(vocab.clone()); - let mut llm = LLM::new(vocab, vec![ - Box::new(embeddings), - Box::new(transformer_block_1), - Box::new(transformer_block_2), - Box::new(transformer_block_3), - Box::new(output_projection), - ]); + let mut llm = LLM::new( + vocab, + vec![ + Box::new(embeddings), + Box::new(transformer_block_1), + Box::new(transformer_block_2), + Box::new(transformer_block_3), + Box::new(output_projection), + ], + ); println!("\n=== MODEL INFORMATION ==="); println!("Network architecture: {}", llm.network_description()); - + println!("\n=== BEFORE TRAINING ==="); println!("Input: {}", string); println!("Output: {}", llm.predict(&string)); - + println!("\n=== PRE-TRAINING MODEL ==="); - println!("Pre-training on {} examples for {} epochs with learning rate {}", - pretraining_data.len(), 100, 0.0005); + println!( + "Pre-training on {} examples for {} epochs with learning rate {}", + pretraining_data.len(), + 100, + 0.0005 + ); llm.train(pretraining_data, 100, 0.0005); - + println!("\n=== INSTRUCTION TUNING ==="); - println!("Instruction tuning on {} examples for {} epochs with learning rate {}", - chat_training_data.len(), 100, 0.0001); + println!( + "Instruction tuning on {} examples for {} epochs with learning rate {}", + chat_training_data.len(), + 100, + 0.0001 + ); llm.train(chat_training_data, 100, 0.0001); // Much lower learning rate for stability - + println!("\n=== AFTER TRAINING ==="); println!("Input: {}", string); let result = 
llm.predict(&string); @@ -213,26 +220,28 @@ fn main() { println!("\n--- Interactive Mode ---"); println!("Type a prompt and press Enter to generate text."); println!("Type 'exit' to quit."); - + let mut input = String::new(); loop { // Clear the input string input.clear(); - + // Prompt for user input print!("\nEnter prompt: "); std::io::stdout().flush().unwrap(); - + // Read user input - std::io::stdin().read_line(&mut input).expect("Failed to read input"); - + std::io::stdin() + .read_line(&mut input) + .expect("Failed to read input"); + // Trim whitespace and check for exit command let trimmed_input = input.trim(); if trimmed_input.eq_ignore_ascii_case("exit") { println!("Exiting interactive mode."); break; } - + // Generate prediction based on user input with "User:" prefix let formatted_input = format!("User: {}", trimmed_input); let prediction = llm.predict(&formatted_input); diff --git a/src/output_projection.rs b/src/output_projection.rs index 4054bcb..9c2be18 100644 --- a/src/output_projection.rs +++ b/src/output_projection.rs @@ -1,13 +1,13 @@ use ndarray::{Array2, Axis}; -use rand_distr::{Normal, Distribution}; +use rand_distr::{Distribution, Normal}; use crate::{adam::Adam, llm::Layer}; pub struct OutputProjection { - pub w_out: Array2, // Weight matrix - pub b_out: Array2, // Bias vector - pub optimizer: Adam, - pub cached_input: Option>, + pub w_out: Array2, // Weight matrix + pub b_out: Array2, // Bias vector + pub optimizer: Adam, + pub cached_input: Option>, } impl OutputProjection { @@ -17,7 +17,7 @@ impl OutputProjection { // Xavier/He initialization: std = sqrt(2 / fan_in) let std = (2.0 / embedding_dim as f32).sqrt(); let normal = Normal::new(0.0, std).unwrap(); - + OutputProjection { w_out: Array2::from_shape_fn((embedding_dim, vocab_size), |_| normal.sample(&mut rng)), b_out: Array2::zeros((1, vocab_size)), @@ -33,12 +33,14 @@ impl Layer for OutputProjection { } /// Forward pass: project embeddings to vocab logits - fn forward(&mut self, input: &Array2) -> Array2 { // input shape is [sequence_length, embedding_dim] + fn forward(&mut self, input: &Array2) -> Array2 { + // input shape is [sequence_length, embedding_dim] self.cached_input = Some(input.clone()); input.dot(&self.w_out) + &self.b_out // shape is [sequence_length, vocab_size] } - fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { // grads shape is [sequence_length, vocab_size] + fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { + // grads shape is [sequence_length, vocab_size] let input = self.cached_input.as_ref().unwrap(); let grad_w_out = input.t().dot(grads); let grad_b_out = grads.mean_axis(Axis(0)).unwrap(); @@ -50,4 +52,4 @@ impl Layer for OutputProjection { grad_input } -} \ No newline at end of file +} diff --git a/src/self_attention.rs b/src/self_attention.rs index a485176..b96512f 100644 --- a/src/self_attention.rs +++ b/src/self_attention.rs @@ -1,8 +1,8 @@ -use crate::adam::Adam; use crate::EMBEDDING_DIM; -use ndarray::Array2; -use rand_distr::{Normal, Distribution}; +use crate::adam::Adam; use crate::llm::Layer; +use ndarray::Array2; +use rand_distr::{Distribution, Normal}; use std::f32; pub struct SelfAttention { @@ -23,7 +23,6 @@ impl Default for SelfAttention { SelfAttention::new(EMBEDDING_DIM) } } - impl SelfAttention { /// Initializes a Transformer with random Q, K, V weights @@ -32,7 +31,7 @@ impl SelfAttention { // Xavier/He initialization: std = sqrt(2 / fan_in) let std = (2.0 / embedding_dim as f32).sqrt(); let normal = Normal::new(0.0, std).unwrap(); - + 
SelfAttention { embedding_dim, w_q: Array2::from_shape_fn((embedding_dim, embedding_dim), |_| normal.sample(&mut rng)), @@ -72,34 +71,33 @@ impl SelfAttention { fn softmax(&self, scores: &Array2) -> Array2 { let mut result = scores.clone(); - + // Apply softmax row-wise for mut row in result.rows_mut() { let max_val = row.iter().max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); // Calculate exp for each element let exp_values: Vec = row.iter().map(|&x| (x - max_val).exp()).collect(); let sum_exp: f32 = exp_values.iter().sum(); - + // Normalize by sum for (i, &exp_val) in exp_values.iter().enumerate() { row[i] = exp_val / sum_exp; } } - + result } fn softmax_backward( - softmax_output: &Array2, // shape: [seq_len, vocab_size] - grad_output: &Array2, // shape: [seq_len, vocab_size] + softmax_output: &Array2, // shape: [seq_len, vocab_size] + grad_output: &Array2, // shape: [seq_len, vocab_size] ) -> Array2 { let mut grad_input = softmax_output.clone(); // to hold the result - - for ((mut grad_row, softmax_row), grad_out_row) in - grad_input - .outer_iter_mut() - .zip(softmax_output.outer_iter()) - .zip(grad_output.outer_iter()) + + for ((mut grad_row, softmax_row), grad_out_row) in grad_input + .outer_iter_mut() + .zip(softmax_output.outer_iter()) + .zip(grad_output.outer_iter()) { // dot product: y ⊙ dL/dy let dot = softmax_row @@ -107,7 +105,7 @@ impl SelfAttention { .zip(grad_out_row.iter()) .map(|(&y_i, &dy_i)| y_i * dy_i) .sum::(); - + for ((g, &y_i), &dy_i) in grad_row .iter_mut() .zip(softmax_row.iter()) @@ -116,7 +114,7 @@ impl SelfAttention { *g = y_i * (dy_i - dot); } } - + grad_input } } @@ -140,9 +138,9 @@ impl Layer for SelfAttention { let v = input.dot(&self.w_v); let dk = self.w_q.shape()[1] as f32; let scale = dk.sqrt(); - + let mut scores = q.dot(&k.t()) / scale; - + // Apply causal masking - prevent attention to future tokens let seq_len = scores.shape()[0]; for i in 0..seq_len { @@ -150,40 +148,38 @@ impl Layer for SelfAttention { scores[[i, j]] = f32::NEG_INFINITY; } } - + let attn_weights = self.softmax(&scores); // also cached - + // Step 1: grads = ∂L/∂attn_output let grad_attn_weights = grads.dot(&v.t()); let grad_v = attn_weights.t().dot(grads); - + // Step 2: softmax backward let grad_scores = SelfAttention::softmax_backward(&attn_weights, &grad_attn_weights); // [seq_len, seq_len] - + // Step 3: ∂L/∂Q and ∂L/∂K let grad_q = grad_scores.dot(&k); let grad_k = grad_scores.t().dot(&q); - + // Step 4: ∂L/∂W_q/W_k/W_v let grad_w_q = input.t().dot(&grad_q); let grad_w_k = input.t().dot(&grad_k); let grad_w_v = input.t().dot(&grad_v); - + // Step 5: ∂L/∂input (gradient through attention computation) let grad_input_attention = - grad_q.dot(&self.w_q.t()) + - grad_k.dot(&self.w_k.t()) + - grad_v.dot(&self.w_v.t()); - - // Step 6: Add gradient from residual connection + grad_q.dot(&self.w_q.t()) + grad_k.dot(&self.w_k.t()) + grad_v.dot(&self.w_v.t()); + + // Step 6: Add gradient from residual connection // Forward: residual = attention + input, so gradient flows directly through let grad_input = grad_input_attention + grads; - + // Step 7: update weights self.optimizer_w_q.step(&mut self.w_q, &grad_w_q, lr); self.optimizer_w_k.step(&mut self.w_k, &grad_w_k, lr); self.optimizer_w_v.step(&mut self.w_v, &grad_w_v, lr); - - grad_input + + grad_input } -} \ No newline at end of file +} diff --git a/src/transformer.rs b/src/transformer.rs index aa1c613..0beef93 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -1,7 +1,7 @@ -use crate::self_attention::SelfAttention; use 
crate::feed_forward::FeedForward; use crate::layer_norm::LayerNorm; use crate::llm::Layer; +use crate::self_attention::SelfAttention; use ndarray::Array2; pub struct TransformerBlock { attention: SelfAttention, @@ -27,29 +27,27 @@ impl Layer for TransformerBlock { } fn forward(&mut self, input: &Array2) -> Array2 { - // Standard Transformer architecture: attention + norm -> feedforward + norm + // Standard Transformer architecture: attention + norm -> feedforward + norm let attention_out = self.attention.forward(input); // includes residual let norm1_out = self.norm1.normalize(&attention_out); - + let feed_forward_out = self.feed_forward.forward(&norm1_out); // includes residual - let norm2_out = self.norm2.normalize(&feed_forward_out); - - norm2_out + + self.norm2.normalize(&feed_forward_out) } - + fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { // Backward through second LayerNorm let grad_norm2 = self.norm2.backward(grads, lr); - + // Backward through feed-forward (includes residual connection) let grad_ffn = self.feed_forward.backward(&grad_norm2, lr); - - // Backward through first LayerNorm + + // Backward through first LayerNorm let grad_norm1 = self.norm1.backward(&grad_ffn, lr); - + // Backward through attention (includes residual connection) - let grad_attn = self.attention.backward(&grad_norm1, lr); - grad_attn - } + self.attention.backward(&grad_norm1, lr) + } } diff --git a/src/vocab.rs b/src/vocab.rs index 7cb7693..50b1aeb 100644 --- a/src/vocab.rs +++ b/src/vocab.rs @@ -22,7 +22,11 @@ impl Vocab { decode.insert(i, word.to_string()); } - Vocab { encode, decode, words: words.iter().map(|w| w.to_string()).collect() } + Vocab { + encode, + decode, + words: words.iter().map(|w| w.to_string()).collect(), + } } /// Convert a word to its token index @@ -39,4 +43,4 @@ impl Vocab { pub fn default_words() -> Vec<&'static str> { vec!["hello", "world", "this", "is", "rust", ""] } -} \ No newline at end of file +} diff --git a/tests/adam_test.rs b/tests/adam_test.rs index 1045625..6aa509c 100644 --- a/tests/adam_test.rs +++ b/tests/adam_test.rs @@ -1,11 +1,11 @@ -use ndarray::Array2; use llm::adam::Adam; +use ndarray::Array2; #[test] fn test_adam_initialization() { let shape = [2, 3]; let adam = Adam::new((2, 3)); - + // Check if momentum and velocity matrices are initialized to zeros assert_eq!(adam.m.shape(), shape); assert_eq!(adam.v.shape(), shape); @@ -20,16 +20,16 @@ fn test_adam_step() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::ones(shape); - + // Store initial parameters let initial_params = params.clone(); - + // Perform optimization step adam.step(&mut params, &grads, lr); - + // Parameters should have changed assert_ne!(params, initial_params); - + // Parameters should have decreased (since gradients are positive) assert!(params.iter().all(|&x| x < 1.0)); } @@ -41,15 +41,15 @@ fn test_adam_multiple_steps() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::ones(shape); - + // Store initial parameters let initial_params = params.clone(); - + // Perform multiple optimization steps for _ in 0..10 { adam.step(&mut params, &grads, lr); } - + // Parameters should have changed more significantly assert!(params.iter().all(|&x| x < initial_params[[0, 0]])); } @@ -61,13 +61,13 @@ fn test_adam_with_zero_gradients() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::zeros(shape); - + // Store initial parameters let initial_params = params.clone(); 
- + // Perform optimization step with zero gradients adam.step(&mut params, &grads, lr); - + // Parameters should not change with zero gradients assert_eq!(params, initial_params); } @@ -79,10 +79,10 @@ fn test_adam_with_negative_gradients() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::from_shape_fn(shape, |_| -1.0); - + // Perform optimization step adam.step(&mut params, &grads, lr); - + // Parameters should have increased (since gradients are negative) assert!(params.iter().all(|&x| x > 1.0)); -} \ No newline at end of file +} diff --git a/tests/embeddings_test.rs b/tests/embeddings_test.rs index 931d797..18d7462 100644 --- a/tests/embeddings_test.rs +++ b/tests/embeddings_test.rs @@ -1,7 +1,7 @@ -use llm::{Embeddings, Vocab, Layer, EMBEDDING_DIM, MAX_SEQ_LEN}; +use llm::{EMBEDDING_DIM, Embeddings, Layer, MAX_SEQ_LEN, Vocab}; #[test] -fn test_embeddings_creation() { +fn test_embeddings_creation() { // Create with custom vocab let words = vec!["hello", "world", "test", ""]; let _vocab = Vocab::new(words); // Fix unused variable warning @@ -13,18 +13,18 @@ fn test_embed_tokens() { let words = vec!["hello", "world", "test", ""]; let vocab = Vocab::new(words); let embeddings = Embeddings::new(vocab.clone()); - + // Test embedding a single token let token_ids = vec![0]; // "hello" let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [1, EMBEDDING_DIM]); - + // Test embedding multiple tokens let token_ids = vec![0, 1, 2]; // "hello world test" let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [3, EMBEDDING_DIM]); } @@ -35,21 +35,21 @@ fn test_positional_embeddings() { let words = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]; let vocab = Vocab::new(words); let embeddings = Embeddings::new(vocab); - + // Test with different sequence lengths for seq_len in 1..5 { let token_ids = vec![0; seq_len]; // Repeat token 0 seq_len times let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [seq_len, EMBEDDING_DIM]); - + // Verify that embeddings for the same token at different positions are different // (due to positional embeddings being added) if seq_len > 1 { let first_pos = embedded.row(0).to_owned(); let second_pos = embedded.row(1).to_owned(); - + // They should be different due to positional encoding assert_ne!(first_pos, second_pos); } @@ -61,14 +61,14 @@ fn test_max_sequence_length() { // Create vocab and embeddings let vocab = Vocab::default(); let embeddings = Embeddings::new(vocab); - + // Create a sequence at the maximum length let token_ids = vec![0; MAX_SEQ_LEN]; let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [MAX_SEQ_LEN, EMBEDDING_DIM]); -} +} #[test] fn test_embedding_backwards() { @@ -83,7 +83,7 @@ fn test_embedding_backwards() { use ndarray::Array2; let input = Array2::from_shape_vec((1, 3), vec![0.0, 1.0, 2.0]).unwrap(); let _output = embeddings.forward(&input); - + // Create some dummy gradients and run backward pass let grads = Array2::from_shape_vec((3, EMBEDDING_DIM), vec![0.1; 3 * EMBEDDING_DIM]).unwrap(); let _grad_input = embeddings.backward(&grads, 0.01); @@ -92,5 +92,8 @@ fn test_embedding_backwards() { let post_train_position_embeddings = embeddings.positional_embeddings.clone(); assert_ne!(pre_train_token_embeddings, post_train_token_embeddings); - assert_ne!(pre_train_position_embeddings, 
post_train_position_embeddings); -} \ No newline at end of file + assert_ne!( + pre_train_position_embeddings, + post_train_position_embeddings + ); +} diff --git a/tests/feed_forward_test.rs b/tests/feed_forward_test.rs index 6530642..db5f675 100644 --- a/tests/feed_forward_test.rs +++ b/tests/feed_forward_test.rs @@ -1,18 +1,18 @@ -use llm::{Layer, EMBEDDING_DIM, HIDDEN_DIM}; -use ndarray::Array2; use llm::feed_forward::FeedForward; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use ndarray::Array2; #[test] fn test_feed_forward_forward() { // Create feed-forward module let mut feed_forward = FeedForward::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = feed_forward.forward(&input); - + // Check output shape - should be same as input assert_eq!(output.shape(), input.shape()); } @@ -21,28 +21,28 @@ fn test_feed_forward_forward() { fn test_feed_forward_with_different_sequence_lengths() { // Create feed-forward module let mut feed_forward = FeedForward::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Test with different sequence lengths for seq_len in 1..5 { // Create input tensor let input = Array2::ones((seq_len, EMBEDDING_DIM)); - + // Test forward pass let output = feed_forward.forward(&input); - + // Check output shape assert_eq!(output.shape(), [seq_len, EMBEDDING_DIM]); } -} +} #[test] fn test_feed_forward_and_backward() { // Create feed-forward module let mut feed_forward = FeedForward::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = feed_forward.forward(&input); @@ -53,4 +53,4 @@ fn test_feed_forward_and_backward() { // Make sure backward pass modifies the input assert_ne!(output, grad_input); -} \ No newline at end of file +} diff --git a/tests/llm_test.rs b/tests/llm_test.rs index 3530099..4abb749 100644 --- a/tests/llm_test.rs +++ b/tests/llm_test.rs @@ -1,7 +1,7 @@ -use llm::{LLM, Vocab, Layer}; +use llm::EMBEDDING_DIM; use llm::Embeddings; use llm::output_projection::OutputProjection; -use llm::EMBEDDING_DIM; +use llm::{LLM, Layer, Vocab}; use ndarray::Array2; struct TestOutputProjectionLayer { @@ -43,7 +43,7 @@ impl Layer for TestOutputProjectionLayer { let grad_input = input.dot(grads); self.cached_grads = Some(grad_input.clone()); - return grad_input + grad_input } } @@ -64,14 +64,15 @@ impl TestOutputProjectionLayer { fn test_llm_tokenize() { let vocab = Vocab::default(); let vocab_size = vocab.encode.len(); - let llm = LLM::new(vocab, vec![ - Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size)) - ]); - + let llm = LLM::new( + vocab, + vec![Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size))], + ); + // Test tokenization let tokens = llm.tokenize("hello world"); assert!(!tokens.is_empty()); - + // Test that tokens can be decoded back for token in tokens { assert!(llm.vocab.decode(token).is_some()); @@ -82,10 +83,11 @@ fn test_llm_tokenize() { fn test_llm_predict() { let vocab = Vocab::default(); let vocab_size = vocab.encode.len(); - let mut llm = LLM::new(vocab.clone(), vec![ - Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size)) - ]); - + let mut llm = LLM::new( + vocab.clone(), + vec![Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size))], + ); + // Test prediction let input_text = "hello world this is rust"; let input_tokens = llm.tokenize(input_text); @@ -93,25 
+95,24 @@ fn test_llm_predict() { assert!(!result.is_empty()); // Build expected output - let mut expected_tokens = vec![0; input_tokens.len()].iter().map(|x| vocab.decode[x].clone()).collect::>(); + let mut expected_tokens = vec![0; input_tokens.len()] + .iter() + .map(|x| vocab.decode[x].clone()) + .collect::>(); expected_tokens.push("".to_string()); let expected_output = expected_tokens.join(" "); - assert_eq!(result, expected_output); -} + assert_eq!(result, expected_output); +} #[test] fn test_llm_train() { let vocab = Vocab::default(); let vocab_size = vocab.encode.len(); let layer = Box::new(TestOutputProjectionLayer::new(5, 1, vocab_size)); - let mut llm = LLM::new(vocab.clone(), vec![ - layer - ]); + let mut llm = LLM::new(vocab.clone(), vec![layer]); - let training_data = vec![ - "hello world this is rust.", - ]; + let training_data = vec!["hello world this is rust."]; llm.train(training_data, 10, 0.01); } @@ -124,13 +125,8 @@ fn test_llm_integration() { let embeddings = Box::new(Embeddings::new(vocab.clone())); let output_projection = Box::new(OutputProjection::new(EMBEDDING_DIM, vocab_size)); - let mut llm = LLM::new(vocab.clone(), vec![ - embeddings, - output_projection - ]); + let mut llm = LLM::new(vocab.clone(), vec![embeddings, output_projection]); let input_text = "hello world this is rust"; - llm.train(vec![ - input_text - ], 10, 0.01); -} \ No newline at end of file + llm.train(vec![input_text], 10, 0.01); +} diff --git a/tests/output_projection_test.rs b/tests/output_projection_test.rs index 63997b6..a0f14c1 100644 --- a/tests/output_projection_test.rs +++ b/tests/output_projection_test.rs @@ -1,18 +1,18 @@ -use llm::{Layer, EMBEDDING_DIM}; -use ndarray::Array2; use llm::output_projection::OutputProjection; +use llm::{EMBEDDING_DIM, Layer}; +use ndarray::Array2; #[test] fn test_output_projection_creation() { let vocab_size = 10; let output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Check weight matrix dimensions assert_eq!(output_proj.w_out.shape(), [EMBEDDING_DIM, vocab_size]); - + // Check bias vector dimensions assert_eq!(output_proj.b_out.shape(), [1, vocab_size]); - + // Check optimizer dimensions assert_eq!(output_proj.optimizer.m.shape(), [EMBEDDING_DIM, vocab_size]); assert_eq!(output_proj.optimizer.v.shape(), [EMBEDDING_DIM, vocab_size]); @@ -22,13 +22,13 @@ fn test_output_projection_creation() { fn test_output_projection_forward() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = output_proj.forward(&input); - + // Check output shape - should be [seq_len, vocab_size] assert_eq!(output.shape(), [3, vocab_size]); } @@ -37,15 +37,15 @@ fn test_output_projection_forward() { fn test_output_projection_with_different_sequence_lengths() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Test with different sequence lengths for seq_len in 1..5 { // Create input tensor let input = Array2::ones((seq_len, EMBEDDING_DIM)); - + // Test forward pass let output = output_proj.forward(&input); - + // Check output shape assert_eq!(output.shape(), [seq_len, vocab_size]); } @@ -55,30 +55,30 @@ fn test_output_projection_with_different_sequence_lengths() { fn test_output_projection_backward() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Create 
input tensor let input = Array2::ones((3, EMBEDDING_DIM)); - + // Forward pass first (required to cache input) let _output = output_proj.forward(&input); - + // Create gradient tensor let grads = Array2::ones((3, vocab_size)); - + // Test backward pass let grad_input = output_proj.backward(&grads, 0.01); - + // Check gradient input shape assert_eq!(grad_input.shape(), [3, EMBEDDING_DIM]); - + // Verify that parameters were updated let w_out_before = output_proj.w_out.clone(); let b_out_before = output_proj.b_out.clone(); - + // Run another forward and backward pass let _output = output_proj.forward(&input); let _grad_input = output_proj.backward(&grads, 0.01); - + // Check that parameters changed assert_ne!(output_proj.w_out, w_out_before); assert_ne!(output_proj.b_out, b_out_before); @@ -88,24 +88,24 @@ fn test_output_projection_backward() { fn test_output_projection_training() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Create input tensor let input = Array2::ones((3, EMBEDDING_DIM)); - + // Run multiple training steps for _ in 0..5 { // Forward pass let _output = output_proj.forward(&input); - + // Create gradient tensor (simulating cross-entropy loss gradients) let mut grads = Array2::zeros((3, vocab_size)); grads[[0, 0]] = 1.0; // Set gradient for first token - + // Backward pass let _grad_input = output_proj.backward(&grads, 0.01); } - + // Verify that parameters were updated assert_ne!(output_proj.w_out.sum(), 0.0); assert_ne!(output_proj.b_out.sum(), 0.0); -} \ No newline at end of file +} diff --git a/tests/self_attention_test.rs b/tests/self_attention_test.rs index cd08341..009c7e4 100644 --- a/tests/self_attention_test.rs +++ b/tests/self_attention_test.rs @@ -1,18 +1,18 @@ -use llm::{Layer, EMBEDDING_DIM}; -use ndarray::Array2; use llm::self_attention::SelfAttention; +use llm::{EMBEDDING_DIM, Layer}; +use ndarray::Array2; #[test] fn test_self_attention_forward() { // Create self-attention module let mut self_attention = SelfAttention::new(EMBEDDING_DIM); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = self_attention.forward(&input); - + // Check output shape - should be same as input assert_eq!(output.shape(), input.shape()); } @@ -21,16 +21,16 @@ fn test_self_attention_forward() { fn test_self_attention_with_different_sequence_lengths() { // Create self-attention module let mut self_attention = SelfAttention::new(EMBEDDING_DIM); - + // Test with different sequence lengths for seq_len in 1..5 { // Create input tensor let input = Array2::ones((seq_len, EMBEDDING_DIM)); - + // Test forward pass let output = self_attention.forward(&input); - + // Check output shape assert_eq!(output.shape(), [seq_len, EMBEDDING_DIM]); } -} \ No newline at end of file +} diff --git a/tests/transformer_test.rs b/tests/transformer_test.rs index 366ca59..c198915 100644 --- a/tests/transformer_test.rs +++ b/tests/transformer_test.rs @@ -1,17 +1,17 @@ -use llm::{Layer, EMBEDDING_DIM, HIDDEN_DIM}; -use ndarray::Array2; use llm::transformer::TransformerBlock; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use ndarray::Array2; #[test] fn test_transformer_block() { let mut transformer = TransformerBlock::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Create a simple input tensor let input = Array2::ones((1, EMBEDDING_DIM)); - + // Test forward pass let output = transformer.forward(&input); - + // Check output shape assert_eq!(output.shape(), 
[1, EMBEDDING_DIM]); -} \ No newline at end of file +} diff --git a/tests/vocab_test.rs b/tests/vocab_test.rs index b8f1adf..131888f 100644 --- a/tests/vocab_test.rs +++ b/tests/vocab_test.rs @@ -4,12 +4,12 @@ use llm::Vocab; fn test_vocab_encode_decode() { let words = vec!["hello", "world", "this", "is", "rust", ""]; let vocab = Vocab::new(words); - + // Test encoding assert_eq!(vocab.encode("hello"), Some(0)); assert_eq!(vocab.encode("world"), Some(1)); assert_eq!(vocab.encode("unknown"), None); - + // Test decoding assert_eq!(vocab.decode(0).map(|s| s.as_str()), Some("hello")); assert_eq!(vocab.decode(1).map(|s| s.as_str()), Some("world")); @@ -19,9 +19,9 @@ fn test_vocab_encode_decode() { #[test] fn test_vocab_default() { let vocab = Vocab::default(); - + // Test that default vocab contains expected words assert!(vocab.encode("hello").is_some()); assert!(vocab.encode("world").is_some()); assert!(vocab.encode("").is_some()); -} \ No newline at end of file +} From 11cf58020ec02dd239c9475650e5dc514f5b160a Mon Sep 17 00:00:00 2001 From: ben1009 Date: Mon, 22 Sep 2025 16:48:41 +0800 Subject: [PATCH 2/9] chore: housekeeping --- src/transformer.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformer.rs b/src/transformer.rs index 04a9d4b..0beef93 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -32,9 +32,8 @@ impl Layer for TransformerBlock { let norm1_out = self.norm1.normalize(&attention_out); let feed_forward_out = self.feed_forward.forward(&norm1_out); // includes residual - let norm2_out = self.norm2.normalize(&feed_forward_out); - norm2_out + self.norm2.normalize(&feed_forward_out) } fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { @@ -49,6 +48,6 @@ impl Layer for TransformerBlock { // Backward through attention (includes residual connection) - grad_attn + self.attention.backward(&grad_norm1, lr) } } From 43b70a5d699bce3979729cb6f2b910c8581d5ec5 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 23 Sep 2025 10:17:03 +0800 Subject: [PATCH 3/9] add more gha --- .github/codecov.yml | 21 +++++++++++ .github/workflows/check.yml | 73 +++++++++++++++++++++++++++++++++++++ .github/workflows/rust.yml | 23 ------------ .github/workflows/test.yml | 67 ++++++++++++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 23 deletions(-) create mode 100644 .github/codecov.yml create mode 100644 .github/workflows/check.yml delete mode 100644 .github/workflows/rust.yml create mode 100644 .github/workflows/test.yml diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..f2038b4 --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,21 @@ +# ref: https://docs.codecov.com/docs/codecovyml-reference +coverage: + # Hold ourselves to a high bar + range: 77..100 + round: down + precision: 1 + status: + # ref: https://docs.codecov.com/docs/commit-status + project: + default: + # Avoid false negatives + threshold: 1% + +# Test files aren't important for coverage +ignore: + - "tests" + +# Make comments less noisy +comment: + layout: "files" + require_changes: yes \ No newline at end of file diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml new file mode 100644 index 0000000..b138260 --- /dev/null +++ b/.github/workflows/check.yml @@ -0,0 +1,73 @@ +permissions: + contents: read +on: + push: + branches: [main, master] + pull_request: + merge_group: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +env: + RUST_TOOLCHAIN: stable + +name: Check 
+jobs: + fmt: + runs-on: ubuntu-latest + strategy: + fail-fast: false + name: fmt + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. + contents: write + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + components: rustfmt + - run: cargo fmt --check + + clippy: + runs-on: ubuntu-latest + name: clippy + permissions: + contents: read + checks: write + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install ${{ env.RUST_TOOLCHAIN }} + uses: dtolnay/rust-toolchain@master # master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + components: clippy + - name: Rust Cache + uses: Swatinem/rust-cache@v2 + - run: cargo clippy --workspace --all-features --all-targets -- -D warnings + + typos: + runs-on: ubuntu-latest + name: typos + permissions: + contents: read + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Check spelling + uses: crate-ci/typos@master + + \ No newline at end of file diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml deleted file mode 100644 index ead13e2..0000000 --- a/.github/workflows/rust.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Rust - -on: - push: - branches: ["main"] - pull_request: - branches: ["main"] - -env: - CARGO_TERM_COLOR: always - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Format Check - run: cargo fmt -- --check - - name: Build - run: cargo build --verbose - - name: Run tests - run: cargo test --verbose diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..fc706de --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,67 @@ +permissions: + contents: read +on: + push: + branches: [main, master] + pull_request: + merge_group: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +env: + RUST_TOOLCHAIN: stable + +name: Test +jobs: + required: + runs-on: ubuntu-latest + name: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install ${{ env.RUST_TOOLCHAIN }} + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + - name: cargo generate-lockfile + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + # https://twitter.com/jonhoo/status/1571290371124260865 + - name: Rust Cache + uses: Swatinem/rust-cache@v2 + - name: Install nextest + uses: taiki-e/install-action@nextest + - name: cargo nextest --locked + run: cargo nextest run --locked --workspace --all-features --all-targets + + coverage: + runs-on: ubuntu-latest + name: coverage + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + components: llvm-tools-preview + - name: cargo install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + - name: cargo generate-lockfile + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + - name: Rust Cache + uses: Swatinem/rust-cache@v2 + - name: Install nextest + uses: taiki-e/install-action@nextest + - name: cargo llvm-cov + run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info + - name: Upload to 
codecov.io + uses: codecov/codecov-action@v5 + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} # required \ No newline at end of file From 9f86c82f114136e1c1c1d87e1958c85d163c25f3 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 23 Sep 2025 10:33:50 +0800 Subject: [PATCH 4/9] adjust cov to 55% --- .github/codecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/codecov.yml b/.github/codecov.yml index f2038b4..958c206 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -1,7 +1,7 @@ # ref: https://docs.codecov.com/docs/codecovyml-reference coverage: # Hold ourselves to a high bar - range: 77..100 + range: 55..100 round: down precision: 1 status: From 32f300b63a8f034731af984026e19d6e97324b60 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 23 Sep 2025 10:46:25 +0800 Subject: [PATCH 5/9] add nightly fmt --- .github/workflows/check.yml | 2 +- rustfmt.toml | 14 ++++++++++++++ src/embeddings.rs | 3 ++- src/feed_forward.rs | 6 +++--- src/layer_norm.rs | 7 +++---- src/lib.rs | 3 +-- src/llm.rs | 19 +++++++++---------- src/main.rs | 10 +++++----- src/self_attention.rs | 8 ++++---- src/transformer.rs | 8 ++++---- tests/feed_forward_test.rs | 3 +-- tests/llm_test.rs | 5 +---- tests/output_projection_test.rs | 3 +-- tests/self_attention_test.rs | 3 +-- tests/transformer_test.rs | 3 +-- 15 files changed, 51 insertions(+), 46 deletions(-) create mode 100644 rustfmt.toml diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index b138260..aba9556 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -31,7 +31,7 @@ jobs: - name: Install rust uses: dtolnay/rust-toolchain@master with: - toolchain: ${{ env.RUST_TOOLCHAIN }} + toolchain: nightly #${{ env.RUST_TOOLCHAIN }} components: rustfmt - run: cargo fmt --check diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..d85c165 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,14 @@ +edition = "2024" +style_edition = "2024" +comment_width = 120 +format_code_in_doc_comments = true +format_macro_bodies = true +format_macro_matchers = true +normalize_comments = true +normalize_doc_attributes = true +imports_granularity = "Crate" +group_imports = "StdExternalCrate" +reorder_impl_items = true +reorder_imports = true +tab_spaces = 4 +wrap_comments = true diff --git a/src/embeddings.rs b/src/embeddings.rs index 65f5f7e..f0aff98 100644 --- a/src/embeddings.rs +++ b/src/embeddings.rs @@ -1,7 +1,8 @@ -use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab}; use ndarray::{Array2, s}; use rand_distr::{Distribution, Normal}; +use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab}; + pub struct Embeddings { pub token_embeddings: Array2, pub positional_embeddings: Array2, diff --git a/src/feed_forward.rs b/src/feed_forward.rs index dd74c94..ae20d10 100644 --- a/src/feed_forward.rs +++ b/src/feed_forward.rs @@ -1,8 +1,8 @@ -use crate::{adam::Adam, llm::Layer}; -use ndarray::Array2; -use ndarray::Axis; +use ndarray::{Array2, Axis}; use rand_distr::{Distribution, Normal}; +use crate::{adam::Adam, llm::Layer}; + pub struct FeedForward { w1: Array2, b1: Array2, diff --git a/src/layer_norm.rs b/src/layer_norm.rs index f326641..b73a7d7 100644 --- a/src/layer_norm.rs +++ b/src/layer_norm.rs @@ -1,7 +1,6 @@ -use crate::adam::Adam; -use crate::llm::Layer; -use ndarray::Array2; -use ndarray::Axis; +use ndarray::{Array2, Axis}; + +use crate::{adam::Adam, llm::Layer}; pub struct LayerNorm { epsilon: f32, // Small constant for 
stability diff --git a/src/lib.rs b/src/lib.rs index a62f008..80769b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,8 +9,7 @@ pub mod transformer; pub mod vocab; // Re-export key structs for easier access pub use embeddings::Embeddings; -pub use llm::LLM; -pub use llm::Layer; +pub use llm::{LLM, Layer}; pub use vocab::Vocab; // Constants diff --git a/src/llm.rs b/src/llm.rs index 7ddfaae..d49fb25 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -1,13 +1,11 @@ -use crate::EMBEDDING_DIM; -use crate::Embeddings; -use crate::HIDDEN_DIM; -use crate::MAX_SEQ_LEN; -use crate::Vocab; -use crate::output_projection::OutputProjection; -use crate::transformer::TransformerBlock; -use ndarray::Array1; -use ndarray::{Array2, Axis}; use std::cmp::Ordering; + +use ndarray::{Array1, Array2, Axis}; + +use crate::{ + EMBEDDING_DIM, Embeddings, HIDDEN_DIM, MAX_SEQ_LEN, Vocab, output_projection::OutputProjection, + transformer::TransformerBlock, +}; pub trait Layer { fn layer_type(&self) -> &str; @@ -117,7 +115,8 @@ impl LLM { .to_owned() .insert_axis(Axis(0)); - // Softmax - convert activiations of each token to a probability distribution over the vocabulary + // Softmax - convert activiations of each token to a probability distribution over the + // vocabulary let probs = Self::softmax(&last_logit); // 1 x vocab_size // Greedy Decode - Choose the highest probability token for each position diff --git a/src/main.rs b/src/main.rs index 6e64144..cf5297a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,12 @@ use std::io::Write; -use crate::embeddings::Embeddings; -use crate::llm::LLM; -use crate::output_projection::OutputProjection; -use crate::transformer::TransformerBlock; -use crate::vocab::Vocab; use ::llm::{EMBEDDING_DIM, HIDDEN_DIM, MAX_SEQ_LEN}; +use crate::{ + embeddings::Embeddings, llm::LLM, output_projection::OutputProjection, + transformer::TransformerBlock, vocab::Vocab, +}; + mod adam; mod embeddings; mod feed_forward; diff --git a/src/self_attention.rs b/src/self_attention.rs index b96512f..4310cf7 100644 --- a/src/self_attention.rs +++ b/src/self_attention.rs @@ -1,9 +1,9 @@ -use crate::EMBEDDING_DIM; -use crate::adam::Adam; -use crate::llm::Layer; +use std::f32; + use ndarray::Array2; use rand_distr::{Distribution, Normal}; -use std::f32; + +use crate::{EMBEDDING_DIM, adam::Adam, llm::Layer}; pub struct SelfAttention { pub embedding_dim: usize, diff --git a/src/transformer.rs b/src/transformer.rs index 0beef93..e91b59f 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -1,8 +1,8 @@ -use crate::feed_forward::FeedForward; -use crate::layer_norm::LayerNorm; -use crate::llm::Layer; -use crate::self_attention::SelfAttention; use ndarray::Array2; + +use crate::{ + feed_forward::FeedForward, layer_norm::LayerNorm, llm::Layer, self_attention::SelfAttention, +}; pub struct TransformerBlock { attention: SelfAttention, feed_forward: FeedForward, diff --git a/tests/feed_forward_test.rs b/tests/feed_forward_test.rs index 922239c..c651fb6 100644 --- a/tests/feed_forward_test.rs +++ b/tests/feed_forward_test.rs @@ -1,5 +1,4 @@ -use llm::feed_forward::FeedForward; -use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer, feed_forward::FeedForward}; use ndarray::Array2; #[test] diff --git a/tests/llm_test.rs b/tests/llm_test.rs index 4abb749..5c88b78 100644 --- a/tests/llm_test.rs +++ b/tests/llm_test.rs @@ -1,7 +1,4 @@ -use llm::EMBEDDING_DIM; -use llm::Embeddings; -use llm::output_projection::OutputProjection; -use llm::{LLM, Layer, Vocab}; +use llm::{EMBEDDING_DIM, 
Embeddings, LLM, Layer, Vocab, output_projection::OutputProjection}; use ndarray::Array2; struct TestOutputProjectionLayer { diff --git a/tests/output_projection_test.rs b/tests/output_projection_test.rs index a0f14c1..5b467ad 100644 --- a/tests/output_projection_test.rs +++ b/tests/output_projection_test.rs @@ -1,5 +1,4 @@ -use llm::output_projection::OutputProjection; -use llm::{EMBEDDING_DIM, Layer}; +use llm::{EMBEDDING_DIM, Layer, output_projection::OutputProjection}; use ndarray::Array2; #[test] diff --git a/tests/self_attention_test.rs b/tests/self_attention_test.rs index 009c7e4..4e1e5ff 100644 --- a/tests/self_attention_test.rs +++ b/tests/self_attention_test.rs @@ -1,5 +1,4 @@ -use llm::self_attention::SelfAttention; -use llm::{EMBEDDING_DIM, Layer}; +use llm::{EMBEDDING_DIM, Layer, self_attention::SelfAttention}; use ndarray::Array2; #[test] diff --git a/tests/transformer_test.rs b/tests/transformer_test.rs index c198915..0fa49d1 100644 --- a/tests/transformer_test.rs +++ b/tests/transformer_test.rs @@ -1,5 +1,4 @@ -use llm::transformer::TransformerBlock; -use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer, transformer::TransformerBlock}; use ndarray::Array2; #[test] From fcc6a4a15586202aaca04ee7e303aaeb2b157600 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Fri, 26 Sep 2025 10:20:56 +0800 Subject: [PATCH 6/9] chore: fix lints --- Cargo.lock | 20 ++++++++++++++++++-- src/dataset_loader.rs | 5 +++-- src/layer_norm.rs | 4 ++-- src/vocab.rs | 9 +++++---- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 146c406..4219d18 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -343,10 +343,15 @@ version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "byteorder", - "zerocopy-derive", + "wit-bindgen", ] +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + [[package]] name = "zerocopy" version = "0.8.27" @@ -355,3 +360,14 @@ checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] + +[[package]] +name = "zerocopy-derive" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/src/dataset_loader.rs b/src/dataset_loader.rs index 0c63eb2..bb0192a 100644 --- a/src/dataset_loader.rs +++ b/src/dataset_loader.rs @@ -1,13 +1,14 @@ -use csv::ReaderBuilder; -use serde_json; use std::fs; +use csv::ReaderBuilder; + pub struct Dataset { pub pretraining_data: Vec, pub chat_training_data: Vec, } #[allow(dead_code)] +#[allow(clippy::upper_case_acronyms)] pub enum DatasetType { JSON, CSV, diff --git a/src/layer_norm.rs b/src/layer_norm.rs index c2e269a..b73a7d7 100644 --- a/src/layer_norm.rs +++ b/src/layer_norm.rs @@ -1,7 +1,7 @@ -use crate::adam::Adam; -use crate::llm::Layer; use ndarray::{Array2, Axis}; +use crate::{adam::Adam, llm::Layer}; + pub struct LayerNorm { epsilon: f32, // Small constant for stability gamma: Array2, // Learnable scaling parameter diff --git a/src/vocab.rs b/src/vocab.rs index 81b7b0e..b28e86a 100644 --- a/src/vocab.rs +++ b/src/vocab.rs @@ -1,6 +1,7 @@ -use bincode::Encode; use 
std::collections::HashMap; +use bincode::Encode; + #[derive(Clone, Encode)] pub struct Vocab { pub encode: HashMap, @@ -48,10 +49,10 @@ impl Vocab { } } -impl Into for Vocab { - fn into(self) -> String { +impl From for String { + fn from(val: Vocab) -> Self { String::from_iter( - self.words + val.words .iter() .enumerate() .map(|(i, str)| format!("({i},{str}),")), From 2ec33c705d312b2768320a6d9c7349bd6064fe95 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 30 Sep 2025 09:22:37 +0800 Subject: [PATCH 7/9] chore: fix lints --- src/llm.rs | 2 +- tests/llm_test.rs | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llm.rs b/src/llm.rs index dbaa1bc..4613f93 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -56,7 +56,7 @@ impl LLM { // Sum the parameters across all layers in the network self.network .iter() - .map(|layer: &Box| layer.parameters()) + .map(|layer| layer.parameters()) .sum::() } diff --git a/tests/llm_test.rs b/tests/llm_test.rs index c32e518..1e2fec4 100644 --- a/tests/llm_test.rs +++ b/tests/llm_test.rs @@ -1,5 +1,7 @@ -use llm::{EMBEDDING_DIM, Embeddings, LLM, Layer, Vocab, output_projection::OutputProjection}; -use llm::{LLM, Layer, Vocab}; +use llm::{ + EMBEDDING_DIM, Embeddings, HIDDEN_DIM, LLM, Layer, MAX_SEQ_LEN, Vocab, + output_projection::OutputProjection, transformer::TransformerBlock, +}; use ndarray::Array2; struct TestOutputProjectionLayer { @@ -153,7 +155,8 @@ fn test_llm_total_parameters() { let param_count = llm.total_parameters(); assert!(param_count > 0); - // Let's validate that this is equal to the expected total number of parameters. (based on our source) + // Let's validate that this is equal to the expected total number of parameters. (based on our + // source) let expected_embeddings_parameters = vocab_size * EMBEDDING_DIM + MAX_SEQ_LEN * EMBEDDING_DIM; let expected_transformer_block_parameters = (2 * EMBEDDING_DIM) + // LayerNorm (3 * EMBEDDING_DIM * EMBEDDING_DIM) + // SelfAttention From 4f407f4062b590c961ba1aba310a6eaca0db9f3c Mon Sep 17 00:00:00 2001 From: ben1009 Date: Sat, 4 Oct 2025 12:21:51 +0800 Subject: [PATCH 8/9] comment out the cov gha --- .github/codecov.yml | 39 +++++++++++++------------- .github/workflows/test.yml | 57 +++++++++++++++++++------------------- src/llm.rs | 2 +- 3 files changed, 50 insertions(+), 48 deletions(-) diff --git a/.github/codecov.yml b/.github/codecov.yml index 958c206..f8e75a2 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -1,21 +1,22 @@ -# ref: https://docs.codecov.com/docs/codecovyml-reference -coverage: - # Hold ourselves to a high bar - range: 55..100 - round: down - precision: 1 - status: - # ref: https://docs.codecov.com/docs/commit-status - project: - default: - # Avoid false negatives - threshold: 1% +# # ref: https://docs.codecov.com/docs/codecovyml-reference +# comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174 +# coverage: +# # Hold ourselves to a high bar +# range: 55..100 +# round: down +# precision: 1 +# status: +# # ref: https://docs.codecov.com/docs/commit-status +# project: +# default: +# # Avoid false negatives +# threshold: 1% -# Test files aren't important for coverage -ignore: - - "tests" +# # Test files aren't important for coverage +# ignore: +# - "tests" -# Make comments less noisy -comment: - layout: "files" - require_changes: yes \ No newline at end of file +# # Make comments less noisy +# comment: +# layout: "files" +# require_changes: yes \ No newline at end of file diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index fc706de..da07418 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,31 +37,32 @@ jobs: - name: cargo nextest --locked run: cargo nextest run --locked --workspace --all-features --all-targets - coverage: - runs-on: ubuntu-latest - name: coverage - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: Install rust - uses: dtolnay/rust-toolchain@master - with: - toolchain: ${{ env.RUST_TOOLCHAIN }} - components: llvm-tools-preview - - name: cargo install cargo-llvm-cov - uses: taiki-e/install-action@cargo-llvm-cov - - name: cargo generate-lockfile - if: hashFiles('Cargo.lock') == '' - run: cargo generate-lockfile - - name: Rust Cache - uses: Swatinem/rust-cache@v2 - - name: Install nextest - uses: taiki-e/install-action@nextest - - name: cargo llvm-cov - run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info - - name: Upload to codecov.io - uses: codecov/codecov-action@v5 - with: - fail_ci_if_error: true - token: ${{ secrets.CODECOV_TOKEN }} # required \ No newline at end of file + # comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174 + # coverage: + # runs-on: ubuntu-latest + # name: coverage + # steps: + # - uses: actions/checkout@v4 + # with: + # submodules: true + # - name: Install rust + # uses: dtolnay/rust-toolchain@master + # with: + # toolchain: ${{ env.RUST_TOOLCHAIN }} + # components: llvm-tools-preview + # - name: cargo install cargo-llvm-cov + # uses: taiki-e/install-action@cargo-llvm-cov + # - name: cargo generate-lockfile + # if: hashFiles('Cargo.lock') == '' + # run: cargo generate-lockfile + # - name: Rust Cache + # uses: Swatinem/rust-cache@v2 + # - name: Install nextest + # uses: taiki-e/install-action@nextest + # - name: cargo llvm-cov + # run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info + # - name: Upload to codecov.io + # uses: codecov/codecov-action@v5 + # with: + # fail_ci_if_error: true + # token: ${{ secrets.CODECOV_TOKEN }} # required \ No newline at end of file diff --git a/src/llm.rs b/src/llm.rs index 4613f93..d0d6688 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -125,7 +125,7 @@ impl LLM { .to_owned() .insert_axis(Axis(0)); - // Softmax - convert activiations of each token to a probability distribution over the + // Softmax - convert activations of each token to a probability distribution over the // vocabulary let probs = Self::softmax(&last_logit); // 1 x vocab_size From 08613771c25467837b4daaef290288fb8cf12b9c Mon Sep 17 00:00:00 2001 From: ben1009 Date: Sun, 5 Oct 2025 18:42:05 +0800 Subject: [PATCH 9/9] update --- src/vocab.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/vocab.rs b/src/vocab.rs index b28e86a..ced340b 100644 --- a/src/vocab.rs +++ b/src/vocab.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use bincode::Encode; @@ -47,6 +47,34 @@ impl Vocab { pub fn default_words() -> Vec<&'static str> { vec!["hello", "world", "this", "is", "rust", ""] } + + /// Process text data to extract vocabulary words and add them to the vocabulary set + pub fn process_text_for_vocab(texts: &[String], vocab_set: &mut HashSet) { + // Add end of sequence token + vocab_set.insert("".to_string()); + + // Process all training examples for vocabulary + for text in texts { + for word in 
text.split_whitespace() { + // Handle punctuation by splitting it from words + let mut current = String::new(); + for c in word.chars() { + if c.is_ascii_punctuation() { + if !current.is_empty() { + vocab_set.insert(current.clone()); + current.clear(); + } + vocab_set.insert(c.to_string()); + } else { + current.push(c); + } + } + if !current.is_empty() { + vocab_set.insert(current); + } + } + } + } } impl From for String {
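A quick usage sketch for the new Vocab::process_text_for_vocab helper, in case it helps review: the corpus strings below are hypothetical, and the assertions only restate what the loop in the patch already does (split on whitespace, with ASCII punctuation broken out into its own tokens). It assumes the helper is reached through the re-exported llm::Vocab, as in tests/vocab_test.rs.

    use std::collections::HashSet;

    use llm::Vocab;

    fn main() {
        // Hypothetical corpus; only the call pattern matters here.
        let texts = vec![
            "Hello, world!".to_string(),
            "This is Rust.".to_string(),
        ];

        let mut vocab_set = HashSet::new();
        Vocab::process_text_for_vocab(&texts, &mut vocab_set);

        // Punctuation is split off into separate tokens, so "Hello," is
        // stored as "Hello" plus ",".
        assert!(vocab_set.contains("Hello"));
        assert!(vocab_set.contains(","));
        assert!(vocab_set.contains("Rust"));
        assert!(!vocab_set.contains("Hello,"));
    }

Taking a &mut HashSet<String> instead of returning a fresh set also lets callers accumulate words from several corpora before freezing the final Vocab, which appears to be the intent of the signature.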