From 86c528ff9693a63727164389b634bd8b8c5a5780 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Mon, 22 Sep 2025 16:43:11 +0800 Subject: [PATCH 1/9] chore: housekeeping --- Cargo.lock | 29 +------ Cargo.toml | 2 +- src/adam.rs | 2 +- src/embeddings.rs | 57 ++++++++----- src/feed_forward.rs | 25 +++--- src/layer_norm.rs | 35 ++++---- src/lib.rs | 16 ++-- src/llm.rs | 140 +++++++++++++++++++------------- src/main.rs | 83 ++++++++++--------- src/output_projection.rs | 20 +++-- src/self_attention.rs | 66 +++++++-------- src/transformer.rs | 26 +++--- src/vocab.rs | 8 +- tests/adam_test.rs | 30 +++---- tests/embeddings_test.rs | 35 ++++---- tests/feed_forward_test.rs | 24 +++--- tests/llm_test.rs | 54 ++++++------ tests/output_projection_test.rs | 50 ++++++------ tests/self_attention_test.rs | 18 ++-- tests/transformer_test.rs | 12 +-- tests/vocab_test.rs | 8 +- 21 files changed, 387 insertions(+), 353 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d62639f..ed56ac6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,7 +133,7 @@ version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" dependencies = [ - "zerocopy 0.7.35", + "zerocopy", ] [[package]] @@ -156,13 +156,12 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.0" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha", "rand_core", - "zerocopy 0.8.23", ] [[package]] @@ -306,16 +305,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", - "zerocopy-derive 0.7.35", -] - -[[package]] -name = "zerocopy" -version = "0.8.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd97444d05a4328b90e75e503a34bad781f14e28a823ad3557f0750df1ebcbc6" -dependencies = [ - "zerocopy-derive 0.8.23", + "zerocopy-derive", ] [[package]] @@ -328,14 +318,3 @@ dependencies = [ "quote", "syn", ] - -[[package]] -name = "zerocopy-derive" -version = "0.8.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6352c01d0edd5db859a63e2605f4ea3183ddbd15e2c4a9e7d32184df75e4f154" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/Cargo.toml b/Cargo.toml index 6467962..08efc31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ edition = "2024" [dependencies] ndarray = "0.16.1" -rand = "0.9.0" +rand = "0.9.2" rand_distr = "0.5.0" [dev-dependencies] diff --git a/src/adam.rs b/src/adam.rs index 744f2dc..c7dd922 100644 --- a/src/adam.rs +++ b/src/adam.rs @@ -33,4 +33,4 @@ impl Adam { *params -= &(update * lr); } -} \ No newline at end of file +} diff --git a/src/embeddings.rs b/src/embeddings.rs index fd15aa4..65f5f7e 100644 --- a/src/embeddings.rs +++ b/src/embeddings.rs @@ -1,6 +1,6 @@ -use ndarray::{s, Array2}; -use rand_distr::{Normal, Distribution}; -use crate::{vocab::Vocab, llm::Layer, EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam}; +use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab}; +use ndarray::{Array2, s}; +use rand_distr::{Distribution, Normal}; pub struct Embeddings { pub token_embeddings: Array2, @@ -10,20 +10,19 @@ pub struct Embeddings { pub positional_optimizer: Adam, } -impl Default for Embeddings { +impl Default 
for Embeddings { fn default() -> Self { - Self { + Self { token_embeddings: Self::init_embeddings(Vocab::default_words().len(), EMBEDDING_DIM), positional_embeddings: Self::init_positional_embeddings(MAX_SEQ_LEN, EMBEDDING_DIM), cached_input: None, token_optimizer: Adam::new((Vocab::default_words().len(), EMBEDDING_DIM)), - positional_optimizer: Adam::new((MAX_SEQ_LEN, EMBEDDING_DIM)) + positional_optimizer: Adam::new((MAX_SEQ_LEN, EMBEDDING_DIM)), } } } impl Embeddings { - pub fn new(vocab: Vocab) -> Self { Self { token_embeddings: Self::init_embeddings(vocab.words.len(), EMBEDDING_DIM), @@ -50,26 +49,35 @@ impl Embeddings { let mut token_embeds = Array2::::zeros((token_ids.len(), embeddings.ncols())); for (i, &token_id) in token_ids.iter().enumerate() { if token_id >= embeddings.nrows() { - panic!("Token ID {} out of bounds for vocab size {}", token_id, embeddings.nrows()); + panic!( + "Token ID {} out of bounds for vocab size {}", + token_id, + embeddings.nrows() + ); } token_embeds.row_mut(i).assign(&embeddings.row(token_id)); } token_embeds } - fn get_positional_embeddings(positional_encodings: &Array2, seq_len: usize) -> Array2 { + fn get_positional_embeddings( + positional_encodings: &Array2, + seq_len: usize, + ) -> Array2 { if seq_len > positional_encodings.nrows() { - panic!("Sequence length {} exceeds maximum {}", seq_len, positional_encodings.nrows()); + panic!( + "Sequence length {} exceeds maximum {}", + seq_len, + positional_encodings.nrows() + ); } positional_encodings.slice(s![0..seq_len, ..]).to_owned() } - pub fn embed_tokens( - &self, - token_ids: &[usize] - ) -> Array2 { + pub fn embed_tokens(&self, token_ids: &[usize]) -> Array2 { let token_embeds = Self::get_token_embeddings(&self.token_embeddings, token_ids); - let position_embeds = Self::get_positional_embeddings(&self.positional_embeddings, token_ids.len()); + let position_embeds = + Self::get_positional_embeddings(&self.positional_embeddings, token_ids.len()); token_embeds + position_embeds // Element-wise sum } } @@ -79,7 +87,8 @@ impl Layer for Embeddings { "Embeddings" } - fn forward(&mut self, input: &Array2) -> Array2 { // input shape is [1, sequence_length] + fn forward(&mut self, input: &Array2) -> Array2 { + // input shape is [1, sequence_length] self.cached_input = Some(input.clone()); let token_ids: Vec = input.iter().map(|&x| x as usize).collect(); self.embed_tokens(&token_ids) // shape is [sequence_length, embedding_dim] @@ -96,16 +105,20 @@ impl Layer for Embeddings { for (i, &token_id) in token_ids.iter().enumerate() { if token_id >= self.token_embeddings.nrows() { - panic!("Token ID {} out of bounds for vocab size {}", token_id, self.token_embeddings.nrows()); + panic!( + "Token ID {} out of bounds for vocab size {}", + token_id, + self.token_embeddings.nrows() + ); } let grad_row = grads.row(i); - + // Accumulate token embedding gradients efficiently (no temp variable) { let mut token_row = token_grads.row_mut(token_id); token_row += &grad_row; } - + // Accumulate positional embedding gradients efficiently (no temp variable) { let mut pos_row = positional_grads.row_mut(i); @@ -113,8 +126,10 @@ impl Layer for Embeddings { } } - self.token_optimizer.step(&mut self.token_embeddings, &token_grads, lr); - self.positional_optimizer.step(&mut self.positional_embeddings, &positional_grads, lr); + self.token_optimizer + .step(&mut self.token_embeddings, &token_grads, lr); + self.positional_optimizer + .step(&mut self.positional_embeddings, &positional_grads, lr); // Return gradient to propagate further 
back grads.to_owned() diff --git a/src/feed_forward.rs b/src/feed_forward.rs index c141763..dd74c94 100644 --- a/src/feed_forward.rs +++ b/src/feed_forward.rs @@ -1,7 +1,7 @@ +use crate::{adam::Adam, llm::Layer}; use ndarray::Array2; use ndarray::Axis; -use rand_distr::{Normal, Distribution}; -use crate::{adam::Adam, llm::Layer}; +use rand_distr::{Distribution, Normal}; pub struct FeedForward { w1: Array2, @@ -24,15 +24,15 @@ impl FeedForward { /// Initialize a feedforward layer with random weights pub fn new(embedding_dim: usize, hidden_dim: usize) -> Self { let mut rng = rand::rng(); - + // Xavier/He initialization for w1: std = sqrt(2 / fan_in) let std_w1 = (2.0 / embedding_dim as f32).sqrt(); let normal_w1 = Normal::new(0.0, std_w1).unwrap(); - - // Xavier/He initialization for w2: std = sqrt(2 / fan_in) + + // Xavier/He initialization for w2: std = sqrt(2 / fan_in) let std_w2 = (2.0 / hidden_dim as f32).sqrt(); let normal_w2 = Normal::new(0.0, std_w2).unwrap(); - + FeedForward { w1: Array2::from_shape_fn((embedding_dim, hidden_dim), |_| normal_w1.sample(&mut rng)), b1: Array2::zeros((1, hidden_dim)), // Bias initialized to 0 @@ -73,12 +73,14 @@ impl Layer for FeedForward { // Gradient w.r.t. W1 and b1 let grad_w1 = input.t().dot(&grad_hidden_pre_activation); - let grad_b1 = grad_hidden_pre_activation.sum_axis(Axis(0)).insert_axis(Axis(0)); // Shape: [1, hidden_dim] + let grad_b1 = grad_hidden_pre_activation + .sum_axis(Axis(0)) + .insert_axis(Axis(0)); // Shape: [1, hidden_dim] // Gradient w.r.t. input (through feed-forward computation) let grad_input_feedforward = grad_hidden_pre_activation.dot(&self.w1.t()); - - // Add gradient from residual connection + + // Add gradient from residual connection // Forward: output = W2(ReLU(W1*input + b1)) + b2 + input // Backward: grad_input = grad_feedforward + grad_residual let grad_input = grad_input_feedforward + grads; @@ -93,10 +95,9 @@ impl Layer for FeedForward { } fn forward(&mut self, input: &Array2) -> Array2 { - let hidden_pre_activation = input.dot(&self.w1) + &self.b1; let hidden_post_activation = hidden_pre_activation.mapv(|x| x.max(0.0)); // ReLU - + let output = hidden_post_activation.dot(&self.w2) + &self.b2; // Cache values @@ -106,4 +107,4 @@ impl Layer for FeedForward { output + input // residual connection (no LayerNorm here) } -} \ No newline at end of file +} diff --git a/src/layer_norm.rs b/src/layer_norm.rs index 7277f7f..f326641 100644 --- a/src/layer_norm.rs +++ b/src/layer_norm.rs @@ -1,10 +1,10 @@ use crate::adam::Adam; +use crate::llm::Layer; use ndarray::Array2; use ndarray::Axis; -use crate::llm::Layer; pub struct LayerNorm { - epsilon: f32, // Small constant for stability + epsilon: f32, // Small constant for stability gamma: Array2, // Learnable scaling parameter beta: Array2, // Learnable bias parameter @@ -58,32 +58,39 @@ impl Layer for LayerNorm { let input = self.cached_input.as_ref().unwrap(); let mean = self.cached_mean.as_ref().unwrap(); let std = self.cached_std.as_ref().unwrap(); - + let normalized = (input - mean) / (std + self.epsilon); let n_features = input.shape()[1] as f32; // Number of features per token - + // Gradients w.r.t. gamma and beta let grad_gamma = (&normalized * grads).sum_axis(Axis(0)).insert_axis(Axis(0)); let grad_beta = grads.sum_axis(Axis(0)).insert_axis(Axis(0)); - + // Gradient w.r.t. 
normalized values let grad_normalized = &self.gamma * grads; - + // LayerNorm backward pass with full chain rule let grad_input = { let variance = std * std + self.epsilon; - let grad_var = (&grad_normalized * &normalized).sum_axis(Axis(1)).insert_axis(Axis(1)) * (-0.5) / variance.mapv(|x| x * x.sqrt()); - let grad_mean = grad_normalized.sum_axis(Axis(1)).insert_axis(Axis(1)) * (-1.0) / (std + self.epsilon) + &grad_var * (input - mean).sum_axis(Axis(1)).insert_axis(Axis(1)) * (-2.0) / n_features; - - &grad_normalized / (std + self.epsilon) + - &grad_var * 2.0 * (input - mean) / n_features + - &grad_mean / n_features + let grad_var = (&grad_normalized * &normalized) + .sum_axis(Axis(1)) + .insert_axis(Axis(1)) + * (-0.5) + / variance.mapv(|x| x * x.sqrt()); + let grad_mean = grad_normalized.sum_axis(Axis(1)).insert_axis(Axis(1)) * (-1.0) + / (std + self.epsilon) + + &grad_var * (input - mean).sum_axis(Axis(1)).insert_axis(Axis(1)) * (-2.0) + / n_features; + + &grad_normalized / (std + self.epsilon) + + &grad_var * 2.0 * (input - mean) / n_features + + &grad_mean / n_features }; - + // Update learnable parameters self.optimizer_gamma.step(&mut self.gamma, &grad_gamma, lr); self.optimizer_beta.step(&mut self.beta, &grad_beta, lr); - + grad_input } } diff --git a/src/lib.rs b/src/lib.rs index b9a7782..4f09a52 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,19 +1,19 @@ -pub mod llm; +pub mod adam; pub mod embeddings; -pub mod vocab; -pub mod transformer; pub mod feed_forward; -pub mod self_attention; -pub mod output_projection; -pub mod adam; pub mod layer_norm; +pub mod llm; +pub mod output_projection; +pub mod self_attention; +pub mod transformer; +pub mod vocab; // Re-export key structs for easier access -pub use vocab::Vocab; pub use embeddings::Embeddings; pub use llm::LLM; pub use llm::Layer; +pub use vocab::Vocab; // Constants pub const MAX_SEQ_LEN: usize = 40; pub const EMBEDDING_DIM: usize = 32; -pub const HIDDEN_DIM: usize = 32; \ No newline at end of file +pub const HIDDEN_DIM: usize = 32; diff --git a/src/llm.rs b/src/llm.rs index 89be934..7ddfaae 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -1,12 +1,12 @@ -use ndarray::Array1; -use ndarray::{Array2, Axis}; -use crate::transformer::TransformerBlock; -use crate::Embeddings; -use crate::Vocab; -use crate::output_projection::OutputProjection; use crate::EMBEDDING_DIM; +use crate::Embeddings; use crate::HIDDEN_DIM; use crate::MAX_SEQ_LEN; +use crate::Vocab; +use crate::output_projection::OutputProjection; +use crate::transformer::TransformerBlock; +use ndarray::Array1; +use ndarray::{Array2, Axis}; use std::cmp::Ordering; pub trait Layer { fn layer_type(&self) -> &str; @@ -16,6 +16,7 @@ pub trait Layer { fn backward(&mut self, grads: &Array2, lr: f32) -> Array2; } +#[allow(clippy::upper_case_acronyms)] pub struct LLM { pub vocab: Vocab, pub network: Vec>, @@ -38,16 +39,17 @@ impl Default for LLM { impl LLM { pub fn new(vocab: Vocab, network: Vec>) -> Self { - Self { - vocab, - network - } + Self { vocab, network } } } impl LLM { pub fn network_description(&self) -> String { - self.network.iter().map(|layer| layer.layer_type()).collect::>().join(", ") + self.network + .iter() + .map(|layer| layer.layer_type()) + .collect::>() + .join(", ") } pub fn predict(&mut self, text: &str) -> String { @@ -59,7 +61,10 @@ impl LLM { } // Convert token_ids to strings - let token_strs = output_tokens.iter().map(|t| self.vocab.decode[t].clone()).collect::>(); + let token_strs = output_tokens + .iter() + .map(|t| self.vocab.decode[t].clone()) + 
.collect::>(); token_strs.join(" ") } @@ -75,12 +80,12 @@ impl LLM { } let input_len = tokenized.len(); - + // Prevent overflow if input_len >= MAX_SEQ_LEN if input_len >= MAX_SEQ_LEN { return output_tokens; } - + for _ in 0..(MAX_SEQ_LEN - input_len) { // let tokenized_clone = tokenized.clone(); @@ -92,21 +97,25 @@ impl LLM { let token_input = Array2::from_shape_vec( (1, tokenized.len()), tokenized.iter().map(|&x| x as f32).collect(), - ).unwrap(); + ) + .unwrap(); let mut input = token_input; - + for layer in &mut self.network { input = layer.forward(&input); } let logits = input; - + // Safety check: ensure we have at least one token if logits.shape()[0] == 0 { break; } - - let last_logit = logits.row(logits.shape()[0] - 1).to_owned().insert_axis(Axis(0)); + + let last_logit = logits + .row(logits.shape()[0] - 1) + .to_owned() + .insert_axis(Axis(0)); // Softmax - convert activiations of each token to a probability distribution over the vocabulary let probs = Self::softmax(&last_logit); // 1 x vocab_size @@ -114,12 +123,14 @@ impl LLM { // Greedy Decode - Choose the highest probability token for each position let tokens = Self::greedy_decode(&probs); - let next_token = tokens[tokens.len() - 1]; + let next_token = tokens[tokens.len() - 1]; output_tokens.push(next_token); tokenized.push(next_token); - if next_token == self.vocab.encode("").unwrap() { break; } + if next_token == self.vocab.encode("").unwrap() { + break; + } } output_tokens @@ -128,13 +139,15 @@ impl LLM { pub fn train(&mut self, data: Vec<&str>, epochs: usize, lr: f32) { let tokenized_data = data .iter() - .map(|input| (self.tokenize(input))) + .map(|input| self.tokenize(input)) .collect::>>(); for epoch in 0..epochs { let mut total_loss = 0.0; for training_row in &tokenized_data { - if training_row.len() < 2 { continue; } + if training_row.len() < 2 { + continue; + } // 1. 
Slice input and targets let input_ids = &training_row[..training_row.len() - 1]; // Exclude the last token @@ -142,9 +155,11 @@ impl LLM { // Forward pass let mut input: Array2 = Array2::zeros((1, input_ids.len())); - input.row_mut(0).assign(&input_ids.iter().map(|&x| x as f32).collect::>()); + input + .row_mut(0) + .assign(&input_ids.iter().map(|&x| x as f32).collect::>()); - for layer in &mut self.network { + for layer in &mut self.network { input = layer.forward(&input); } @@ -155,10 +170,10 @@ impl LLM { // Backward pass let mut grads_output = Self::compute_gradients_step(&probs, target_ids); // this is d_L/d_output_projection - + // Apply gradient clipping BEFORE backpropagation Self::clip_gradients(&mut grads_output, 5.0); - + for layer in self.network.iter_mut().rev() { grads_output = layer.backward(&grads_output, lr); } @@ -166,17 +181,23 @@ impl LLM { let tokens = Self::greedy_decode(&probs); let next_token = tokens[tokens.len() - 1]; - if next_token == self.vocab.encode("").unwrap() { continue; } + if next_token == self.vocab.encode("").unwrap() { + continue; + } } - - println!("Epoch {}: Loss = {:.4}", epoch, total_loss / tokenized_data.len() as f32); + + println!( + "Epoch {}: Loss = {:.4}", + epoch, + total_loss / tokenized_data.len() as f32 + ); } } pub fn tokenize(&self, text: &str) -> Vec { // Split by whitespace first let mut tokens = Vec::new(); - + for word in text.split_whitespace() { // Special case for end token if word == "" { @@ -185,9 +206,9 @@ impl LLM { } continue; } - + let mut current_word = String::new(); - + for c in word.chars() { if c.is_ascii_punctuation() { // If we have a word before the punctuation, add it @@ -197,7 +218,7 @@ impl LLM { } current_word.clear(); } - + // Add the punctuation as its own token if let Some(token_id) = self.vocab.encode(&c.to_string()) { tokens.push(token_id); @@ -206,46 +227,49 @@ impl LLM { current_word.push(c); } } - + // Add any remaining word - if !current_word.is_empty() { - if let Some(token_id) = self.vocab.encode(¤t_word) { - tokens.push(token_id); - } + if !current_word.is_empty() + && let Some(token_id) = self.vocab.encode(¤t_word) + { + tokens.push(token_id); } } - + tokens } - fn softmax(logits: &Array2) -> Array2 { // logits is seq_len x vocab_size + fn softmax(logits: &Array2) -> Array2 { + // logits is seq_len x vocab_size let mut result = logits.clone(); - + // Apply softmax row-wise for mut row in result.rows_mut() { // Calculate exp for each element let max_val = row.iter().copied().fold(f32::NEG_INFINITY, f32::max); let exp_values: Vec = row.iter().map(|&x| (x - max_val).exp()).collect(); let sum_exp: f32 = exp_values.iter().sum(); - + // Normalize by sum for (i, &exp_val) in exp_values.iter().enumerate() { row[i] = exp_val / sum_exp; } } - + result } fn greedy_decode(probs: &Array2) -> Vec { - probs.map_axis(Axis(1), |row| { - row.iter() - .enumerate() - .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal)) - .map(|(index, _)| index) - .unwrap() - }).to_vec() - } + probs + .map_axis(Axis(1), |row| { + row.iter() + .enumerate() + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal)) + .map(|(index, _)| index) + .unwrap() + }) + .to_vec() + } fn cross_entropy_loss_step(probs: &Array2, target: &[usize]) -> f32 { let mut loss = 0.0; @@ -263,28 +287,28 @@ impl LLM { if probs.shape()[0] != target.len() { panic!("Probs and target must have the same number of rows"); } - + let batch_size = target.len() as f32; - + // Compute correct softmax + cross-entropy gradient: softmax - 
one_hot(target) for row_idx in 0..grads.shape()[0] { grads[[row_idx, target[row_idx]]] -= 1.0; // Convert to: p - y (where y is one-hot) } - + // Normalize by batch size for stable training grads.mapv_inplace(|x| x / batch_size); - + grads } fn clip_gradients(grads: &mut Array2, max_norm: f32) { // Calculate L2 norm of gradients let norm = grads.iter().map(|&x| x * x).sum::().sqrt(); - + // If norm exceeds max_norm, scale gradients down if norm > max_norm { let scale = max_norm / norm; grads.mapv_inplace(|x| x * scale); } } -} \ No newline at end of file +} diff --git a/src/main.rs b/src/main.rs index b08e920..15cdeba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,20 +1,20 @@ use std::io::Write; use embeddings::Embeddings; +use llm::LLM; use output_projection::OutputProjection; use transformer::TransformerBlock; -use llm::LLM; use vocab::Vocab; -mod llm; +mod adam; mod embeddings; -mod vocab; -mod transformer; mod feed_forward; -mod self_attention; -mod output_projection; -mod adam; mod layer_norm; +mod llm; +mod output_projection; +mod self_attention; +mod transformer; +mod vocab; // Use the constants from lib.rs const MAX_SEQ_LEN: usize = 80; @@ -27,10 +27,10 @@ fn main() { // Extract all unique words from training data to create vocabulary let mut vocab_set = std::collections::HashSet::new(); - + // Add end of sequence token vocab_set.insert("".to_string()); - + // Pre-training data - simple text completion patterns let pretraining_data = vec![ "The sun rises in the east and sets in the west ", @@ -72,7 +72,6 @@ fn main() { ("User: How many neurons are in the human brain? Assistant: The human brain contains about 86 billion neurons that transmit information throughout the body "), ("User: What is electricity? Assistant: Electricity is the flow of electrons through a conductor, often used to power devices "), ("User: What is climate change? Assistant: Climate change refers to long-term shifts in temperatures and weather patterns on Earth "), - ("User: How long do oak trees live? Assistant: Oak trees can live for hundreds of years and produce acorns as their fruit "), ("User: What happened to Pluto? Assistant: Pluto was reclassified from a planet to a dwarf planet in 2006 by astronomers "), ("User: How is glass made? Assistant: Glass is made by heating sand, soda ash, and limestone to very high temperatures until they melt "), @@ -83,7 +82,6 @@ fn main() { ("User: What is Jupiter like? Assistant: Jupiter is the largest planet in our solar system and has dozens of moons "), ("User: How fast does light travel? Assistant: Light travels at approximately 299,792 kilometers per second in a vacuum "), ("User: What is gold used for? Assistant: Gold is a dense, soft metal often used in jewelry and electronics due to its conductivity "), - ("User: What covers most of Earth? Assistant: Most of the earth's surface is covered by water, primarily in oceans "), ("User: How do bicycles work? Assistant: Bicycles are an efficient mode of transport that convert human energy into motion through gears "), ("User: How is chocolate made? Assistant: Chocolate is made from roasted and ground cacao seeds, often sweetened and flavored "), @@ -94,7 +92,6 @@ fn main() { ("User: What did the printing press do? Assistant: The printing press revolutionized the spread of information in the 15th century "), ("User: What is sound? Assistant: Sound is a vibration that travels through air, water, or solid materials to reach our ears "), ("User: Why is carbon important? 
Assistant: Carbon is an essential element in organic chemistry, forming the basis of all life on Earth "), - ("User: What was the Library of Alexandria? Assistant: The library of alexandria was one of the most significant libraries of the ancient world "), ("User: What do honeybees do? Assistant: Honeybees play a vital role in pollination, which supports ecosystems and agriculture "), ("User: Are electric cars better? Assistant: Electric vehicles produce less air pollution than traditional gasoline-powered cars "), @@ -105,7 +102,6 @@ fn main() { ("User: How are language models trained? Assistant: Language models are trained using vast amounts of text to learn patterns in language "), ("User: How do compasses work? Assistant: Compasses work by aligning a magnetic needle with the earth's magnetic field "), ("User: What do vaccines do? Assistant: Vaccines help the immune system recognize and fight off specific pathogens "), - // Greeting and conversational patterns ("User: Hello! Assistant: Hello! How can I help you today? "), ("User: Hi! Assistant: Hi there! What can I do for you? "), @@ -121,7 +117,7 @@ fn main() { ("User: I don't understand. Assistant: No problem! Let me try to explain it differently "), ("User: What do you think? Assistant: Based on the information available, I think it's important to consider multiple perspectives "), ]; - + // Process all training examples for vocabulary // First process pre-training data for text in &pretraining_data { @@ -144,7 +140,7 @@ fn main() { } } } - + // Then process chat training data for row in &chat_training_data { // Add words from outputs @@ -167,7 +163,7 @@ fn main() { } } } - + let mut vocab_words: Vec = vocab_set.into_iter().collect(); vocab_words.sort(); // Sort for deterministic ordering let vocab_words_refs: Vec<&str> = vocab_words.iter().map(|s| s.as_str()).collect(); @@ -178,31 +174,42 @@ fn main() { let transformer_block_3 = TransformerBlock::new(EMBEDDING_DIM, HIDDEN_DIM); let output_projection = OutputProjection::new(EMBEDDING_DIM, vocab.words.len()); let embeddings = Embeddings::new(vocab.clone()); - let mut llm = LLM::new(vocab, vec![ - Box::new(embeddings), - Box::new(transformer_block_1), - Box::new(transformer_block_2), - Box::new(transformer_block_3), - Box::new(output_projection), - ]); + let mut llm = LLM::new( + vocab, + vec![ + Box::new(embeddings), + Box::new(transformer_block_1), + Box::new(transformer_block_2), + Box::new(transformer_block_3), + Box::new(output_projection), + ], + ); println!("\n=== MODEL INFORMATION ==="); println!("Network architecture: {}", llm.network_description()); - + println!("\n=== BEFORE TRAINING ==="); println!("Input: {}", string); println!("Output: {}", llm.predict(&string)); - + println!("\n=== PRE-TRAINING MODEL ==="); - println!("Pre-training on {} examples for {} epochs with learning rate {}", - pretraining_data.len(), 100, 0.0005); + println!( + "Pre-training on {} examples for {} epochs with learning rate {}", + pretraining_data.len(), + 100, + 0.0005 + ); llm.train(pretraining_data, 100, 0.0005); - + println!("\n=== INSTRUCTION TUNING ==="); - println!("Instruction tuning on {} examples for {} epochs with learning rate {}", - chat_training_data.len(), 100, 0.0001); + println!( + "Instruction tuning on {} examples for {} epochs with learning rate {}", + chat_training_data.len(), + 100, + 0.0001 + ); llm.train(chat_training_data, 100, 0.0001); // Much lower learning rate for stability - + println!("\n=== AFTER TRAINING ==="); println!("Input: {}", string); let result = 
llm.predict(&string); @@ -213,26 +220,28 @@ fn main() { println!("\n--- Interactive Mode ---"); println!("Type a prompt and press Enter to generate text."); println!("Type 'exit' to quit."); - + let mut input = String::new(); loop { // Clear the input string input.clear(); - + // Prompt for user input print!("\nEnter prompt: "); std::io::stdout().flush().unwrap(); - + // Read user input - std::io::stdin().read_line(&mut input).expect("Failed to read input"); - + std::io::stdin() + .read_line(&mut input) + .expect("Failed to read input"); + // Trim whitespace and check for exit command let trimmed_input = input.trim(); if trimmed_input.eq_ignore_ascii_case("exit") { println!("Exiting interactive mode."); break; } - + // Generate prediction based on user input with "User:" prefix let formatted_input = format!("User: {}", trimmed_input); let prediction = llm.predict(&formatted_input); diff --git a/src/output_projection.rs b/src/output_projection.rs index 4054bcb..9c2be18 100644 --- a/src/output_projection.rs +++ b/src/output_projection.rs @@ -1,13 +1,13 @@ use ndarray::{Array2, Axis}; -use rand_distr::{Normal, Distribution}; +use rand_distr::{Distribution, Normal}; use crate::{adam::Adam, llm::Layer}; pub struct OutputProjection { - pub w_out: Array2, // Weight matrix - pub b_out: Array2, // Bias vector - pub optimizer: Adam, - pub cached_input: Option>, + pub w_out: Array2, // Weight matrix + pub b_out: Array2, // Bias vector + pub optimizer: Adam, + pub cached_input: Option>, } impl OutputProjection { @@ -17,7 +17,7 @@ impl OutputProjection { // Xavier/He initialization: std = sqrt(2 / fan_in) let std = (2.0 / embedding_dim as f32).sqrt(); let normal = Normal::new(0.0, std).unwrap(); - + OutputProjection { w_out: Array2::from_shape_fn((embedding_dim, vocab_size), |_| normal.sample(&mut rng)), b_out: Array2::zeros((1, vocab_size)), @@ -33,12 +33,14 @@ impl Layer for OutputProjection { } /// Forward pass: project embeddings to vocab logits - fn forward(&mut self, input: &Array2) -> Array2 { // input shape is [sequence_length, embedding_dim] + fn forward(&mut self, input: &Array2) -> Array2 { + // input shape is [sequence_length, embedding_dim] self.cached_input = Some(input.clone()); input.dot(&self.w_out) + &self.b_out // shape is [sequence_length, vocab_size] } - fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { // grads shape is [sequence_length, vocab_size] + fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { + // grads shape is [sequence_length, vocab_size] let input = self.cached_input.as_ref().unwrap(); let grad_w_out = input.t().dot(grads); let grad_b_out = grads.mean_axis(Axis(0)).unwrap(); @@ -50,4 +52,4 @@ impl Layer for OutputProjection { grad_input } -} \ No newline at end of file +} diff --git a/src/self_attention.rs b/src/self_attention.rs index a485176..b96512f 100644 --- a/src/self_attention.rs +++ b/src/self_attention.rs @@ -1,8 +1,8 @@ -use crate::adam::Adam; use crate::EMBEDDING_DIM; -use ndarray::Array2; -use rand_distr::{Normal, Distribution}; +use crate::adam::Adam; use crate::llm::Layer; +use ndarray::Array2; +use rand_distr::{Distribution, Normal}; use std::f32; pub struct SelfAttention { @@ -23,7 +23,6 @@ impl Default for SelfAttention { SelfAttention::new(EMBEDDING_DIM) } } - impl SelfAttention { /// Initializes a Transformer with random Q, K, V weights @@ -32,7 +31,7 @@ impl SelfAttention { // Xavier/He initialization: std = sqrt(2 / fan_in) let std = (2.0 / embedding_dim as f32).sqrt(); let normal = Normal::new(0.0, std).unwrap(); - + 
SelfAttention { embedding_dim, w_q: Array2::from_shape_fn((embedding_dim, embedding_dim), |_| normal.sample(&mut rng)), @@ -72,34 +71,33 @@ impl SelfAttention { fn softmax(&self, scores: &Array2) -> Array2 { let mut result = scores.clone(); - + // Apply softmax row-wise for mut row in result.rows_mut() { let max_val = row.iter().max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); // Calculate exp for each element let exp_values: Vec = row.iter().map(|&x| (x - max_val).exp()).collect(); let sum_exp: f32 = exp_values.iter().sum(); - + // Normalize by sum for (i, &exp_val) in exp_values.iter().enumerate() { row[i] = exp_val / sum_exp; } } - + result } fn softmax_backward( - softmax_output: &Array2, // shape: [seq_len, vocab_size] - grad_output: &Array2, // shape: [seq_len, vocab_size] + softmax_output: &Array2, // shape: [seq_len, vocab_size] + grad_output: &Array2, // shape: [seq_len, vocab_size] ) -> Array2 { let mut grad_input = softmax_output.clone(); // to hold the result - - for ((mut grad_row, softmax_row), grad_out_row) in - grad_input - .outer_iter_mut() - .zip(softmax_output.outer_iter()) - .zip(grad_output.outer_iter()) + + for ((mut grad_row, softmax_row), grad_out_row) in grad_input + .outer_iter_mut() + .zip(softmax_output.outer_iter()) + .zip(grad_output.outer_iter()) { // dot product: y ⊙ dL/dy let dot = softmax_row @@ -107,7 +105,7 @@ impl SelfAttention { .zip(grad_out_row.iter()) .map(|(&y_i, &dy_i)| y_i * dy_i) .sum::(); - + for ((g, &y_i), &dy_i) in grad_row .iter_mut() .zip(softmax_row.iter()) @@ -116,7 +114,7 @@ impl SelfAttention { *g = y_i * (dy_i - dot); } } - + grad_input } } @@ -140,9 +138,9 @@ impl Layer for SelfAttention { let v = input.dot(&self.w_v); let dk = self.w_q.shape()[1] as f32; let scale = dk.sqrt(); - + let mut scores = q.dot(&k.t()) / scale; - + // Apply causal masking - prevent attention to future tokens let seq_len = scores.shape()[0]; for i in 0..seq_len { @@ -150,40 +148,38 @@ impl Layer for SelfAttention { scores[[i, j]] = f32::NEG_INFINITY; } } - + let attn_weights = self.softmax(&scores); // also cached - + // Step 1: grads = ∂L/∂attn_output let grad_attn_weights = grads.dot(&v.t()); let grad_v = attn_weights.t().dot(grads); - + // Step 2: softmax backward let grad_scores = SelfAttention::softmax_backward(&attn_weights, &grad_attn_weights); // [seq_len, seq_len] - + // Step 3: ∂L/∂Q and ∂L/∂K let grad_q = grad_scores.dot(&k); let grad_k = grad_scores.t().dot(&q); - + // Step 4: ∂L/∂W_q/W_k/W_v let grad_w_q = input.t().dot(&grad_q); let grad_w_k = input.t().dot(&grad_k); let grad_w_v = input.t().dot(&grad_v); - + // Step 5: ∂L/∂input (gradient through attention computation) let grad_input_attention = - grad_q.dot(&self.w_q.t()) + - grad_k.dot(&self.w_k.t()) + - grad_v.dot(&self.w_v.t()); - - // Step 6: Add gradient from residual connection + grad_q.dot(&self.w_q.t()) + grad_k.dot(&self.w_k.t()) + grad_v.dot(&self.w_v.t()); + + // Step 6: Add gradient from residual connection // Forward: residual = attention + input, so gradient flows directly through let grad_input = grad_input_attention + grads; - + // Step 7: update weights self.optimizer_w_q.step(&mut self.w_q, &grad_w_q, lr); self.optimizer_w_k.step(&mut self.w_k, &grad_w_k, lr); self.optimizer_w_v.step(&mut self.w_v, &grad_w_v, lr); - - grad_input + + grad_input } -} \ No newline at end of file +} diff --git a/src/transformer.rs b/src/transformer.rs index aa1c613..0beef93 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -1,7 +1,7 @@ -use crate::self_attention::SelfAttention; use 
crate::feed_forward::FeedForward; use crate::layer_norm::LayerNorm; use crate::llm::Layer; +use crate::self_attention::SelfAttention; use ndarray::Array2; pub struct TransformerBlock { attention: SelfAttention, @@ -27,29 +27,27 @@ impl Layer for TransformerBlock { } fn forward(&mut self, input: &Array2) -> Array2 { - // Standard Transformer architecture: attention + norm -> feedforward + norm + // Standard Transformer architecture: attention + norm -> feedforward + norm let attention_out = self.attention.forward(input); // includes residual let norm1_out = self.norm1.normalize(&attention_out); - + let feed_forward_out = self.feed_forward.forward(&norm1_out); // includes residual - let norm2_out = self.norm2.normalize(&feed_forward_out); - - norm2_out + + self.norm2.normalize(&feed_forward_out) } - + fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { // Backward through second LayerNorm let grad_norm2 = self.norm2.backward(grads, lr); - + // Backward through feed-forward (includes residual connection) let grad_ffn = self.feed_forward.backward(&grad_norm2, lr); - - // Backward through first LayerNorm + + // Backward through first LayerNorm let grad_norm1 = self.norm1.backward(&grad_ffn, lr); - + // Backward through attention (includes residual connection) - let grad_attn = self.attention.backward(&grad_norm1, lr); - grad_attn - } + self.attention.backward(&grad_norm1, lr) + } } diff --git a/src/vocab.rs b/src/vocab.rs index 7cb7693..50b1aeb 100644 --- a/src/vocab.rs +++ b/src/vocab.rs @@ -22,7 +22,11 @@ impl Vocab { decode.insert(i, word.to_string()); } - Vocab { encode, decode, words: words.iter().map(|w| w.to_string()).collect() } + Vocab { + encode, + decode, + words: words.iter().map(|w| w.to_string()).collect(), + } } /// Convert a word to its token index @@ -39,4 +43,4 @@ impl Vocab { pub fn default_words() -> Vec<&'static str> { vec!["hello", "world", "this", "is", "rust", ""] } -} \ No newline at end of file +} diff --git a/tests/adam_test.rs b/tests/adam_test.rs index 1045625..6aa509c 100644 --- a/tests/adam_test.rs +++ b/tests/adam_test.rs @@ -1,11 +1,11 @@ -use ndarray::Array2; use llm::adam::Adam; +use ndarray::Array2; #[test] fn test_adam_initialization() { let shape = [2, 3]; let adam = Adam::new((2, 3)); - + // Check if momentum and velocity matrices are initialized to zeros assert_eq!(adam.m.shape(), shape); assert_eq!(adam.v.shape(), shape); @@ -20,16 +20,16 @@ fn test_adam_step() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::ones(shape); - + // Store initial parameters let initial_params = params.clone(); - + // Perform optimization step adam.step(&mut params, &grads, lr); - + // Parameters should have changed assert_ne!(params, initial_params); - + // Parameters should have decreased (since gradients are positive) assert!(params.iter().all(|&x| x < 1.0)); } @@ -41,15 +41,15 @@ fn test_adam_multiple_steps() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::ones(shape); - + // Store initial parameters let initial_params = params.clone(); - + // Perform multiple optimization steps for _ in 0..10 { adam.step(&mut params, &grads, lr); } - + // Parameters should have changed more significantly assert!(params.iter().all(|&x| x < initial_params[[0, 0]])); } @@ -61,13 +61,13 @@ fn test_adam_with_zero_gradients() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::zeros(shape); - + // Store initial parameters let initial_params = params.clone(); 
- + // Perform optimization step with zero gradients adam.step(&mut params, &grads, lr); - + // Parameters should not change with zero gradients assert_eq!(params, initial_params); } @@ -79,10 +79,10 @@ fn test_adam_with_negative_gradients() { let mut adam = Adam::new(shape); let mut params = Array2::ones(shape); let grads = Array2::from_shape_fn(shape, |_| -1.0); - + // Perform optimization step adam.step(&mut params, &grads, lr); - + // Parameters should have increased (since gradients are negative) assert!(params.iter().all(|&x| x > 1.0)); -} \ No newline at end of file +} diff --git a/tests/embeddings_test.rs b/tests/embeddings_test.rs index 931d797..18d7462 100644 --- a/tests/embeddings_test.rs +++ b/tests/embeddings_test.rs @@ -1,7 +1,7 @@ -use llm::{Embeddings, Vocab, Layer, EMBEDDING_DIM, MAX_SEQ_LEN}; +use llm::{EMBEDDING_DIM, Embeddings, Layer, MAX_SEQ_LEN, Vocab}; #[test] -fn test_embeddings_creation() { +fn test_embeddings_creation() { // Create with custom vocab let words = vec!["hello", "world", "test", ""]; let _vocab = Vocab::new(words); // Fix unused variable warning @@ -13,18 +13,18 @@ fn test_embed_tokens() { let words = vec!["hello", "world", "test", ""]; let vocab = Vocab::new(words); let embeddings = Embeddings::new(vocab.clone()); - + // Test embedding a single token let token_ids = vec![0]; // "hello" let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [1, EMBEDDING_DIM]); - + // Test embedding multiple tokens let token_ids = vec![0, 1, 2]; // "hello world test" let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [3, EMBEDDING_DIM]); } @@ -35,21 +35,21 @@ fn test_positional_embeddings() { let words = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]; let vocab = Vocab::new(words); let embeddings = Embeddings::new(vocab); - + // Test with different sequence lengths for seq_len in 1..5 { let token_ids = vec![0; seq_len]; // Repeat token 0 seq_len times let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [seq_len, EMBEDDING_DIM]); - + // Verify that embeddings for the same token at different positions are different // (due to positional embeddings being added) if seq_len > 1 { let first_pos = embedded.row(0).to_owned(); let second_pos = embedded.row(1).to_owned(); - + // They should be different due to positional encoding assert_ne!(first_pos, second_pos); } @@ -61,14 +61,14 @@ fn test_max_sequence_length() { // Create vocab and embeddings let vocab = Vocab::default(); let embeddings = Embeddings::new(vocab); - + // Create a sequence at the maximum length let token_ids = vec![0; MAX_SEQ_LEN]; let embedded = embeddings.embed_tokens(&token_ids); - + // Check dimensions assert_eq!(embedded.shape(), [MAX_SEQ_LEN, EMBEDDING_DIM]); -} +} #[test] fn test_embedding_backwards() { @@ -83,7 +83,7 @@ fn test_embedding_backwards() { use ndarray::Array2; let input = Array2::from_shape_vec((1, 3), vec![0.0, 1.0, 2.0]).unwrap(); let _output = embeddings.forward(&input); - + // Create some dummy gradients and run backward pass let grads = Array2::from_shape_vec((3, EMBEDDING_DIM), vec![0.1; 3 * EMBEDDING_DIM]).unwrap(); let _grad_input = embeddings.backward(&grads, 0.01); @@ -92,5 +92,8 @@ fn test_embedding_backwards() { let post_train_position_embeddings = embeddings.positional_embeddings.clone(); assert_ne!(pre_train_token_embeddings, post_train_token_embeddings); - assert_ne!(pre_train_position_embeddings, 
post_train_position_embeddings); -} \ No newline at end of file + assert_ne!( + pre_train_position_embeddings, + post_train_position_embeddings + ); +} diff --git a/tests/feed_forward_test.rs b/tests/feed_forward_test.rs index 6530642..db5f675 100644 --- a/tests/feed_forward_test.rs +++ b/tests/feed_forward_test.rs @@ -1,18 +1,18 @@ -use llm::{Layer, EMBEDDING_DIM, HIDDEN_DIM}; -use ndarray::Array2; use llm::feed_forward::FeedForward; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use ndarray::Array2; #[test] fn test_feed_forward_forward() { // Create feed-forward module let mut feed_forward = FeedForward::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = feed_forward.forward(&input); - + // Check output shape - should be same as input assert_eq!(output.shape(), input.shape()); } @@ -21,28 +21,28 @@ fn test_feed_forward_forward() { fn test_feed_forward_with_different_sequence_lengths() { // Create feed-forward module let mut feed_forward = FeedForward::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Test with different sequence lengths for seq_len in 1..5 { // Create input tensor let input = Array2::ones((seq_len, EMBEDDING_DIM)); - + // Test forward pass let output = feed_forward.forward(&input); - + // Check output shape assert_eq!(output.shape(), [seq_len, EMBEDDING_DIM]); } -} +} #[test] fn test_feed_forward_and_backward() { // Create feed-forward module let mut feed_forward = FeedForward::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = feed_forward.forward(&input); @@ -53,4 +53,4 @@ fn test_feed_forward_and_backward() { // Make sure backward pass modifies the input assert_ne!(output, grad_input); -} \ No newline at end of file +} diff --git a/tests/llm_test.rs b/tests/llm_test.rs index 3530099..4abb749 100644 --- a/tests/llm_test.rs +++ b/tests/llm_test.rs @@ -1,7 +1,7 @@ -use llm::{LLM, Vocab, Layer}; +use llm::EMBEDDING_DIM; use llm::Embeddings; use llm::output_projection::OutputProjection; -use llm::EMBEDDING_DIM; +use llm::{LLM, Layer, Vocab}; use ndarray::Array2; struct TestOutputProjectionLayer { @@ -43,7 +43,7 @@ impl Layer for TestOutputProjectionLayer { let grad_input = input.dot(grads); self.cached_grads = Some(grad_input.clone()); - return grad_input + grad_input } } @@ -64,14 +64,15 @@ impl TestOutputProjectionLayer { fn test_llm_tokenize() { let vocab = Vocab::default(); let vocab_size = vocab.encode.len(); - let llm = LLM::new(vocab, vec![ - Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size)) - ]); - + let llm = LLM::new( + vocab, + vec![Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size))], + ); + // Test tokenization let tokens = llm.tokenize("hello world"); assert!(!tokens.is_empty()); - + // Test that tokens can be decoded back for token in tokens { assert!(llm.vocab.decode(token).is_some()); @@ -82,10 +83,11 @@ fn test_llm_tokenize() { fn test_llm_predict() { let vocab = Vocab::default(); let vocab_size = vocab.encode.len(); - let mut llm = LLM::new(vocab.clone(), vec![ - Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size)) - ]); - + let mut llm = LLM::new( + vocab.clone(), + vec![Box::new(TestOutputProjectionLayer::new(5, 5, vocab_size))], + ); + // Test prediction let input_text = "hello world this is rust"; let input_tokens = llm.tokenize(input_text); @@ -93,25 
+95,24 @@ fn test_llm_predict() { assert!(!result.is_empty()); // Build expected output - let mut expected_tokens = vec![0; input_tokens.len()].iter().map(|x| vocab.decode[x].clone()).collect::>(); + let mut expected_tokens = vec![0; input_tokens.len()] + .iter() + .map(|x| vocab.decode[x].clone()) + .collect::>(); expected_tokens.push("".to_string()); let expected_output = expected_tokens.join(" "); - assert_eq!(result, expected_output); -} + assert_eq!(result, expected_output); +} #[test] fn test_llm_train() { let vocab = Vocab::default(); let vocab_size = vocab.encode.len(); let layer = Box::new(TestOutputProjectionLayer::new(5, 1, vocab_size)); - let mut llm = LLM::new(vocab.clone(), vec![ - layer - ]); + let mut llm = LLM::new(vocab.clone(), vec![layer]); - let training_data = vec![ - "hello world this is rust.", - ]; + let training_data = vec!["hello world this is rust."]; llm.train(training_data, 10, 0.01); } @@ -124,13 +125,8 @@ fn test_llm_integration() { let embeddings = Box::new(Embeddings::new(vocab.clone())); let output_projection = Box::new(OutputProjection::new(EMBEDDING_DIM, vocab_size)); - let mut llm = LLM::new(vocab.clone(), vec![ - embeddings, - output_projection - ]); + let mut llm = LLM::new(vocab.clone(), vec![embeddings, output_projection]); let input_text = "hello world this is rust"; - llm.train(vec![ - input_text - ], 10, 0.01); -} \ No newline at end of file + llm.train(vec![input_text], 10, 0.01); +} diff --git a/tests/output_projection_test.rs b/tests/output_projection_test.rs index 63997b6..a0f14c1 100644 --- a/tests/output_projection_test.rs +++ b/tests/output_projection_test.rs @@ -1,18 +1,18 @@ -use llm::{Layer, EMBEDDING_DIM}; -use ndarray::Array2; use llm::output_projection::OutputProjection; +use llm::{EMBEDDING_DIM, Layer}; +use ndarray::Array2; #[test] fn test_output_projection_creation() { let vocab_size = 10; let output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Check weight matrix dimensions assert_eq!(output_proj.w_out.shape(), [EMBEDDING_DIM, vocab_size]); - + // Check bias vector dimensions assert_eq!(output_proj.b_out.shape(), [1, vocab_size]); - + // Check optimizer dimensions assert_eq!(output_proj.optimizer.m.shape(), [EMBEDDING_DIM, vocab_size]); assert_eq!(output_proj.optimizer.v.shape(), [EMBEDDING_DIM, vocab_size]); @@ -22,13 +22,13 @@ fn test_output_projection_creation() { fn test_output_projection_forward() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = output_proj.forward(&input); - + // Check output shape - should be [seq_len, vocab_size] assert_eq!(output.shape(), [3, vocab_size]); } @@ -37,15 +37,15 @@ fn test_output_projection_forward() { fn test_output_projection_with_different_sequence_lengths() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Test with different sequence lengths for seq_len in 1..5 { // Create input tensor let input = Array2::ones((seq_len, EMBEDDING_DIM)); - + // Test forward pass let output = output_proj.forward(&input); - + // Check output shape assert_eq!(output.shape(), [seq_len, vocab_size]); } @@ -55,30 +55,30 @@ fn test_output_projection_with_different_sequence_lengths() { fn test_output_projection_backward() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Create 
input tensor let input = Array2::ones((3, EMBEDDING_DIM)); - + // Forward pass first (required to cache input) let _output = output_proj.forward(&input); - + // Create gradient tensor let grads = Array2::ones((3, vocab_size)); - + // Test backward pass let grad_input = output_proj.backward(&grads, 0.01); - + // Check gradient input shape assert_eq!(grad_input.shape(), [3, EMBEDDING_DIM]); - + // Verify that parameters were updated let w_out_before = output_proj.w_out.clone(); let b_out_before = output_proj.b_out.clone(); - + // Run another forward and backward pass let _output = output_proj.forward(&input); let _grad_input = output_proj.backward(&grads, 0.01); - + // Check that parameters changed assert_ne!(output_proj.w_out, w_out_before); assert_ne!(output_proj.b_out, b_out_before); @@ -88,24 +88,24 @@ fn test_output_projection_backward() { fn test_output_projection_training() { let vocab_size = 10; let mut output_proj = OutputProjection::new(EMBEDDING_DIM, vocab_size); - + // Create input tensor let input = Array2::ones((3, EMBEDDING_DIM)); - + // Run multiple training steps for _ in 0..5 { // Forward pass let _output = output_proj.forward(&input); - + // Create gradient tensor (simulating cross-entropy loss gradients) let mut grads = Array2::zeros((3, vocab_size)); grads[[0, 0]] = 1.0; // Set gradient for first token - + // Backward pass let _grad_input = output_proj.backward(&grads, 0.01); } - + // Verify that parameters were updated assert_ne!(output_proj.w_out.sum(), 0.0); assert_ne!(output_proj.b_out.sum(), 0.0); -} \ No newline at end of file +} diff --git a/tests/self_attention_test.rs b/tests/self_attention_test.rs index cd08341..009c7e4 100644 --- a/tests/self_attention_test.rs +++ b/tests/self_attention_test.rs @@ -1,18 +1,18 @@ -use llm::{Layer, EMBEDDING_DIM}; -use ndarray::Array2; use llm::self_attention::SelfAttention; +use llm::{EMBEDDING_DIM, Layer}; +use ndarray::Array2; #[test] fn test_self_attention_forward() { // Create self-attention module let mut self_attention = SelfAttention::new(EMBEDDING_DIM); - + // Create input tensor (batch_size=1, seq_len=3, embedding_dim=EMBEDDING_DIM) let input = Array2::ones((3, EMBEDDING_DIM)); - + // Test forward pass let output = self_attention.forward(&input); - + // Check output shape - should be same as input assert_eq!(output.shape(), input.shape()); } @@ -21,16 +21,16 @@ fn test_self_attention_forward() { fn test_self_attention_with_different_sequence_lengths() { // Create self-attention module let mut self_attention = SelfAttention::new(EMBEDDING_DIM); - + // Test with different sequence lengths for seq_len in 1..5 { // Create input tensor let input = Array2::ones((seq_len, EMBEDDING_DIM)); - + // Test forward pass let output = self_attention.forward(&input); - + // Check output shape assert_eq!(output.shape(), [seq_len, EMBEDDING_DIM]); } -} \ No newline at end of file +} diff --git a/tests/transformer_test.rs b/tests/transformer_test.rs index 366ca59..c198915 100644 --- a/tests/transformer_test.rs +++ b/tests/transformer_test.rs @@ -1,17 +1,17 @@ -use llm::{Layer, EMBEDDING_DIM, HIDDEN_DIM}; -use ndarray::Array2; use llm::transformer::TransformerBlock; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use ndarray::Array2; #[test] fn test_transformer_block() { let mut transformer = TransformerBlock::new(EMBEDDING_DIM, HIDDEN_DIM); - + // Create a simple input tensor let input = Array2::ones((1, EMBEDDING_DIM)); - + // Test forward pass let output = transformer.forward(&input); - + // Check output shape assert_eq!(output.shape(), 
[1, EMBEDDING_DIM]); -} \ No newline at end of file +} diff --git a/tests/vocab_test.rs b/tests/vocab_test.rs index b8f1adf..131888f 100644 --- a/tests/vocab_test.rs +++ b/tests/vocab_test.rs @@ -4,12 +4,12 @@ use llm::Vocab; fn test_vocab_encode_decode() { let words = vec!["hello", "world", "this", "is", "rust", ""]; let vocab = Vocab::new(words); - + // Test encoding assert_eq!(vocab.encode("hello"), Some(0)); assert_eq!(vocab.encode("world"), Some(1)); assert_eq!(vocab.encode("unknown"), None); - + // Test decoding assert_eq!(vocab.decode(0).map(|s| s.as_str()), Some("hello")); assert_eq!(vocab.decode(1).map(|s| s.as_str()), Some("world")); @@ -19,9 +19,9 @@ fn test_vocab_encode_decode() { #[test] fn test_vocab_default() { let vocab = Vocab::default(); - + // Test that default vocab contains expected words assert!(vocab.encode("hello").is_some()); assert!(vocab.encode("world").is_some()); assert!(vocab.encode("").is_some()); -} \ No newline at end of file +} From 11cf58020ec02dd239c9475650e5dc514f5b160a Mon Sep 17 00:00:00 2001 From: ben1009 Date: Mon, 22 Sep 2025 16:48:41 +0800 Subject: [PATCH 2/9] chore: housekeeping --- src/transformer.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformer.rs b/src/transformer.rs index 04a9d4b..0beef93 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -32,9 +32,8 @@ impl Layer for TransformerBlock { let norm1_out = self.norm1.normalize(&attention_out); let feed_forward_out = self.feed_forward.forward(&norm1_out); // includes residual - let norm2_out = self.norm2.normalize(&feed_forward_out); - norm2_out + self.norm2.normalize(&feed_forward_out) } fn backward(&mut self, grads: &Array2, lr: f32) -> Array2 { @@ -49,6 +48,6 @@ impl Layer for TransformerBlock { // Backward through attention (includes residual connection) - grad_attn + self.attention.backward(&grad_norm1, lr) } } From 43b70a5d699bce3979729cb6f2b910c8581d5ec5 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 23 Sep 2025 10:17:03 +0800 Subject: [PATCH 3/9] add more gha --- .github/codecov.yml | 21 +++++++++++ .github/workflows/check.yml | 73 +++++++++++++++++++++++++++++++++++++ .github/workflows/rust.yml | 23 ------------ .github/workflows/test.yml | 67 ++++++++++++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 23 deletions(-) create mode 100644 .github/codecov.yml create mode 100644 .github/workflows/check.yml delete mode 100644 .github/workflows/rust.yml create mode 100644 .github/workflows/test.yml diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..f2038b4 --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,21 @@ +# ref: https://docs.codecov.com/docs/codecovyml-reference +coverage: + # Hold ourselves to a high bar + range: 77..100 + round: down + precision: 1 + status: + # ref: https://docs.codecov.com/docs/commit-status + project: + default: + # Avoid false negatives + threshold: 1% + +# Test files aren't important for coverage +ignore: + - "tests" + +# Make comments less noisy +comment: + layout: "files" + require_changes: yes \ No newline at end of file diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml new file mode 100644 index 0000000..b138260 --- /dev/null +++ b/.github/workflows/check.yml @@ -0,0 +1,73 @@ +permissions: + contents: read +on: + push: + branches: [main, master] + pull_request: + merge_group: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +env: + RUST_TOOLCHAIN: stable + +name: Check 
+jobs: + fmt: + runs-on: ubuntu-latest + strategy: + fail-fast: false + name: fmt + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. + contents: write + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + components: rustfmt + - run: cargo fmt --check + + clippy: + runs-on: ubuntu-latest + name: clippy + permissions: + contents: read + checks: write + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install ${{ env.RUST_TOOLCHAIN }} + uses: dtolnay/rust-toolchain@master # master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + components: clippy + - name: Rust Cache + uses: Swatinem/rust-cache@v2 + - run: cargo clippy --workspace --all-features --all-targets -- -D warnings + + typos: + runs-on: ubuntu-latest + name: typos + permissions: + contents: read + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Check spelling + uses: crate-ci/typos@master + + \ No newline at end of file diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml deleted file mode 100644 index ead13e2..0000000 --- a/.github/workflows/rust.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Rust - -on: - push: - branches: ["main"] - pull_request: - branches: ["main"] - -env: - CARGO_TERM_COLOR: always - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Format Check - run: cargo fmt -- --check - - name: Build - run: cargo build --verbose - - name: Run tests - run: cargo test --verbose diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..fc706de --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,67 @@ +permissions: + contents: read +on: + push: + branches: [main, master] + pull_request: + merge_group: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +env: + RUST_TOOLCHAIN: stable + +name: Test +jobs: + required: + runs-on: ubuntu-latest + name: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install ${{ env.RUST_TOOLCHAIN }} + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + - name: cargo generate-lockfile + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + # https://twitter.com/jonhoo/status/1571290371124260865 + - name: Rust Cache + uses: Swatinem/rust-cache@v2 + - name: Install nextest + uses: taiki-e/install-action@nextest + - name: cargo nextest --locked + run: cargo nextest run --locked --workspace --all-features --all-targets + + coverage: + runs-on: ubuntu-latest + name: coverage + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + components: llvm-tools-preview + - name: cargo install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + - name: cargo generate-lockfile + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + - name: Rust Cache + uses: Swatinem/rust-cache@v2 + - name: Install nextest + uses: taiki-e/install-action@nextest + - name: cargo llvm-cov + run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info + - name: Upload to 
codecov.io + uses: codecov/codecov-action@v5 + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} # required \ No newline at end of file From 9f86c82f114136e1c1c1d87e1958c85d163c25f3 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 23 Sep 2025 10:33:50 +0800 Subject: [PATCH 4/9] adjust cov to 55% --- .github/codecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/codecov.yml b/.github/codecov.yml index f2038b4..958c206 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -1,7 +1,7 @@ # ref: https://docs.codecov.com/docs/codecovyml-reference coverage: # Hold ourselves to a high bar - range: 77..100 + range: 55..100 round: down precision: 1 status: From 32f300b63a8f034731af984026e19d6e97324b60 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 23 Sep 2025 10:46:25 +0800 Subject: [PATCH 5/9] add nightly fmt --- .github/workflows/check.yml | 2 +- rustfmt.toml | 14 ++++++++++++++ src/embeddings.rs | 3 ++- src/feed_forward.rs | 6 +++--- src/layer_norm.rs | 7 +++---- src/lib.rs | 3 +-- src/llm.rs | 19 +++++++++---------- src/main.rs | 10 +++++----- src/self_attention.rs | 8 ++++---- src/transformer.rs | 8 ++++---- tests/feed_forward_test.rs | 3 +-- tests/llm_test.rs | 5 +---- tests/output_projection_test.rs | 3 +-- tests/self_attention_test.rs | 3 +-- tests/transformer_test.rs | 3 +-- 15 files changed, 51 insertions(+), 46 deletions(-) create mode 100644 rustfmt.toml diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index b138260..aba9556 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -31,7 +31,7 @@ jobs: - name: Install rust uses: dtolnay/rust-toolchain@master with: - toolchain: ${{ env.RUST_TOOLCHAIN }} + toolchain: nightly #${{ env.RUST_TOOLCHAIN }} components: rustfmt - run: cargo fmt --check diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..d85c165 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,14 @@ +edition = "2024" +style_edition = "2024" +comment_width = 120 +format_code_in_doc_comments = true +format_macro_bodies = true +format_macro_matchers = true +normalize_comments = true +normalize_doc_attributes = true +imports_granularity = "Crate" +group_imports = "StdExternalCrate" +reorder_impl_items = true +reorder_imports = true +tab_spaces = 4 +wrap_comments = true diff --git a/src/embeddings.rs b/src/embeddings.rs index 65f5f7e..f0aff98 100644 --- a/src/embeddings.rs +++ b/src/embeddings.rs @@ -1,7 +1,8 @@ -use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab}; use ndarray::{Array2, s}; use rand_distr::{Distribution, Normal}; +use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab}; + pub struct Embeddings { pub token_embeddings: Array2, pub positional_embeddings: Array2, diff --git a/src/feed_forward.rs b/src/feed_forward.rs index dd74c94..ae20d10 100644 --- a/src/feed_forward.rs +++ b/src/feed_forward.rs @@ -1,8 +1,8 @@ -use crate::{adam::Adam, llm::Layer}; -use ndarray::Array2; -use ndarray::Axis; +use ndarray::{Array2, Axis}; use rand_distr::{Distribution, Normal}; +use crate::{adam::Adam, llm::Layer}; + pub struct FeedForward { w1: Array2, b1: Array2, diff --git a/src/layer_norm.rs b/src/layer_norm.rs index f326641..b73a7d7 100644 --- a/src/layer_norm.rs +++ b/src/layer_norm.rs @@ -1,7 +1,6 @@ -use crate::adam::Adam; -use crate::llm::Layer; -use ndarray::Array2; -use ndarray::Axis; +use ndarray::{Array2, Axis}; + +use crate::{adam::Adam, llm::Layer}; pub struct LayerNorm { epsilon: f32, // Small constant for 
stability diff --git a/src/lib.rs b/src/lib.rs index a62f008..80769b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,8 +9,7 @@ pub mod transformer; pub mod vocab; // Re-export key structs for easier access pub use embeddings::Embeddings; -pub use llm::LLM; -pub use llm::Layer; +pub use llm::{LLM, Layer}; pub use vocab::Vocab; // Constants diff --git a/src/llm.rs b/src/llm.rs index 7ddfaae..d49fb25 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -1,13 +1,11 @@ -use crate::EMBEDDING_DIM; -use crate::Embeddings; -use crate::HIDDEN_DIM; -use crate::MAX_SEQ_LEN; -use crate::Vocab; -use crate::output_projection::OutputProjection; -use crate::transformer::TransformerBlock; -use ndarray::Array1; -use ndarray::{Array2, Axis}; use std::cmp::Ordering; + +use ndarray::{Array1, Array2, Axis}; + +use crate::{ + EMBEDDING_DIM, Embeddings, HIDDEN_DIM, MAX_SEQ_LEN, Vocab, output_projection::OutputProjection, + transformer::TransformerBlock, +}; pub trait Layer { fn layer_type(&self) -> &str; @@ -117,7 +115,8 @@ impl LLM { .to_owned() .insert_axis(Axis(0)); - // Softmax - convert activiations of each token to a probability distribution over the vocabulary + // Softmax - convert activiations of each token to a probability distribution over the + // vocabulary let probs = Self::softmax(&last_logit); // 1 x vocab_size // Greedy Decode - Choose the highest probability token for each position diff --git a/src/main.rs b/src/main.rs index 6e64144..cf5297a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,12 @@ use std::io::Write; -use crate::embeddings::Embeddings; -use crate::llm::LLM; -use crate::output_projection::OutputProjection; -use crate::transformer::TransformerBlock; -use crate::vocab::Vocab; use ::llm::{EMBEDDING_DIM, HIDDEN_DIM, MAX_SEQ_LEN}; +use crate::{ + embeddings::Embeddings, llm::LLM, output_projection::OutputProjection, + transformer::TransformerBlock, vocab::Vocab, +}; + mod adam; mod embeddings; mod feed_forward; diff --git a/src/self_attention.rs b/src/self_attention.rs index b96512f..4310cf7 100644 --- a/src/self_attention.rs +++ b/src/self_attention.rs @@ -1,9 +1,9 @@ -use crate::EMBEDDING_DIM; -use crate::adam::Adam; -use crate::llm::Layer; +use std::f32; + use ndarray::Array2; use rand_distr::{Distribution, Normal}; -use std::f32; + +use crate::{EMBEDDING_DIM, adam::Adam, llm::Layer}; pub struct SelfAttention { pub embedding_dim: usize, diff --git a/src/transformer.rs b/src/transformer.rs index 0beef93..e91b59f 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -1,8 +1,8 @@ -use crate::feed_forward::FeedForward; -use crate::layer_norm::LayerNorm; -use crate::llm::Layer; -use crate::self_attention::SelfAttention; use ndarray::Array2; + +use crate::{ + feed_forward::FeedForward, layer_norm::LayerNorm, llm::Layer, self_attention::SelfAttention, +}; pub struct TransformerBlock { attention: SelfAttention, feed_forward: FeedForward, diff --git a/tests/feed_forward_test.rs b/tests/feed_forward_test.rs index 922239c..c651fb6 100644 --- a/tests/feed_forward_test.rs +++ b/tests/feed_forward_test.rs @@ -1,5 +1,4 @@ -use llm::feed_forward::FeedForward; -use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer, feed_forward::FeedForward}; use ndarray::Array2; #[test] diff --git a/tests/llm_test.rs b/tests/llm_test.rs index 4abb749..5c88b78 100644 --- a/tests/llm_test.rs +++ b/tests/llm_test.rs @@ -1,7 +1,4 @@ -use llm::EMBEDDING_DIM; -use llm::Embeddings; -use llm::output_projection::OutputProjection; -use llm::{LLM, Layer, Vocab}; +use llm::{EMBEDDING_DIM, 
Embeddings, LLM, Layer, Vocab, output_projection::OutputProjection}; use ndarray::Array2; struct TestOutputProjectionLayer { diff --git a/tests/output_projection_test.rs b/tests/output_projection_test.rs index a0f14c1..5b467ad 100644 --- a/tests/output_projection_test.rs +++ b/tests/output_projection_test.rs @@ -1,5 +1,4 @@ -use llm::output_projection::OutputProjection; -use llm::{EMBEDDING_DIM, Layer}; +use llm::{EMBEDDING_DIM, Layer, output_projection::OutputProjection}; use ndarray::Array2; #[test] diff --git a/tests/self_attention_test.rs b/tests/self_attention_test.rs index 009c7e4..4e1e5ff 100644 --- a/tests/self_attention_test.rs +++ b/tests/self_attention_test.rs @@ -1,5 +1,4 @@ -use llm::self_attention::SelfAttention; -use llm::{EMBEDDING_DIM, Layer}; +use llm::{EMBEDDING_DIM, Layer, self_attention::SelfAttention}; use ndarray::Array2; #[test] diff --git a/tests/transformer_test.rs b/tests/transformer_test.rs index c198915..0fa49d1 100644 --- a/tests/transformer_test.rs +++ b/tests/transformer_test.rs @@ -1,5 +1,4 @@ -use llm::transformer::TransformerBlock; -use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer}; +use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer, transformer::TransformerBlock}; use ndarray::Array2; #[test] From fcc6a4a15586202aaca04ee7e303aaeb2b157600 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Fri, 26 Sep 2025 10:20:56 +0800 Subject: [PATCH 6/9] chore: fix lints --- Cargo.lock | 20 ++++++++++++++++++-- src/dataset_loader.rs | 5 +++-- src/layer_norm.rs | 4 ++-- src/vocab.rs | 9 +++++---- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 146c406..4219d18 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -343,10 +343,15 @@ version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "byteorder", - "zerocopy-derive", + "wit-bindgen", ] +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + [[package]] name = "zerocopy" version = "0.8.27" @@ -355,3 +360,14 @@ checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] + +[[package]] +name = "zerocopy-derive" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/src/dataset_loader.rs b/src/dataset_loader.rs index 0c63eb2..bb0192a 100644 --- a/src/dataset_loader.rs +++ b/src/dataset_loader.rs @@ -1,13 +1,14 @@ -use csv::ReaderBuilder; -use serde_json; use std::fs; +use csv::ReaderBuilder; + pub struct Dataset { pub pretraining_data: Vec, pub chat_training_data: Vec, } #[allow(dead_code)] +#[allow(clippy::upper_case_acronyms)] pub enum DatasetType { JSON, CSV, diff --git a/src/layer_norm.rs b/src/layer_norm.rs index c2e269a..b73a7d7 100644 --- a/src/layer_norm.rs +++ b/src/layer_norm.rs @@ -1,7 +1,7 @@ -use crate::adam::Adam; -use crate::llm::Layer; use ndarray::{Array2, Axis}; +use crate::{adam::Adam, llm::Layer}; + pub struct LayerNorm { epsilon: f32, // Small constant for stability gamma: Array2, // Learnable scaling parameter diff --git a/src/vocab.rs b/src/vocab.rs index 81b7b0e..b28e86a 100644 --- a/src/vocab.rs +++ b/src/vocab.rs @@ -1,6 +1,7 @@ -use bincode::Encode; use 
std::collections::HashMap; +use bincode::Encode; + #[derive(Clone, Encode)] pub struct Vocab { pub encode: HashMap, @@ -48,10 +49,10 @@ impl Vocab { } } -impl Into for Vocab { - fn into(self) -> String { +impl From for String { + fn from(val: Vocab) -> Self { String::from_iter( - self.words + val.words .iter() .enumerate() .map(|(i, str)| format!("({i},{str}),")), From 2ec33c705d312b2768320a6d9c7349bd6064fe95 Mon Sep 17 00:00:00 2001 From: ben1009 Date: Tue, 30 Sep 2025 09:22:37 +0800 Subject: [PATCH 7/9] chore: fix lints --- src/llm.rs | 2 +- tests/llm_test.rs | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llm.rs b/src/llm.rs index dbaa1bc..4613f93 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -56,7 +56,7 @@ impl LLM { // Sum the parameters across all layers in the network self.network .iter() - .map(|layer: &Box| layer.parameters()) + .map(|layer| layer.parameters()) .sum::() } diff --git a/tests/llm_test.rs b/tests/llm_test.rs index c32e518..1e2fec4 100644 --- a/tests/llm_test.rs +++ b/tests/llm_test.rs @@ -1,5 +1,7 @@ -use llm::{EMBEDDING_DIM, Embeddings, LLM, Layer, Vocab, output_projection::OutputProjection}; -use llm::{LLM, Layer, Vocab}; +use llm::{ + EMBEDDING_DIM, Embeddings, HIDDEN_DIM, LLM, Layer, MAX_SEQ_LEN, Vocab, + output_projection::OutputProjection, transformer::TransformerBlock, +}; use ndarray::Array2; struct TestOutputProjectionLayer { @@ -153,7 +155,8 @@ fn test_llm_total_parameters() { let param_count = llm.total_parameters(); assert!(param_count > 0); - // Let's validate that this is equal to the expected total number of parameters. (based on our source) + // Let's validate that this is equal to the expected total number of parameters. (based on our + // source) let expected_embeddings_parameters = vocab_size * EMBEDDING_DIM + MAX_SEQ_LEN * EMBEDDING_DIM; let expected_transformer_block_parameters = (2 * EMBEDDING_DIM) + // LayerNorm (3 * EMBEDDING_DIM * EMBEDDING_DIM) + // SelfAttention From 4f407f4062b590c961ba1aba310a6eaca0db9f3c Mon Sep 17 00:00:00 2001 From: ben1009 Date: Sat, 4 Oct 2025 12:21:51 +0800 Subject: [PATCH 8/9] comment out the cov gha --- .github/codecov.yml | 39 +++++++++++++------------- .github/workflows/test.yml | 57 +++++++++++++++++++------------------- src/llm.rs | 2 +- 3 files changed, 50 insertions(+), 48 deletions(-) diff --git a/.github/codecov.yml b/.github/codecov.yml index 958c206..f8e75a2 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -1,21 +1,22 @@ -# ref: https://docs.codecov.com/docs/codecovyml-reference -coverage: - # Hold ourselves to a high bar - range: 55..100 - round: down - precision: 1 - status: - # ref: https://docs.codecov.com/docs/commit-status - project: - default: - # Avoid false negatives - threshold: 1% +# # ref: https://docs.codecov.com/docs/codecovyml-reference +# comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174 +# coverage: +# # Hold ourselves to a high bar +# range: 55..100 +# round: down +# precision: 1 +# status: +# # ref: https://docs.codecov.com/docs/commit-status +# project: +# default: +# # Avoid false negatives +# threshold: 1% -# Test files aren't important for coverage -ignore: - - "tests" +# # Test files aren't important for coverage +# ignore: +# - "tests" -# Make comments less noisy -comment: - layout: "files" - require_changes: yes \ No newline at end of file +# # Make comments less noisy +# comment: +# layout: "files" +# require_changes: yes \ No newline at end of file diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index fc706de..da07418 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,31 +37,32 @@ jobs: - name: cargo nextest --locked run: cargo nextest run --locked --workspace --all-features --all-targets - coverage: - runs-on: ubuntu-latest - name: coverage - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: Install rust - uses: dtolnay/rust-toolchain@master - with: - toolchain: ${{ env.RUST_TOOLCHAIN }} - components: llvm-tools-preview - - name: cargo install cargo-llvm-cov - uses: taiki-e/install-action@cargo-llvm-cov - - name: cargo generate-lockfile - if: hashFiles('Cargo.lock') == '' - run: cargo generate-lockfile - - name: Rust Cache - uses: Swatinem/rust-cache@v2 - - name: Install nextest - uses: taiki-e/install-action@nextest - - name: cargo llvm-cov - run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info - - name: Upload to codecov.io - uses: codecov/codecov-action@v5 - with: - fail_ci_if_error: true - token: ${{ secrets.CODECOV_TOKEN }} # required \ No newline at end of file + # comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174 + # coverage: + # runs-on: ubuntu-latest + # name: coverage + # steps: + # - uses: actions/checkout@v4 + # with: + # submodules: true + # - name: Install rust + # uses: dtolnay/rust-toolchain@master + # with: + # toolchain: ${{ env.RUST_TOOLCHAIN }} + # components: llvm-tools-preview + # - name: cargo install cargo-llvm-cov + # uses: taiki-e/install-action@cargo-llvm-cov + # - name: cargo generate-lockfile + # if: hashFiles('Cargo.lock') == '' + # run: cargo generate-lockfile + # - name: Rust Cache + # uses: Swatinem/rust-cache@v2 + # - name: Install nextest + # uses: taiki-e/install-action@nextest + # - name: cargo llvm-cov + # run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info + # - name: Upload to codecov.io + # uses: codecov/codecov-action@v5 + # with: + # fail_ci_if_error: true + # token: ${{ secrets.CODECOV_TOKEN }} # required \ No newline at end of file diff --git a/src/llm.rs b/src/llm.rs index 4613f93..d0d6688 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -125,7 +125,7 @@ impl LLM { .to_owned() .insert_axis(Axis(0)); - // Softmax - convert activiations of each token to a probability distribution over the + // Softmax - convert activations of each token to a probability distribution over the // vocabulary let probs = Self::softmax(&last_logit); // 1 x vocab_size From 08613771c25467837b4daaef290288fb8cf12b9c Mon Sep 17 00:00:00 2001 From: ben1009 Date: Sun, 5 Oct 2025 18:42:05 +0800 Subject: [PATCH 9/9] update --- src/vocab.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/vocab.rs b/src/vocab.rs index b28e86a..ced340b 100644 --- a/src/vocab.rs +++ b/src/vocab.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use bincode::Encode; @@ -47,6 +47,34 @@ impl Vocab { pub fn default_words() -> Vec<&'static str> { vec!["hello", "world", "this", "is", "rust", ""] } + + /// Process text data to extract vocabulary words and add them to the vocabulary set + pub fn process_text_for_vocab(texts: &[String], vocab_set: &mut HashSet) { + // Add end of sequence token + vocab_set.insert("".to_string()); + + // Process all training examples for vocabulary + for text in texts { + for word in 
text.split_whitespace() { + // Handle punctuation by splitting it from words + let mut current = String::new(); + for c in word.chars() { + if c.is_ascii_punctuation() { + if !current.is_empty() { + vocab_set.insert(current.clone()); + current.clear(); + } + vocab_set.insert(c.to_string()); + } else { + current.push(c); + } + } + if !current.is_empty() { + vocab_set.insert(current); + } + } + } + } } impl From for String {
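A quick usage sketch for the new Vocab::process_text_for_vocab helper, in case it helps review: the corpus strings below are hypothetical, and the assertions only restate what the loop in the patch already does (split on whitespace, with ASCII punctuation broken out into its own tokens). It assumes the helper is reached through the re-exported llm::Vocab, as in tests/vocab_test.rs.

    use std::collections::HashSet;

    use llm::Vocab;

    fn main() {
        // Hypothetical corpus; only the call pattern matters here.
        let texts = vec![
            "Hello, world!".to_string(),
            "This is Rust.".to_string(),
        ];

        let mut vocab_set = HashSet::new();
        Vocab::process_text_for_vocab(&texts, &mut vocab_set);

        // Punctuation is split off into separate tokens, so "Hello," is
        // stored as "Hello" plus ",".
        assert!(vocab_set.contains("Hello"));
        assert!(vocab_set.contains(","));
        assert!(vocab_set.contains("Rust"));
        assert!(!vocab_set.contains("Hello,"));
    }

Taking a &mut HashSet<String> instead of returning a fresh set also lets callers accumulate words from several corpora before freezing the final Vocab, which appears to be the intent of the signature.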