diff --git a/.github/codecov.yml b/.github/codecov.yml
new file mode 100644
index 0000000..f8e75a2
--- /dev/null
+++ b/.github/codecov.yml
@@ -0,0 +1,22 @@
+# # ref: https://docs.codecov.com/docs/codecovyml-reference
+# comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174
+# coverage:
+#   # Hold ourselves to a high bar
+#   range: 55..100
+#   round: down
+#   precision: 1
+#   status:
+#     # ref: https://docs.codecov.com/docs/commit-status
+#     project:
+#       default:
+#         # Avoid false negatives
+#         threshold: 1%
+
+# # Test files aren't important for coverage
+# ignore:
+#   - "tests"
+
+# # Make comments less noisy
+# comment:
+#   layout: "files"
+#   require_changes: yes
\ No newline at end of file
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
new file mode 100644
index 0000000..aba9556
--- /dev/null
+++ b/.github/workflows/check.yml
@@ -0,0 +1,73 @@
+permissions:
+  contents: read
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+  merge_group:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  RUST_TOOLCHAIN: stable
+
+name: Check
+jobs:
+  fmt:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    name: fmt
+    permissions:
+      # Give the default GITHUB_TOKEN write permission to commit and push the
+      # added or changed files to the repository.
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: nightly #${{ env.RUST_TOOLCHAIN }}
+          components: rustfmt
+      - run: cargo fmt --check
+
+  clippy:
+    runs-on: ubuntu-latest
+    name: clippy
+    permissions:
+      contents: read
+      checks: write
+    strategy:
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Install ${{ env.RUST_TOOLCHAIN }}
+        uses: dtolnay/rust-toolchain@master # master
+        with:
+          toolchain: ${{ env.RUST_TOOLCHAIN }}
+          components: clippy
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@v2
+      - run: cargo clippy --workspace --all-features --all-targets -- -D warnings
+
+  typos:
+    runs-on: ubuntu-latest
+    name: typos
+    permissions:
+      contents: read
+    strategy:
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Check spelling
+        uses: crate-ci/typos@master
+
+
\ No newline at end of file
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
deleted file mode 100644
index ead13e2..0000000
--- a/.github/workflows/rust.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Rust
-
-on:
-  push:
-    branches: ["main"]
-  pull_request:
-    branches: ["main"]
-
-env:
-  CARGO_TERM_COLOR: always
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Format Check
-        run: cargo fmt -- --check
-      - name: Build
-        run: cargo build --verbose
-      - name: Run tests
-        run: cargo test --verbose
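The new `check.yml` turns every clippy warning into a hard failure (`-D warnings`), which is why the Rust changes further down add explicit `#[allow(...)]` attributes instead of renaming public items. A minimal sketch of that trade-off, reusing the `DatasetType` enum that `src/dataset_loader.rs` annotates (the `main` function is only there so the snippet compiles on its own):

```rust
// Under `cargo clippy -- -D warnings`, the all-caps variants below would fail
// the build via `clippy::upper_case_acronyms`; the attribute opts out locally
// rather than renaming the variants to `Json`/`Csv`.
#[allow(dead_code)]
#[allow(clippy::upper_case_acronyms)]
pub enum DatasetType {
    JSON,
    CSV,
}

fn main() {
    let _kind = DatasetType::JSON;
}
```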
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..da07418
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,68 @@
+permissions:
+  contents: read
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+  merge_group:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  RUST_TOOLCHAIN: stable
+
+name: Test
+jobs:
+  required:
+    runs-on: ubuntu-latest
+    name: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Install ${{ env.RUST_TOOLCHAIN }}
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_TOOLCHAIN }}
+      - name: cargo generate-lockfile
+        if: hashFiles('Cargo.lock') == ''
+        run: cargo generate-lockfile
+      # https://twitter.com/jonhoo/status/1571290371124260865
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@v2
+      - name: Install nextest
+        uses: taiki-e/install-action@nextest
+      - name: cargo nextest --locked
+        run: cargo nextest run --locked --workspace --all-features --all-targets
+
+  # comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174
+  # coverage:
+  #   runs-on: ubuntu-latest
+  #   name: coverage
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #       with:
+  #         submodules: true
+  #     - name: Install rust
+  #       uses: dtolnay/rust-toolchain@master
+  #       with:
+  #         toolchain: ${{ env.RUST_TOOLCHAIN }}
+  #         components: llvm-tools-preview
+  #     - name: cargo install cargo-llvm-cov
+  #       uses: taiki-e/install-action@cargo-llvm-cov
+  #     - name: cargo generate-lockfile
+  #       if: hashFiles('Cargo.lock') == ''
+  #       run: cargo generate-lockfile
+  #     - name: Rust Cache
+  #       uses: Swatinem/rust-cache@v2
+  #     - name: Install nextest
+  #       uses: taiki-e/install-action@nextest
+  #     - name: cargo llvm-cov
+  #       run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info
+  #     - name: Upload to codecov.io
+  #       uses: codecov/codecov-action@v5
+  #       with:
+  #         fail_ci_if_error: true
+  #         token: ${{ secrets.CODECOV_TOKEN }} # required
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index 3ad229c..f88cc2c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2024"
 [dependencies]
 bincode = "2.0.1"
 ndarray = "0.16.1"
-rand = "0.9.0"
+rand = "0.9.2"
 rand_distr = "0.5.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..d85c165
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,14 @@
+edition = "2024"
+style_edition = "2024"
+comment_width = 120
+format_code_in_doc_comments = true
+format_macro_bodies = true
+format_macro_matchers = true
+normalize_comments = true
+normalize_doc_attributes = true
+imports_granularity = "Crate"
+group_imports = "StdExternalCrate"
+reorder_impl_items = true
+reorder_imports = true
+tab_spaces = 4
+wrap_comments = true
diff --git a/src/dataset_loader.rs b/src/dataset_loader.rs
index 0c63eb2..bb0192a 100644
--- a/src/dataset_loader.rs
+++ b/src/dataset_loader.rs
@@ -1,13 +1,14 @@
-use csv::ReaderBuilder;
-use serde_json;
 use std::fs;
 
+use csv::ReaderBuilder;
+
 pub struct Dataset {
     pub pretraining_data: Vec<String>,
     pub chat_training_data: Vec<String>,
 }
 
 #[allow(dead_code)]
+#[allow(clippy::upper_case_acronyms)]
 pub enum DatasetType {
     JSON,
     CSV,
diff --git a/src/embeddings.rs b/src/embeddings.rs
index 1d36685..72d8a6d 100644
--- a/src/embeddings.rs
+++ b/src/embeddings.rs
@@ -1,7 +1,8 @@
-use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab};
 use ndarray::{Array2, s};
 use rand_distr::{Distribution, Normal};
 
+use crate::{EMBEDDING_DIM, MAX_SEQ_LEN, adam::Adam, llm::Layer, vocab::Vocab};
+
 pub struct Embeddings {
     pub token_embeddings: Array2<f32>,
     pub positional_embeddings: Array2<f32>,
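Most of the Rust hunks below are simply this `rustfmt.toml` being applied: `group_imports = "StdExternalCrate"` splits imports into std / external / `crate::` groups, and `imports_granularity = "Crate"` merges each crate's paths into one `use`. Both options are unstable rustfmt settings, which is presumably why the `fmt` job above pins the nightly toolchain. A self-contained sketch of the resulting layout (the `util` module and its contents are made up; `rand` is already a dependency of this crate):

```rust
// Layout produced by `group_imports = "StdExternalCrate"` plus
// `imports_granularity = "Crate"`: std first, then external crates, then
// crate-local imports, one merged `use` per crate, blank line between groups.
use std::collections::HashMap;

use rand::Rng;

use crate::util::label;

mod util {
    pub fn label() -> &'static str {
        "tokens"
    }
}

fn main() {
    let mut counts: HashMap<&str, u32> = HashMap::new();
    counts.insert(label(), rand::rng().random_range(0..10));
    println!("{counts:?}");
}
```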
diff --git a/src/feed_forward.rs b/src/feed_forward.rs
index db08438..2048f39 100644
--- a/src/feed_forward.rs
+++ b/src/feed_forward.rs
@@ -1,8 +1,8 @@
-use crate::{adam::Adam, llm::Layer};
-use ndarray::Array2;
-use ndarray::Axis;
+use ndarray::{Array2, Axis};
 use rand_distr::{Distribution, Normal};
 
+use crate::{adam::Adam, llm::Layer};
+
 pub struct FeedForward {
     w1: Array2<f32>,
     b1: Array2<f32>,
diff --git a/src/layer_norm.rs b/src/layer_norm.rs
index 895a6de..ff488d4 100644
--- a/src/layer_norm.rs
+++ b/src/layer_norm.rs
@@ -1,7 +1,7 @@
-use crate::adam::Adam;
-use crate::llm::Layer;
 use ndarray::{Array2, Axis};
 
+use crate::{adam::Adam, llm::Layer};
+
 pub struct LayerNorm {
     epsilon: f32,       // Small constant for stability
     gamma: Array2<f32>, // Learnable scaling parameter
diff --git a/src/llm.rs b/src/llm.rs
index c2a3ee9..d0d6688 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -1,13 +1,11 @@
-use crate::EMBEDDING_DIM;
-use crate::Embeddings;
-use crate::HIDDEN_DIM;
-use crate::MAX_SEQ_LEN;
-use crate::Vocab;
-use crate::output_projection::OutputProjection;
-use crate::transformer::TransformerBlock;
-use ndarray::{Array1, Array2, Axis};
 use std::cmp::Ordering;
 
+use ndarray::{Array1, Array2, Axis};
+
+use crate::{
+    EMBEDDING_DIM, Embeddings, HIDDEN_DIM, MAX_SEQ_LEN, Vocab, output_projection::OutputProjection,
+    transformer::TransformerBlock,
+};
+
 pub trait Layer {
     fn layer_type(&self) -> &str;
@@ -18,6 +16,7 @@ pub trait Layer {
     fn parameters(&self) -> usize;
 }
 
+#[allow(clippy::upper_case_acronyms)]
 pub struct LLM {
     pub vocab: Vocab,
     pub network: Vec<Box<dyn Layer>>,
@@ -57,7 +56,7 @@ impl LLM {
         // Sum the parameters across all layers in the network
         self.network
             .iter()
-            .map(|layer: &Box<dyn Layer>| layer.parameters())
+            .map(|layer| layer.parameters())
            .sum::<usize>()
     }
 
@@ -126,7 +125,8 @@ impl LLM {
             .to_owned()
             .insert_axis(Axis(0));
 
-        // Softmax - convert activiations of each token to a probability distribution over the vocabulary
+        // Softmax - convert activations of each token to a probability distribution over the
+        // vocabulary
         let probs = Self::softmax(&last_logit); // 1 x vocab_size
 
         // Greedy Decode - Choose the highest probability token for each position
@@ -238,10 +238,10 @@ impl LLM {
         }
 
         // Add any remaining word
-        if !current_word.is_empty() {
-            if let Some(token_id) = self.vocab.encode(&current_word) {
-                tokens.push(token_id);
-            }
+        if !current_word.is_empty()
+            && let Some(token_id) = self.vocab.encode(&current_word)
+        {
+            tokens.push(token_id);
         }
diff --git a/src/main.rs b/src/main.rs
index 0c9712b..5babf3c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,13 +1,13 @@
 use std::io::Write;
 
-use crate::embeddings::Embeddings;
-use crate::llm::LLM;
-use crate::output_projection::OutputProjection;
-use crate::transformer::TransformerBlock;
-use crate::vocab::Vocab;
 use ::llm::{EMBEDDING_DIM, HIDDEN_DIM, MAX_SEQ_LEN};
 use dataset_loader::{Dataset, DatasetType};
 
+use crate::{
+    embeddings::Embeddings, llm::LLM, output_projection::OutputProjection,
+    transformer::TransformerBlock, vocab::Vocab,
+};
+
 mod adam;
 mod dataset_loader;
 mod embeddings;
diff --git a/src/self_attention.rs b/src/self_attention.rs
index 6252522..2e31324 100644
--- a/src/self_attention.rs
+++ b/src/self_attention.rs
@@ -1,9 +1,9 @@
-use crate::EMBEDDING_DIM;
-use crate::adam::Adam;
-use crate::llm::Layer;
+use std::f32;
+
 use ndarray::Array2;
 use rand_distr::{Distribution, Normal};
-use std::f32;
+
+use crate::{EMBEDDING_DIM, adam::Adam, llm::Layer};
 
 pub struct SelfAttention {
     pub embedding_dim: usize,
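One change in `src/llm.rs` that is more than import shuffling is the tokenizer tail: the nested `if` + `if let` around the last word is collapsed into a single `if ... && let ...` chain, which needs the let-chains syntax available on edition 2024 (the edition this crate already targets). A standalone sketch of the same shape, with a closure standing in for `self.vocab.encode`:

```rust
use std::collections::HashMap;

fn main() {
    // Stand-in for the Vocab lookup used in src/llm.rs.
    let vocab: HashMap<&str, usize> = HashMap::from([("hello", 0), ("world", 1)]);
    let encode = |word: &str| vocab.get(word).copied();

    let current_word = String::from("hello");
    let mut tokens: Vec<usize> = Vec::new();

    // Before: `if !current_word.is_empty() { if let Some(id) = encode(..) { .. } }`.
    // After: one let-chain with identical behavior and one less nesting level.
    if !current_word.is_empty()
        && let Some(token_id) = encode(&current_word)
    {
        tokens.push(token_id);
    }

    assert_eq!(tokens, vec![0]);
}
```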
diff --git a/src/transformer.rs b/src/transformer.rs
index 4795f25..e700c8c 100644
--- a/src/transformer.rs
+++ b/src/transformer.rs
@@ -1,8 +1,8 @@
-use crate::feed_forward::FeedForward;
-use crate::layer_norm::LayerNorm;
-use crate::llm::Layer;
-use crate::self_attention::SelfAttention;
 use ndarray::Array2;
+
+use crate::{
+    feed_forward::FeedForward, layer_norm::LayerNorm, llm::Layer, self_attention::SelfAttention,
+};
 
 pub struct TransformerBlock {
     attention: SelfAttention,
     feed_forward: FeedForward,
@@ -32,9 +32,8 @@ impl Layer for TransformerBlock {
         let norm1_out = self.norm1.normalize(&attention_out);
 
         let feed_forward_out = self.feed_forward.forward(&norm1_out); // includes residual
-        let norm2_out = self.norm2.normalize(&feed_forward_out);
 
-        norm2_out
+        self.norm2.normalize(&feed_forward_out)
     }
 
     fn backward(&mut self, grads: &Array2<f32>, lr: f32) -> Array2<f32> {
@@ -48,9 +47,8 @@
         let grad_norm1 = self.norm1.backward(&grad_ffn, lr);
 
         // Backward through attention (includes residual connection)
-        let grad_attn = self.attention.backward(&grad_norm1, lr);
 
-        grad_attn
+        self.attention.backward(&grad_norm1, lr)
     }
 
     fn parameters(&self) -> usize {
diff --git a/src/vocab.rs b/src/vocab.rs
index 448d761..ced340b 100644
--- a/src/vocab.rs
+++ b/src/vocab.rs
@@ -1,6 +1,6 @@
+use std::collections::{HashMap, HashSet};
+
 use bincode::Encode;
-use std::collections::HashMap;
-use std::collections::HashSet;
 
 #[derive(Clone, Encode)]
 pub struct Vocab {
@@ -77,10 +77,10 @@
     }
 }
 
-impl Into<String> for Vocab {
-    fn into(self) -> String {
+impl From<Vocab> for String {
+    fn from(val: Vocab) -> Self {
         String::from_iter(
-            self.words
+            val.words
                 .iter()
                 .enumerate()
                 .map(|(i, str)| format!("({i},{str}),")),
diff --git a/tests/feed_forward_test.rs b/tests/feed_forward_test.rs
index 922239c..c651fb6 100644
--- a/tests/feed_forward_test.rs
+++ b/tests/feed_forward_test.rs
@@ -1,5 +1,4 @@
-use llm::feed_forward::FeedForward;
-use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer};
+use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer, feed_forward::FeedForward};
 use ndarray::Array2;
 
 #[test]
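`src/vocab.rs` switches from implementing `Into<String>` to implementing `From<Vocab> for String`, the direction clippy's `from_over_into` lint prefers: the standard library's blanket impl then provides `Into` for free, so callers lose nothing. A self-contained sketch of the pattern with a stand-in type (`Tag` and its field are made up; the real impl builds the string from `Vocab::words` in the same way):

```rust
// Implementing `From<T> for String` also gives you `T: Into<String>` through
// the blanket `impl<T, U> Into<U> for T where U: From<T>` in core.
struct Tag {
    words: Vec<String>,
}

impl From<Tag> for String {
    fn from(val: Tag) -> Self {
        val.words
            .iter()
            .enumerate()
            .map(|(i, word)| format!("({i},{word}),"))
            .collect()
    }
}

fn main() {
    let a = Tag { words: vec!["hello".into(), "world".into()] };
    let b = Tag { words: vec!["rust".into()] };

    let from_direction: String = String::from(a);
    let into_direction: String = b.into(); // provided by the blanket impl
    println!("{from_direction} {into_direction}");
}
```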
diff --git a/tests/llm_test.rs b/tests/llm_test.rs
index 937b12b..1e2fec4 100644
--- a/tests/llm_test.rs
+++ b/tests/llm_test.rs
@@ -1,10 +1,7 @@
-use llm::EMBEDDING_DIM;
-use llm::Embeddings;
-use llm::HIDDEN_DIM;
-use llm::MAX_SEQ_LEN;
-use llm::output_projection::OutputProjection;
-use llm::transformer::TransformerBlock;
-use llm::{LLM, Layer, Vocab};
+use llm::{
+    EMBEDDING_DIM, Embeddings, HIDDEN_DIM, LLM, Layer, MAX_SEQ_LEN, Vocab,
+    output_projection::OutputProjection, transformer::TransformerBlock,
+};
 use ndarray::Array2;
 
 struct TestOutputProjectionLayer {
@@ -46,7 +43,7 @@ impl Layer for TestOutputProjectionLayer {
         let grad_input = input.dot(grads);
         self.cached_grads = Some(grad_input.clone());
 
-        return grad_input;
+        grad_input
     }
 
     fn parameters(&self) -> usize {
@@ -158,7 +155,8 @@ fn test_llm_total_parameters() {
     let param_count = llm.total_parameters();
     assert!(param_count > 0);
 
-    // Let's validate that this is equal to the expected total number of parameters. (based on our source)
+    // Let's validate that this is equal to the expected total number of parameters. (based on our
+    // source)
     let expected_embeddings_parameters = vocab_size * EMBEDDING_DIM + MAX_SEQ_LEN * EMBEDDING_DIM;
     let expected_transformer_block_parameters = (2 * EMBEDDING_DIM) + // LayerNorm
         (3 * EMBEDDING_DIM * EMBEDDING_DIM) + // SelfAttention
diff --git a/tests/output_projection_test.rs b/tests/output_projection_test.rs
index a0f14c1..5b467ad 100644
--- a/tests/output_projection_test.rs
+++ b/tests/output_projection_test.rs
@@ -1,5 +1,4 @@
-use llm::output_projection::OutputProjection;
-use llm::{EMBEDDING_DIM, Layer};
+use llm::{EMBEDDING_DIM, Layer, output_projection::OutputProjection};
 use ndarray::Array2;
 
 #[test]
diff --git a/tests/self_attention_test.rs b/tests/self_attention_test.rs
index 009c7e4..4e1e5ff 100644
--- a/tests/self_attention_test.rs
+++ b/tests/self_attention_test.rs
@@ -1,5 +1,4 @@
-use llm::self_attention::SelfAttention;
-use llm::{EMBEDDING_DIM, Layer};
+use llm::{EMBEDDING_DIM, Layer, self_attention::SelfAttention};
 use ndarray::Array2;
 
 #[test]
diff --git a/tests/transformer_test.rs b/tests/transformer_test.rs
index c198915..0fa49d1 100644
--- a/tests/transformer_test.rs
+++ b/tests/transformer_test.rs
@@ -1,5 +1,4 @@
-use llm::transformer::TransformerBlock;
-use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer};
+use llm::{EMBEDDING_DIM, HIDDEN_DIM, Layer, transformer::TransformerBlock};
 use ndarray::Array2;
 
 #[test]
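The `test_llm_total_parameters` hunk above only re-wraps a comment, but it sits next to the bookkeeping the test actually checks: every layer reports its own parameter count and `LLM::total_parameters` sums them over `Vec<Box<dyn Layer>>` (the closure annotation dropped in `src/llm.rs` was redundant). A stripped-down sketch of that sum; only the `parameters` method of the real `Layer` trait is reproduced, and the struct names and dimensions here are invented for illustration:

```rust
// Each layer knows its own parameter count; the model just sums trait objects.
trait Layer {
    fn parameters(&self) -> usize;
}

struct EmbeddingTable { vocab_size: usize, dim: usize, max_seq_len: usize }
struct Norm { dim: usize }

impl Layer for EmbeddingTable {
    fn parameters(&self) -> usize {
        // token table + positional table, mirroring expected_embeddings_parameters
        self.vocab_size * self.dim + self.max_seq_len * self.dim
    }
}

impl Layer for Norm {
    fn parameters(&self) -> usize {
        2 * self.dim // scale + shift: the `2 * EMBEDDING_DIM` term in the test
    }
}

fn main() {
    let network: Vec<Box<dyn Layer>> = vec![
        Box::new(EmbeddingTable { vocab_size: 1_000, dim: 128, max_seq_len: 80 }),
        Box::new(Norm { dim: 128 }),
    ];

    // Same shape as LLM::total_parameters after the diff: no closure annotation needed.
    let total: usize = network.iter().map(|layer| layer.parameters()).sum();
    assert_eq!(total, 1_000 * 128 + 80 * 128 + 2 * 128);
    println!("total parameters: {total}");
}
```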