scientist-labs · cpetersen · Mar 24, 2026 · Mar 23, 2026 · Mar 24, 2026 · Mar 15, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/clusterkit.gemspec b/clusterkit.gemspec
@@ -33,6 +33,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "rb_sys", "~> 0.9"
 
   # Development dependencies
+  spec.add_development_dependency "benchmark"
   spec.add_development_dependency "csv"
   spec.add_development_dependency "rake", "~> 13.0"
   spec.add_development_dependency "rake-compiler", "~> 1.2"

diff --git a/ext/clusterkit/Cargo.toml b/ext/clusterkit/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
 crate-type = ["cdylib"]
 
 [dependencies]
-magnus = { version = "0.6", features = ["embed"] }
+magnus = { version = "0.8", features = ["embed"] }
 annembed = { git = "https://github.com/scientist-labs/annembed", tag = "clusterkit-0.1.1" }
 hnsw_rs = { git = "https://github.com/scientist-labs/hnswlib-rs", tag = "clusterkit-0.1.0" }
 hdbscan = "0.11"

diff --git a/ext/clusterkit/src/clustering.rs b/ext/clusterkit/src/clustering.rs
@@ -1,50 +1,52 @@
-use magnus::{function, prelude::*, Error, Value, RArray, Integer};
+use magnus::{function, prelude::*, Error, Value, RArray, Ruby};
 use ndarray::{Array1, Array2, ArrayView1, Axis};
 use rand::prelude::*;
 use rand::rngs::StdRng;
 use rand::SeedableRng;
-use crate::utils::{ruby_array_to_ndarray2};
+use crate::utils::ruby_array_to_ndarray2;
 
 mod hdbscan_wrapper;
 
 pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
     let clustering_module = parent.define_module("Clustering")?;
-    
+
     clustering_module.define_singleton_method(
         "kmeans_rust",
         function!(kmeans, 4),
     )?;
-    
+
     clustering_module.define_singleton_method(
         "kmeans_predict_rust",
         function!(kmeans_predict, 2),
     )?;
-    
+
     // Initialize HDBSCAN functions
     hdbscan_wrapper::init(&clustering_module)?;
-    
+
     Ok(())
 }
 
 /// Perform K-means clustering
 /// Returns (labels, centroids, inertia)
 fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> Result<(RArray, RArray, f64), Error> {
+    let ruby = Ruby::get().unwrap();
+
     // Convert Ruby array to ndarray using shared helper
     let data_array = ruby_array_to_ndarray2(data)?;
     let (n_samples, n_features) = data_array.dim();
-    
+
     if k > n_samples {
         return Err(Error::new(
-            magnus::exception::arg_error(),
+            ruby.exception_arg_error(),
             format!("k ({}) cannot be larger than number of samples ({})", k, n_samples),
         ));
     }
-    
+
     // Initialize centroids using K-means++
     let mut centroids = kmeans_plusplus(&data_array, k, random_seed)?;
     let mut labels = vec![0usize; n_samples];
     let mut prev_labels = vec![0usize; n_samples];
-    
+
     // K-means iterations
     for iteration in 0..max_iter {
         // Assign points to nearest centroid
@@ -53,131 +55,128 @@ fn kmeans(data: Value, k: usize, max_iter: usize, random_seed: Option<i64>) -> R
             let point = data_array.row(i);
             let mut min_dist = f64::INFINITY;
             let mut best_cluster = 0;
-            
+
             for (j, centroid) in centroids.axis_iter(Axis(0)).enumerate() {
                 let dist = euclidean_distance(&point, &centroid);
                 if dist < min_dist {
                     min_dist = dist;
                     best_cluster = j;
                 }
             }
-            
+
             if labels[i] != best_cluster {
                 changed = true;
             }
             labels[i] = best_cluster;
         }
-        
+
         // Check for convergence
         if !changed && iteration > 0 {
             break;
         }
-        
+
         // Update centroids
         for j in 0..k {
             let mut sum = Array1::<f64>::zeros(n_features);
             let mut count = 0;
-            
+
             for i in 0..n_samples {
                 if labels[i] == j {
                     sum += &data_array.row(i);
                     count += 1;
                 }
             }
-            
+
             if count > 0 {
                 centroids.row_mut(j).assign(&(sum / count as f64));
             }
         }
-        
+
         prev_labels.clone_from(&labels);
     }
-    
+
     // Calculate inertia (sum of squared distances to nearest centroid)
     let mut inertia = 0.0;
     for i in 0..n_samples {
         let point = data_array.row(i);
         let centroid = centroids.row(labels[i]);
         inertia += euclidean_distance(&point, &centroid).powi(2);
     }
-    
+
     // Convert results to Ruby arrays
-    let ruby = magnus::Ruby::get().unwrap();
-    let labels_array = RArray::new();
+    let labels_array = ruby.ary_new();
     for label in labels {
-        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", label)).unwrap()).unwrap())?;
+        labels_array.push(ruby.integer_from_i64(label as i64))?;
     }
-    
-    let centroids_array = RArray::new();
+
+    let centroids_array = ruby.ary_new();
     for i in 0..k {
-        let row_array = RArray::new();
+        let row_array = ruby.ary_new();
         for j in 0..n_features {
             row_array.push(centroids[[i, j]])?;
         }
         centroids_array.push(row_array)?;
     }
-    
+
     Ok((labels_array, centroids_array, inertia))
 }
 
 /// Predict cluster labels for new data given centroids
 fn kmeans_predict(data: Value, centroids: Value) -> Result<RArray, Error> {
+    let ruby = Ruby::get().unwrap();
+
     // Convert inputs using shared helpers
     let data_matrix = ruby_array_to_ndarray2(data)?;
     let centroids_matrix = ruby_array_to_ndarray2(centroids)?;
-    
+
     let (n_samples, _) = data_matrix.dim();
-    let (_k, _) = centroids_matrix.dim();
-
+
     // Predict labels
-    let ruby = magnus::Ruby::get().unwrap();
-    let labels_array = RArray::new();
-
+    let labels_array = ruby.ary_new();
+
     for i in 0..n_samples {
         let point = data_matrix.row(i);
         let mut min_dist = f64::INFINITY;
         let mut best_cluster = 0;
-        
+
         for (j, centroid) in centroids_matrix.axis_iter(Axis(0)).enumerate() {
             let dist = euclidean_distance(&point, &centroid);
             if dist < min_dist {
                 min_dist = dist;
                 best_cluster = j;
             }
         }
-        
-        labels_array.push(Integer::from_value(ruby.eval(&format!("{}", best_cluster)).unwrap()).unwrap())?;
+
+        labels_array.push(ruby.integer_from_i64(best_cluster as i64))?;
     }
-    
+
     Ok(labels_array)
 }
 
 /// K-means++ initialization
 fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Result<Array2<f64>, Error> {
     let n_samples = data.nrows();
     let n_features = data.ncols();
-    
+
     // Use seeded RNG if seed is provided, otherwise use thread_rng
     let mut rng: Box<dyn RngCore> = match random_seed {
         Some(seed) => {
-            // Convert i64 to u64 for seeding (negative numbers wrap around)
             let seed_u64 = seed as u64;
             Box::new(StdRng::seed_from_u64(seed_u64))
         },
         None => Box::new(thread_rng()),
     };
-    
+
     let mut centroids = Array2::<f64>::zeros((k, n_features));
-    
+
     // Choose first centroid randomly
     let first_idx = rng.gen_range(0..n_samples);
     centroids.row_mut(0).assign(&data.row(first_idx));
-    
+
     // Choose remaining centroids
     for i in 1..k {
         let mut distances = vec![f64::INFINITY; n_samples];
-
-        // Calculate distance to nearest centroid for each point
+
         for j in 0..n_samples {
             for c in 0..i {
                 let dist = euclidean_distance(&data.row(j), &centroids.row(c));
@@ -186,25 +185,20 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
                 }
             }
         }
-
-        // Convert distances to probabilities
+
         let total: f64 = distances.iter().map(|d| d * d).sum();
         if total == 0.0 {
-            // All points are identical or we've selected duplicates
-            // Just use sequential points as centroids
             if i < n_samples {
                 centroids.row_mut(i).assign(&data.row(i));
             } else {
-                // Reuse first point if we run out
                 centroids.row_mut(i).assign(&data.row(0));
             }
             continue;
         }
-
-        // Choose next centroid with probability proportional to squared distance
+
         let mut cumsum = 0.0;
         let rand_val: f64 = rng.gen::<f64>() * total;
-        
+
         for j in 0..n_samples {
             cumsum += distances[j] * distances[j];
             if cumsum >= rand_val {
@@ -213,7 +207,7 @@ fn kmeans_plusplus(data: &Array2<f64>, k: usize, random_seed: Option<i64>) -> Re
             }
         }
     }
-    
+
     Ok(centroids)
 }
 
@@ -224,4 +218,4 @@ fn euclidean_distance(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> f64 {
         .map(|(x, y)| (x - y).powi(2))
         .sum::<f64>()
         .sqrt()
-}
+}