diff --git a/crates/RustQuant_ml/src/lasso.rs b/crates/RustQuant_ml/src/lasso.rs new file mode 100644 index 00000000..8c09869d --- /dev/null +++ b/crates/RustQuant_ml/src/lasso.rs @@ -0,0 +1,290 @@ +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// RustQuant: A Rust library for quantitative finance tools. +// Copyright (C) 2023 https://github.com/avhz +// Dual licensed under Apache 2.0 and MIT. +// See: +// - LICENSE-APACHE.md +// - LICENSE-MIT.md +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +//! Module for Lasso algorithms. + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// IMPORTS +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +use nalgebra::{DMatrix, DVector}; +use RustQuant_error::RustQuantError; + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// STRUCTS, ENUMS, AND TRAITS +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +/// Struct to hold the input data for a Lasso regression. +#[allow(clippy::module_name_repetitions)] +#[derive(Clone, Debug)] +pub struct LassoInput { + /// The features matrix. + pub x: DMatrix, + /// The output data vector, also known as the response vector. + pub y: DVector, + /// The regularization parameter. + pub lambda: T, + /// Include the intercept. + pub fit_intercept: bool, + /// The maximum number of iterations for training. + pub max_iter: usize, + /// The tolerance for the convergence. + pub tolerance: T, +} + +/// Struct to hold the output data for lasso. +#[allow(clippy::module_name_repetitions)] +#[derive(Clone, Debug)] +pub struct LassoOutput { + /// The intercept of the lasso regression, + pub intercept: T, + /// The coefficients of the lasso regression, + pub coefficients: DVector, +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// IMPLEMENTATIONS +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +impl LassoInput { + /// Create a new `LassoInput` struct. + #[must_use] + pub fn new( + x: DMatrix, + y: DVector, + lambda: f64, + fit_intercept: bool, + max_iter: usize, + tolerance: f64, + ) -> Self { + Self { x, y, lambda, fit_intercept, max_iter, tolerance } + } + + /// Fits a Lasso regression to the input data. + /// Returns the intercept and coefficients. + /// The intercept is the first value of the coefficients. + pub fn fit(&self) -> Result, RustQuantError> { + let n_cols = self.x.ncols(); + let n_rows = self.x.nrows() as f64; + let mut features_matrix = self.x.clone(); + let mut residuals = self.y.clone(); + let feature_means = DVector::from_iterator( + self.x.ncols(), + (0..self.x.ncols()).map(|j| self.x.column(j).mean()) + ); + + if self.fit_intercept { + + features_matrix = self.x.clone(); + for j in 0..self.x.ncols() { + let mean = feature_means[j]; + for i in 0..self.x.nrows() { + features_matrix[(i, j)] -= mean; + } + } + residuals -= DVector::from_element(self.x.nrows(), self.y.mean()); + } + + let mut coefficients = DVector::::zeros(n_cols); + + for _ in 0..self.max_iter { + let mut max_delta: f64 = 0.0; + for j in 0..n_cols { + + let feature_vals_col_j = features_matrix.column(j); + let col_norm: f64 = feature_vals_col_j.dot(&feature_vals_col_j); + let rho: f64 = (residuals.dot(&feature_vals_col_j) + coefficients[j] * col_norm) / n_rows; + + let new_coefficient_j: f64 = if rho < -self.lambda { + (rho + self.lambda) / (col_norm / n_rows) + } else if rho > self.lambda { + (rho - self.lambda) / (col_norm / n_rows) + } else { + 0.0 + }; + + let delta: f64 = new_coefficient_j - coefficients[j]; + if delta.abs() > 0.0 { + residuals -= &feature_vals_col_j * delta; + } + coefficients[j] = new_coefficient_j; + max_delta = max_delta.max(delta.abs()); + } + + if max_delta < self.tolerance { + break; + } + } + + let intercept: f64 = if self.fit_intercept { + self.y.mean() - feature_means.dot(&coefficients) + } else { + 0.0 + }; + coefficients = coefficients.insert_row(0, intercept); + + Ok(LassoOutput { + intercept, + coefficients, + }) + } +} + +impl LassoOutput { + /// Predicts the output for the given input data. + pub fn predict(&self, input: DMatrix) -> Result, RustQuantError> { + let intercept = DVector::from_element( + input.nrows(), + self.intercept + ); + let coefficients = self.coefficients.clone().remove_row(0); + let predictions = input * coefficients + intercept; + Ok(predictions) + } +} + + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// UNIT TESTS +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#[cfg(test)] +mod tests_lasso_regression { + use super::*; + use RustQuant_utils::assert_approx_equal; + + struct DataForTests { + training_set: DMatrix, + testing_set: DMatrix, + response: DVector, + } + + fn setup_test() -> DataForTests { + DataForTests { + training_set: DMatrix::from_row_slice( + 4, + 3, + &[ + -0.083_784_355, -0.633_485_70, -0.399_266_60, + -0.982_943_745, 1.090_797_46, -0.468_123_05, + -1.875_067_321, -0.913_727_27, 0.326_962_08, + -0.186_144_661, 1.001_639_71, -0.412_746_90], + ), + + testing_set: DMatrix::from_row_slice( + 4, + 3, + &[ + 0.562_036_47, 0.595_846_45, -0.411_653_01, + 0.663_358_26, 0.452_091_83, -0.294_327_15, + -0.602_897_28, 0.896_743_96, 1.218_573_96, + 0.698_377_69, 0.572_216_51, 0.244_111_43], + ), + + response: DVector::from_row_slice( + &[ + -0.445_151_96, + -1.847_803_64, + -0.628_825_31, + -0.861_080_69 + ] + ), + } + } + + #[test] + fn test_lasso_without_intercept() -> Result<(), RustQuantError> { + + let data: DataForTests = setup_test(); + + let input: LassoInput = LassoInput { + x: data.training_set, + y: data.response, + lambda: 0.01, + fit_intercept: false, + max_iter: 1000, + tolerance: 1e-4, + }; + + let output: LassoOutput = input.fit()?; + let predictions = output.predict(data.testing_set)?; + + for (i, coefficient) in output.coefficients.iter().enumerate() { + assert_approx_equal!( + coefficient, + &[ + 0.0, + 0.743_965_706_491_596_7, + -0.304_713_846_510_641_43, + 1.355_162_653_724_116_22, + ][i], + f64::EPSILON + ); + } + + for (i, pred) in predictions.iter().enumerate() { + assert_approx_equal!( + pred, + &[ + -0.321_283_589_676_737_6, + -0.04310400559445471, + 0.9295807191488583, + 0.6760174510230131 + ][i], + f64::EPSILON + ); + } + Ok(()) + } + + #[test] + fn test_lasso_with_intercept() -> Result<(), RustQuantError> { + + let data: DataForTests = setup_test(); + + let input: LassoInput = LassoInput { + x: data.training_set, + y: data.response, + lambda: 0.01, + fit_intercept: true, + max_iter: 1000, + tolerance: 1e-4, + }; + + let output: LassoOutput = input.fit()?; + let predictions = output.predict(data.testing_set)?; + + for (i, coefficient) in output.coefficients.iter().enumerate() { + assert_approx_equal!( + coefficient, + &[ + 0.009_633_706_736_496_328, + 0.750_479_303_541_854_1, + -0.301_997_087_876_784_5, + 1.373_605_833_196_545_3, + ][i], + f64::EPSILON + ); + } + + for (i, pred) in predictions.iter().enumerate() { + assert_approx_equal!( + pred, + &[ + -0.313_962_423_203_417_3, + -0.033_349_554_520_968_38, + 0.960_198_011_081_136_2, + 0.696_256_873_679_798_4, + ][i], + f64::EPSILON + ); + } + Ok(()) + } +} diff --git a/crates/RustQuant_ml/src/lib.rs b/crates/RustQuant_ml/src/lib.rs index 1280bd0d..919d6af5 100644 --- a/crates/RustQuant_ml/src/lib.rs +++ b/crates/RustQuant_ml/src/lib.rs @@ -33,3 +33,7 @@ pub use linear_regression::*; /// Logistic regression. pub mod logistic_regression; pub use logistic_regression::*; + +/// lasso regression. +pub mod lasso; +pub use lasso::*;