diff --git a/financial_loss_functions/config/hparams.json b/financial_loss_functions/config/hparams.json index d7d4ef90..1878b434 100644 --- a/financial_loss_functions/config/hparams.json +++ b/financial_loss_functions/config/hparams.json @@ -1,45 +1,205 @@ -{ +{ + "seed": 50, "rolling_windows": { - "in_size": 200, - "out_size": 50, + "in_size": 120, + "out_size": 60, "stride": 1 }, "nn_models": { "BaseLSTM": { "model": { - "hidden_size": 256, - "num_layers": 2, + "hidden_size": 16, + "num_layers": 4, "dropout": 0.2, - "equal_prior": true + "equal_prior": false }, "optimizer": { "lr": 1e-4, - "weight_decay": 1e-5 + "weight_decay": 1e-2 }, "train" : { - "train_batch_size": 256, - "val_batch_size": 2, + "train_batch_size": 64, + "val_batch_size": 64, "clip_grad_norm": 0.5, - "epochs": 100 + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 } }, "AttentionLSTM": { "model": { - "hidden_size": 128, - "num_layers": 3, + "hidden_size": 16, + "num_layers": 4, + "attention_heads": 2, + "dropout": 0.2, + "equal_prior": false + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-2 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "TemporalTransformer": { + "model": { + "hidden_size": 16, + "num_layers": 2, "attention_heads": 2, "dropout": 0.2, - "equal_prior": true + "expansion_factor": 4, + "max_seq_len": 120 }, "optimizer": { "lr": 1e-4, - "weight_decay": 1e-4 + "weight_decay": 1e-2 }, "train": { - "train_batch_size": 256, - "val_batch_size": 2, + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.1, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + 
}, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "TFT": { + "model": { + "hidden_size": 16, + "num_layers": 1, + "attention_heads": 2, + "dropout": 0.4, + "expansion_factor": 2, + "max_seq_len": 120 + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-2 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, "clip_grad_norm": 0.5, - "epochs": 100 + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "LSTMTransformer": { + "model": { + "hidden_size": 32, + "num_layers": 2, + "attention_heads": 2, + "dropout": 0.5, + "expansion_factor": 4, + "max_seq_len": 120 + }, + "optimizer": { + "lr": 1e-5, + "weight_decay": 1e-3 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-8 + } + }, + "InvertedAttentionLSTM": { + "model": { + "hidden_size": 16, + "num_layers": 4, + "attention_heads": 8, + "dropout": 0.5, + "max_seq_len": 120 + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-3 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "DeformTime": { + "model": { + "seq_len": 120, + "e_layers": 2, + "d_layers": 2, + "d_model": 16, + "attention_heads": 4, + "kernel_size": 4, + "dropout": 0.2, + "n_reshape": 12, + "patch_len": 6, + "stride": 6 + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-3 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + 
"early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 } } }, @@ -68,13 +228,17 @@ "lambda2": 0.1 }, "custom_loss_8" : { - "lambda1": 0.01, - "lambda2": 0.01, - "lambda3": 0.1 + "log_ret_lambda": 0.01, + "cvar_lambda": 0.1, + "risk_p_lambda": 0.1 }, "custom_loss_9" : { "lambda1": 0.01, "lambda2": 0.1 + }, + "custom_loss_10" : { + "cvar_lambda": 0.1, + "risk_p_lambda": 0.1 } }, "trad_models": { diff --git a/financial_loss_functions/config/paths.json b/financial_loss_functions/config/paths.json index 38c76bac..e210c150 100644 --- a/financial_loss_functions/config/paths.json +++ b/financial_loss_functions/config/paths.json @@ -7,9 +7,9 @@ "crsp_dir": "" }, "raw_files": { - "train": "combined_predictors_train.csv", - "val": "combined_predictors_validation.csv", - "test": "combined_predictors_test.csv" + "train": "crsp_train_2019.csv", + "val": "crsp_val_2021.csv", + "test": "crsp_test_2023.csv" }, "processed_paths": { "returns_train": "data/processed/ret_train.csv", diff --git a/financial_loss_functions/data/raw/sample/combined_predictors_test.csv b/financial_loss_functions/data/raw/sample/crsp_test_2023.csv similarity index 100% rename from financial_loss_functions/data/raw/sample/combined_predictors_test.csv rename to financial_loss_functions/data/raw/sample/crsp_test_2023.csv diff --git a/financial_loss_functions/data/raw/sample/combined_predictors_train.csv b/financial_loss_functions/data/raw/sample/crsp_train_2019.csv similarity index 100% rename from financial_loss_functions/data/raw/sample/combined_predictors_train.csv rename to financial_loss_functions/data/raw/sample/crsp_train_2019.csv diff --git a/financial_loss_functions/data/raw/sample/combined_predictors_validation.csv b/financial_loss_functions/data/raw/sample/crsp_val_2021.csv similarity index 100% rename from financial_loss_functions/data/raw/sample/combined_predictors_validation.csv rename to 
financial_loss_functions/data/raw/sample/crsp_val_2021.csv diff --git a/financial_loss_functions/requirements.txt b/financial_loss_functions/requirements.txt index bba07a16..43dc8934 100644 --- a/financial_loss_functions/requirements.txt +++ b/financial_loss_functions/requirements.txt @@ -12,4 +12,7 @@ seaborn==0.13.0 statsmodels==0.14.5 torch==2.9.1 torchvision==0.24.1 -optuna==4.6.0 \ No newline at end of file +optuna==4.6.0 +tqdm==4.67.1 +einops==0.8.2 +timm==1.0.25 \ No newline at end of file diff --git a/financial_loss_functions/scripts/run_training_one.py b/financial_loss_functions/scripts/run_training_one.py index 924b890f..0abac271 100644 --- a/financial_loss_functions/scripts/run_training_one.py +++ b/financial_loss_functions/scripts/run_training_one.py @@ -3,6 +3,10 @@ import signal import argparse from src.utils.io import load_path_config, load_config + +# @author: Atharva Vaidya - This fallback helps in allowing unsupported MPS ops to run through CPU when DeformTime triggers them. 
+os.environ.setdefault('PYTORCH_ENABLE_MPS_FALLBACK', '1') + from src.training.pipeline import run_training_one_model _interrupted = False @@ -123,4 +127,4 @@ def cleanup_on_interrupt(): print(f"\nPipeline failed with error: {e}") import traceback traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/financial_loss_functions/src/models/DeformTime/DeformTime.py b/financial_loss_functions/src/models/DeformTime/DeformTime.py new file mode 100644 index 00000000..0c2bf0df --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/DeformTime.py @@ -0,0 +1,189 @@ +import torch +from torch import nn +from src.models.registry import NNModelLibrary + +from src.models.DeformTime.layers.TemporalDeformAttention import Encoder, CrossDeformAttn +from src.models.DeformTime.layers.Embed import Deform_Temporal_Embedding, Local_Temporal_Embedding +from math import ceil + +class Layernorm(nn.Module): + def __init__(self, dim): + super(Layernorm, self).__init__() + self.layernorm = nn.LayerNorm(dim) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +@NNModelLibrary.register(category='transformer') +class DeformTime(nn.Module): + def __init__( + self, + input_size: int, + num_stocks: int, + seq_len: int, + e_layers: int, + d_layers: int, + d_model: int, + attention_heads: int, + kernel_size: int, + dropout: float, + n_reshape: int, + patch_len: int, + stride: int + ) -> None: + super().__init__() + + self.input_size = input_size + self.num_stocks = num_stocks + + self.d_layers = d_layers + self.d_model = d_model + + # Embedding + if self.input_size == 1: + self.enc_value_embedding = Deform_Temporal_Embedding(self.input_size, self.d_model, freq='d') + else: + self.s_group = 4 + assert self.d_model % self.s_group == 0 + # Embedding local patches + self.pad_in_len = ceil(1.0 * self.input_size / self.s_group) * self.s_group + 
self.enc_value_embedding = Local_Temporal_Embedding(self.pad_in_len//self.s_group, self.d_model, self.pad_in_len-self.input_size, self.s_group) + + self.pre_norm = nn.LayerNorm(self.d_model) + # Encoder + n_days = [1,n_reshape,n_reshape] + assert len(n_days) > e_layers-1 + drop_path_rate=dropout + dpr = [x.item() for x in torch.linspace(drop_path_rate, drop_path_rate, e_layers)] + self.encoder = Encoder( + [ + CrossDeformAttn(seq_len=seq_len, + d_model=self.d_model, + n_heads=attention_heads, + dropout=dropout, + droprate=dpr[l], + n_days=n_days[l], + window_size=kernel_size, + patch_len=patch_len, + stride=stride) for l in range(e_layers) + ], + norm_layer=Layernorm(self.d_model) + ) + + # GRU layers + self.gru = torch.nn.GRU( + self.d_model, self.d_model, self.d_layers, batch_first=True, dropout=dropout + ) + + # @author: Atharva Vaidya - This head helps in converting the encoded DeformTime context into portfolio logits expected by the loss pipeline. + self.fc = nn.Sequential( + nn.Linear(self.d_model, self.d_model), + nn.LeakyReLU(), + nn.Linear(self.d_model, self.num_stocks) + ) + + def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + """ + Variables + • x_enc + type: tensor of numbers + usage: used to store the input feature window that is passed from the + trainer for DeformTime encoding + • x_mark_enc + type: tensor of numbers or empty value + usage: reserved encoder marker input that is accepted for interface + compatibility but is not used in this runtime path + • x_dec + type: tensor of numbers or empty value + usage: reserved decoder input kept for compatibility with the + surrounding model interface + • x_mark_dec + type: tensor of numbers or empty value + usage: reserved decoder marker input kept for compatibility with the + surrounding model interface + • mean_enc + type: tensor of numbers + usage: used to store the per-window mean so the input can be + stationarized before attention is applied + • std_enc + type: tensor of numbers + usage: 
used to store the per-window standard deviation so the input + scaling remains numerically stable + • enc_out + type: tensor of numbers + usage: used to store the encoded deformable-attention representation + produced by the encoder + • h0 + type: tensor of numbers + usage: used to store the initial hidden state passed into the GRU + • out + type: tensor of numbers + usage: used to store the GRU sequence output before collapsing it to + a single allocation context + • context + type: tensor of numbers + usage: used to store the last GRU state which becomes the compact + portfolio-allocation context for the final head + + forecast now extracts a portfolio-allocation context from the encoded sequence + instead of reconstructing the original feature space. @author: Atharva Vaidya + """ + assert x_enc.shape[-1] == self.input_size + + # Series Stationarization adopted from NSformer, optional + mean_enc = x_enc.mean(1, keepdim=True).detach() # B x 1 x E + x_enc = x_enc - mean_enc + std_enc = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach() + x_enc = x_enc / std_enc + + x_enc = self.enc_value_embedding(x_enc) + x_enc = self.pre_norm(x_enc) + + # Deformed attention + enc_out, _ = self.encoder(x_enc) + + # Decoder + h0 = torch.zeros(self.d_layers, x_enc.size(0), self.d_model).requires_grad_().to(x_enc.device) + out, _ = self.gru(enc_out, h0.detach()) + # Extract the final GRU state so one context vector represents the allocation decision. + context = out[:, -1, :] + # Apply the portfolio head to convert the context vector into allocation logits. 
+ return self.fc(context) + + def forward(self, x_enc, x_mark_enc=None, x_dec=None, x_mark_dec=None, mask=None): + """ + Variables + • x_enc + type: tensor of numbers + usage: used to store the encoded input window passed from the trainer + for a portfolio-weight prediction + • x_mark_enc + type: tensor of numbers or empty value + usage: reserved encoder marker input that is kept for interface + compatibility + • x_dec + type: tensor of numbers or empty value + usage: reserved decoder input that is kept for interface compatibility + • x_mark_dec + type: tensor of numbers or empty value + usage: reserved decoder marker input that is kept for interface + compatibility + • mask + type: tensor of numbers or empty value + usage: reserved mask input accepted for compatibility with the model + interface + • logits + type: tensor of numbers + usage: used to store the raw portfolio scores returned from forecast + before normalization + + forward normalizes the DeformTime logits into portfolio weights expected by + the training and loss pipeline. @author: Atharva Vaidya + """ + logits = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + # Normalize the logits so the downstream losses receive portfolio weights. + return torch.softmax(logits, dim=-1) + diff --git a/financial_loss_functions/src/models/DeformTime/__init__.py b/financial_loss_functions/src/models/DeformTime/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/financial_loss_functions/src/models/DeformTime/layers/Embed.py b/financial_loss_functions/src/models/DeformTime/layers/Embed.py new file mode 100644 index 00000000..fd754c02 --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/layers/Embed.py @@ -0,0 +1,231 @@ +import torch +import torch.nn as nn +import math +from einops import rearrange + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
+ pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + 
self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. + hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type='timeF', freq='h'): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {'h': 4, 't': 5, 's': 6, + 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + self.position_embedding(x) + else: + x = self.value_embedding( + x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + return self.dropout(x) + + +class DataEmbedding_inverted(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_inverted, self).__init__() + self.value_embedding = nn.Linear(c_in, d_model) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = x.permute(0, 2, 1) + # x: [Batch Variate Time] + if x_mark is 
None: + x = self.value_embedding(x) + else: + x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) + # x: [Batch Variate d_model] + return self.dropout(x) + + +class DataEmbedding_wo_pos(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_wo_pos, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + else: + x = self.value_embedding(x) + self.temporal_embedding(x_mark) + return self.dropout(x) + + +class PatchEmbedding(nn.Module): + def __init__(self, d_model, patch_len, stride, padding, dropout): + super(PatchEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + self.stride = stride + self.padding_patch_layer = nn.ReplicationPad1d((0, padding)) + + # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = nn.Linear(patch_len, d_model, bias=False) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars + + +class Deform_Temporal_Embedding(nn.Module): + def __init__(self, d_inp, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(Deform_Temporal_Embedding, 
self).__init__() + + self.value_embedding = nn.Linear(d_inp, d_model, bias=False) + self.position_embedding = PositionalEmbedding(d_model) + + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x): + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x) + +class Local_Temporal_Embedding(nn.Module): + def __init__(self, d_inp, d_model, padding, sub_groups=8, dropout=0.1): + super(Local_Temporal_Embedding, self).__init__() + + d_out = d_model // sub_groups if d_model % sub_groups == 0 else d_model // sub_groups + 1 + self.sub_seqlen = d_inp + self.padding_patch_layer = nn.ReplicationPad1d((0, padding)) + self.value_embedding = nn.Linear(d_inp, d_out, bias=False) + + self.position_embedding = PositionalEmbedding(d_model) + self.dropout = nn.Dropout(p=dropout) + self.d_model = d_model + + def forward(self, x): + # The in_channel is still fully conv with the out channel + # the number of output channels (out_channels) determines + # the number of filters applied to the input, and each filter + # processes the input across all input channels + B, L, C = x.shape + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.sub_seqlen, step=self.sub_seqlen) + x = rearrange(x, 'b l g c -> (b g) l c') + # x = x.permute(0, 2, 1) + x = self.value_embedding(x) + # .permute(0, 2, 1) + x = rearrange(x, '(b g) l c -> b l (g c)', b = B)[:,:,:self.d_model] + x = x + self.position_embedding(x) + return self.dropout(x) \ No newline at end of file diff --git a/financial_loss_functions/src/models/DeformTime/layers/MLP.py b/financial_loss_functions/src/models/DeformTime/layers/MLP.py new file mode 100644 index 00000000..f971bc89 --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/layers/MLP.py @@ -0,0 +1,132 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +activation_functions = { + 'tanh': nn.Tanh(), + 'relu': nn.ReLU(), + 'elu': nn.ELU(), + 'sigmoid': nn.Sigmoid(), + 'gelu':nn.GELU() + } + +class 
LipSwish(torch.nn.Module): + def forward(self, x): + return 0.909 * F.silu(x) + +class MLPLipSwish(torch.nn.Module): + def __init__(self, in_size, out_size, mlp_size, num_layers, tanh): + super().__init__() + + model = [torch.nn.Linear(in_size, mlp_size), + LipSwish()] + for _ in range(num_layers - 1): + model.append(torch.nn.Linear(mlp_size, mlp_size)) + ################### + # LipSwish activations are useful to constrain the Lipschitz constant of the discriminator. + # (For simplicity we additionally use them in the generator, but that's less important.) + ################### + model.append(LipSwish()) + model.append(torch.nn.Linear(mlp_size, out_size)) + if tanh: + model.append(torch.nn.Tanh()) + self._model = torch.nn.Sequential(*model) + + def forward(self, x): + return self._model(x) + +class MLP(nn.Module): + # layer_sizes[0] is the dimension of the input + # layer_sizes[-1] is the dimension of the output + def __init__(self, layer_sizes, final_relu=False, drop_out=0.7): + super().__init__() + layer_list = [] + layer_sizes = [int(x) for x in layer_sizes] + num_layers = len(layer_sizes) - 1 + final_relu_layer = num_layers if final_relu else num_layers - 1 + for i in range(len(layer_sizes) - 1): + input_size = layer_sizes[i] + curr_size = layer_sizes[i + 1] + if i < final_relu_layer: + layer_list.append(nn.ReLU(inplace=False)) + if drop_out != 0: + layer_list.append(nn.Dropout(drop_out)) + layer_list.append(nn.Linear(input_size, curr_size)) + self.net = nn.Sequential(*layer_list) + self.last_linear = self.net[-1] + + def forward(self, x): + return self.net(x) + +class FreqMLP(nn.Module): + def __init__(self, layer_sizes, final_relu=False, drop_out=0.7) -> None: + super().__init__() + layer_list = [] + layer_sizes = [int(x) for x in layer_sizes] + num_layers = len(layer_sizes) - 1 + final_relu_layer = num_layers if final_relu else num_layers - 1 + for i in range(len(layer_sizes) - 1): + input_size = layer_sizes[i] + curr_size = layer_sizes[i + 1] + if i < 
final_relu_layer: + layer_list.append(nn.ReLU(inplace=False)) + if drop_out != 0: + layer_list.append(nn.Dropout(drop_out)) + layer_list.append(nn.Linear(input_size, curr_size)) + self.net = nn.Sequential(*layer_list) + self.last_linear = self.net[-1] + + self.r1 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.i1 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.rb1 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + self.ib1 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + self.r2 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.i2 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.rb2 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + self.ib2 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + + # frequency temporal learner + def MLP_temporal(self, x, B, N, L): + # [B, N, T, D] + x = torch.fft.rfft(x, dim=2, norm='ortho') # FFT on L dimension + y = self.FreMLP(B, N, L, x, self.r2, self.i2, self.rb2, self.ib2) + x = torch.fft.irfft(y, n=self.seq_len, dim=2, norm="ortho") + return x + + # frequency channel learner + def MLP_channel(self, x, B, N, L): + # [B, N, T, D] + x = x.permute(0, 2, 1, 3) + # [B, T, N, D] + x = torch.fft.rfft(x, dim=2, norm='ortho') # FFT on N dimension + y = self.FreMLP(B, L, N, x, self.r1, self.i1, self.rb1, self.ib1) + x = torch.fft.irfft(y, n=self.feature_size, dim=2, norm="ortho") + x = x.permute(0, 2, 1, 3) + # [B, N, T, D] + return x + + # frequency-domain MLPs + # dimension: FFT along the dimension, r: the real part of weights, i: the imaginary part of weights + # rb: the real part of bias, ib: the imaginary part of bias + def FreMLP(self, B, nd, dimension, x, r, i, rb, ib): + o1_real = torch.zeros([B, nd, dimension // 2 + 1, self.embed_size], + device=x.device) + o1_imag = torch.zeros([B, nd, dimension // 2 + 1, self.embed_size], + device=x.device) + + o1_real = F.relu( 
+ torch.einsum('bijd,dd->bijd', x.real, r) - \ + torch.einsum('bijd,dd->bijd', x.imag, i) + \ + rb + ) + + o1_imag = F.relu( + torch.einsum('bijd,dd->bijd', x.imag, r) + \ + torch.einsum('bijd,dd->bijd', x.real, i) + \ + ib + ) + + y = torch.stack([o1_real, o1_imag], dim=-1) + y = F.softshrink(y, lambd=self.sparsity_threshold) + y = torch.view_as_complex(y) + return y \ No newline at end of file diff --git a/financial_loss_functions/src/models/DeformTime/layers/TemporalDeformAttention.py b/financial_loss_functions/src/models/DeformTime/layers/TemporalDeformAttention.py new file mode 100644 index 00000000..61b27b3c --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/layers/TemporalDeformAttention.py @@ -0,0 +1,457 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from timm.layers import trunc_normal_ +from src.models.DeformTime.layers.MLP import MLP +from src.models.DeformTime.utils.functions import num_patches + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + + def __init__(self, kernel_size): + super(series_decomp, self).__init__() + self.moving_avg = moving_avg(kernel_size, stride=1) + + def forward(self, x): + moving_mean = self.moving_avg(x) + res = x - moving_mean + return res, moving_mean + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, 
self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """ + From: https://github.com/huggingface/pytorch-image-models + + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + DropPath is dropping an entire sample from the batch while Dropout is dropping random values + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + +class LayerScale(nn.Module): + def __init__(self, + dim: int, + inplace: bool = False, + init_values: float = 1e-5): + super().__init__() + self.inplace = inplace + self.weight = nn.Parameter(torch.ones(dim) * init_values) + + def forward(self, x): + if self.inplace: + return x.mul_(self.weight.view(-1, 1, 1)) + else: + return x * self.weight.view(-1, 1, 1) + +class LayerNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + x = self.norm(x) + return x + +class LayerNormProxy(nn.Module): + def __init__(self, dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + x = rearrange(x, 'b c l -> b l c') + x = self.norm(x) + return rearrange(x, 'b l c -> b c l') + + +class LayerNormProxy2D(nn.Module): + def __init__(self, dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + x = rearrange(x, 'b c h w -> b h w c') + x = self.norm(x) + return rearrange(x, 'b h w c -> b c h w') + + +class Encoder(nn.Module): + def __init__(self, attn_layers, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.norm = norm_layer + + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + return x, attns + + +class DeformAtten1D(nn.Module): + ''' + max_offset (int): The maximum magnitude of the offset residue. Default: 14. 
+ ''' + def __init__(self, seq_len, d_model, n_heads, dropout, kernel=5, n_groups=4, no_off=False, rpb=True) -> None: + super().__init__() + self.offset_range_factor = kernel + self.no_off = no_off + self.seq_len = seq_len + self.d_model = d_model + self.n_groups = n_groups + self.n_group_channels = self.d_model // self.n_groups + self.n_heads = n_heads + self.n_head_channels = self.d_model // self.n_heads + self.n_group_heads = self.n_heads // self.n_groups + self.scale = self.n_head_channels ** -0.5 + self.rpb = rpb + + self.proj_q = nn.Conv1d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_k = nn.Conv1d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_v = nn.Conv1d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Linear(self.d_model, self.d_model) + kernel_size = kernel + self.stride = 1 + pad_size = kernel_size // 2 if kernel_size != self.stride else 0 + self.proj_offset = nn.Sequential( + nn.Conv1d(self.n_group_channels, self.n_group_channels, kernel_size=kernel_size, stride=self.stride, padding=pad_size), + nn.Conv1d(self.n_group_channels, 1, kernel_size=1, stride=self.stride, padding=pad_size), + ) + + self.scale_factor = self.d_model ** -0.5 # 1/np.sqrt(dim) + + if self.rpb: + self.relative_position_bias_table = nn.Parameter( + torch.zeros(1, self.d_model, self.seq_len)) + trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self, x, mask=None): + B, L, C = x.shape + dtype, device = x.dtype, x.device + x = x.permute(0,2,1) # B, C, L + + q = self.proj_q(x) # B, C, L + + group = lambda t: rearrange(t, 'b (g d) n -> (b g) d n', g = self.n_groups) + + grouped_queries = group(q) + + offset = self.proj_offset(grouped_queries) # B * g 1 Lg + offset = rearrange(offset, 'b 1 n -> b n') + + def grid_sample_1d(feats, grid, *args, **kwargs): + # does 1d grid sample by reshaping it to 2d + grid = rearrange(grid, '... -> ... 
1 1') + grid = F.pad(grid, (1, 0), value = 0.) + feats = rearrange(feats, '... -> ... 1') + # the backward of F.grid_sample is non-deterministic + # See for details: https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html + out = F.grid_sample(feats, grid, **kwargs) + return rearrange(out, '... 1 -> ...') + + def normalize_grid(arange, dim = 1, out_dim = -1): + # normalizes 1d sequence to range of -1 to 1 + n = arange.shape[-1] + return 2.0 * arange / max(n - 1, 1) - 1.0 + + if self.offset_range_factor >= 0 and not self.no_off: + offset = offset.tanh().mul(self.offset_range_factor) + + if self.no_off: + x_sampled = F.avg_pool1d(x, kernel_size=self.stride, stride=self.stride) + else: + grid = torch.arange(offset.shape[-1], device = device) + vgrid = grid + offset + vgrid_scaled = normalize_grid(vgrid) + + x_sampled = grid_sample_1d( + group(x), + vgrid_scaled, + mode = 'bilinear', padding_mode = 'zeros', align_corners = False)[:,:,:L] + + if not self.no_off: + x_sampled = rearrange(x_sampled,'(b g) d n -> b (g d) n', g = self.n_groups) + q = q.reshape(B * self.n_heads, self.n_head_channels, L) + k = self.proj_k(x_sampled).reshape(B * self.n_heads, self.n_head_channels, L) + if self.rpb: + v = self.proj_v(x_sampled) + v = (v + self.relative_position_bias_table).reshape(B * self.n_heads, self.n_head_channels, L) + else: + v = self.proj_v(x_sampled).reshape(B * self.n_heads, self.n_head_channels, L) + + scaled_dot_prod = torch.einsum('b i d , b j d -> b i j', q, k) * self.scale_factor + + if mask is not None: + assert mask.shape == scaled_dot_prod.shape[1:] + scaled_dot_prod = scaled_dot_prod.masked_fill(mask, -np.inf) + + attention = torch.softmax(scaled_dot_prod, dim=-1) # softmax: attention[0,0,:].sum() = 1 + + out = torch.einsum('b i j , b j d -> b i d', attention, v) + + return self.proj_out(rearrange(out, '(b g) l c -> b c (g l)', b=B)) + + +class DeformAtten2D(nn.Module): + ''' + max_offset (int): The maximum magnitude of the offset 
residue. Default: 14. + ''' + def __init__(self, seq_len, d_model, n_heads, dropout, kernel=5, n_groups=4, no_off=False, rpb=True) -> None: + super().__init__() + self.offset_range_factor = kernel + self.no_off = no_off + self.f_sample = False + self.seq_len = seq_len + self.d_model = d_model # (512) + self.n_groups = n_groups + self.n_group_channels = self.d_model // self.n_groups + self.n_heads = n_heads + self.n_head_channels = self.d_model // self.n_heads + self.n_group_heads = self.n_heads // self.n_groups + self.scale = self.n_head_channels ** -0.5 + self.rpb = rpb + + self.proj_q = nn.Conv2d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_k = nn.Conv2d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_v = nn.Conv2d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Linear(self.d_model, self.d_model) + kernel_size = kernel + self.stride = 1 + pad_size = kernel_size // 2 if kernel_size != self.stride else 0 + self.proj_offset = nn.Sequential( + nn.Conv2d(self.n_group_channels, self.n_group_channels, kernel_size=kernel_size, stride=self.stride, padding=pad_size), + nn.Conv2d(self.n_group_channels, 2, kernel_size=1, stride=1, padding=0, bias=False) + ) + + self.scale_factor = self.d_model ** -0.5 # 1/np.sqrt(dim) + + if self.rpb: + self.relative_position_bias_table = nn.Parameter( + torch.zeros(1, self.d_model, self.seq_len, 1)) + trunc_normal_(self.relative_position_bias_table, std=.02) + + + def forward(self, x, mask=None): + B, H, W, C = x.shape + x = x.permute(0, 3, 1, 2) # B, C, H, W + + q = self.proj_q(x) # B, 1, H, W + + offset = self.proj_offset(q) # B, 2, H, W + + if self.offset_range_factor >= 0 and not self.no_off: + offset = offset.tanh().mul(self.offset_range_factor) + + def create_grid_like(t, dim = 0): + h, w, device = *t.shape[-2:], t.device + + grid = torch.stack(torch.meshgrid( + torch.arange(w, device = device), + torch.arange(h, device = device), + 
indexing = 'xy'), dim = dim) + + grid.requires_grad = False + grid = grid.type_as(t) + return grid + + def normalize_grid(grid, dim = 1, out_dim = -1): + # normalizes a grid to range from -1 to 1 + h, w = grid.shape[-2:] + grid_h, grid_w = grid.unbind(dim = dim) + + grid_h = 2.0 * grid_h / max(h - 1, 1) - 1.0 + grid_w = 2.0 * grid_w / max(w - 1, 1) - 1.0 + + return torch.stack((grid_h, grid_w), dim = out_dim) + + if self.no_off: + x_sampled = F.avg_pool2d(x, kernel_size=self.stride, stride=self.stride) + else: + grid =create_grid_like(offset) + vgrid = grid + offset + vgrid_scaled = normalize_grid(vgrid) + # the backward of F.grid_sample is non-deterministic + x_sampled = F.grid_sample( + x, + vgrid_scaled, + mode = 'bilinear', padding_mode = 'zeros', align_corners = False)[:,:,:H,:W] + + if not self.no_off: + x_sampled = rearrange(x_sampled, '(b g) c h w -> b (g c) h w', g=self.n_groups) + q = q.reshape(B * self.n_heads, H, W) + k = self.proj_k(x_sampled).reshape(B * self.n_heads, H, W) + if self.rpb: + v = self.proj_v(x_sampled) + v = (v + self.relative_position_bias_table).reshape(B * self.n_heads, H, W) + else: + v = self.proj_v(x_sampled).reshape(B * self.n_heads, H, W) + + scaled_dot_prod = torch.einsum('b i d , b j d -> b i j', q, k) * self.scale_factor + + if mask is not None: + assert mask.shape == scaled_dot_prod.shape[1:] + scaled_dot_prod = scaled_dot_prod.masked_fill(mask, -np.inf) + + attention = torch.softmax(scaled_dot_prod, dim=-1) + + out = torch.einsum('b i j , b j d -> b i d', attention, v) + + return self.proj_out(out.reshape(B, H, W, C)) + + +class CrossDeformAttn(nn.Module): + def __init__(self, seq_len, d_model, n_heads, dropout, droprate, + n_days=1, window_size=4, patch_len=7, stride=3, no_off=False) -> None: + super().__init__() + self.n_days = n_days + self.seq_len = seq_len + # 1d size: B*n_days, subseq_len, C + # 2d size: B*num_patches, 1, patch_len, C + self.subseq_len = seq_len // n_days + (1 if seq_len % n_days != 0 else 0) + 
self.patch_len = patch_len + self.stride = stride + self.num_patches = num_patches(self.seq_len, self.patch_len, self.stride) + + self.layer_norm = LayerNorm(d_model) + + # 1D + self.ff1 = nn.Linear(d_model, d_model, bias=True) + self.ff2 = nn.Linear(self.subseq_len, self.subseq_len, bias=True) + # Deform attention + self.deform_attn = DeformAtten1D(self.subseq_len, d_model, n_heads, dropout, kernel=window_size, no_off=no_off) + self.attn_layers1d = nn.ModuleList([self.deform_attn]) + + self.mlps1d = nn.ModuleList( + [ + MLP([d_model, d_model], final_relu=True, drop_out=0.0) for _ in range(len(self.attn_layers1d)) + ] + ) + self.drop_path1d = nn.ModuleList( + [ + DropPath(droprate) if droprate > 0.0 else nn.Identity() for _ in range(len(self.attn_layers1d)) + ] + ) + ####################################### + # 2D + d_route = 1 + self.conv_in = nn.Conv2d(1, d_route, kernel_size=1, bias=True) + self.conv_out = nn.Conv2d(d_route, 1, kernel_size=1, bias=True) + self.deform_attn2d = DeformAtten2D(self.patch_len, d_route, n_heads=1, dropout=dropout, kernel=window_size, n_groups=1, no_off=no_off) + self.write_out = nn.Linear(self.num_patches*self.patch_len, self.seq_len) + + self.attn_layers2d = nn.ModuleList([self.deform_attn2d]) + + self.mlps2d = nn.ModuleList( + [ + MLP([d_model, d_model], final_relu=True, drop_out=0.0) for _ in range(len(self.attn_layers2d)) + ] + ) + self.drop_path2d = nn.ModuleList( + [ + DropPath(droprate) if droprate > 0.0 else nn.Identity() for _ in range(len(self.attn_layers2d)) + ] + ) + + self.fc = nn.Linear(2*d_model, d_model) + + def forward(self, x, attn_mask=None, tau=None, delta=None): + n_day = self.n_days + B, L, C = x.shape + + x = self.layer_norm(x) + + padding_len = (n_day - (L % n_day)) % n_day + x_padded = torch.cat((x, x[:, [0], :].expand(-1, padding_len, -1)), dim=1) + x_1d = rearrange(x_padded, 'b (seg_num ts_d) d_model -> (b ts_d) seg_num d_model', ts_d=n_day) + # attn on 1D + for d, attn_layer in enumerate(self.attn_layers1d): 
+ x0 = x_1d + x_1d = attn_layer(x_1d) + x_1d = self.drop_path1d[d](x_1d) + x0 + x0 = x_1d + x_1d = self.mlps1d[d](self.layer_norm(x_1d)) + x_1d = self.drop_path1d[d](x_1d) + x0 + x_1d = rearrange(x_1d, '(b ts_d) seg_num d_model -> b (seg_num ts_d) d_model', ts_d=n_day)[:,:L,:] + + # Patch attn on 2D + x_unfold = x.unfold(dimension=-2, size=self.patch_len, step=self.stride) + x_2d = rearrange(x_unfold, 'b n c l -> (b n) l c').unsqueeze(-3) + x_2d = rearrange(x_2d, 'b c h w -> b h w c') + for d, attn_layer in enumerate(self.attn_layers2d): + x0 = x_2d + x_2d = attn_layer(x_2d) + x_2d = self.drop_path2d[d](x_2d) + x0 + x0 = x_2d + x_2d = self.mlps2d[d](self.layer_norm(x_2d.permute(0,1,3,2))).permute(0,1,3,2) + x_2d = self.drop_path2d[d](x_2d) + x0 + x_2d = rearrange(x_2d, 'b h w c -> b c h w') + x_2d = rearrange(x_2d, '(b n) 1 l c -> b (n l) c', b=B) + x_2d = self.write_out(x_2d.permute(0,2,1)).permute(0,2,1) + + x = torch.concat([x_1d, x_2d], dim=-1) + x = self.fc(x) + + return x, None + diff --git a/financial_loss_functions/src/models/DeformTime/utils/functions.py b/financial_loss_functions/src/models/DeformTime/utils/functions.py new file mode 100644 index 00000000..6cdc805c --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/utils/functions.py @@ -0,0 +1,31 @@ +import torch + + +def grid_sample1D(tensor, grid): + """Given an input and a flow-field grid, computes the output using input + values and pixel locations from grid. 
+ + Args: + tensor: (N, C, L_in) tensor + grid: (N, L_out, 2) tensor in the range of [-1, 1] + + Returns: + (N, C, L_out) tensor + + """ + b, c, l_in = tensor.shape + b_, l_out, w_ = grid.shape + assert b == b_ + out = [] + for (t, g) in zip(tensor, grid): + x_ = 0.5 * (l_in - 1) * (g[:, 0] + 1) + ix = torch.floor(x_).to(torch.int32).clamp(0, l_in - 2) + dx = x_ - ix + out.append((1 - dx) * t[..., ix] + dx * t[..., ix + 1]) + return torch.stack(out, dim=0) + +def num_patches(seq_len, patch_len, stride): + return (seq_len - patch_len) // stride + 1 + +print(num_patches(96, 7, 4)) + diff --git a/financial_loss_functions/src/models/__init__.py b/financial_loss_functions/src/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/financial_loss_functions/src/models/layers/TFT_vsn.py b/financial_loss_functions/src/models/layers/TFT_vsn.py new file mode 100644 index 00000000..9473f849 --- /dev/null +++ b/financial_loss_functions/src/models/layers/TFT_vsn.py @@ -0,0 +1,66 @@ +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn.functional import elu + +class GatedResidualNetwork(nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + output_size: int, + dropout: float + ): + super().__init__() + self.lin1 = nn.Linear(input_size, hidden_size) + self.lin2 = nn.Linear(hidden_size, output_size) + self.gate = nn.Linear(input_size, output_size) + self.ln = nn.LayerNorm(output_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, x: Tensor) -> Tensor: + # GLU-style gating: (Residual + (Transformation * Gating)) + # This helps the model "ignore" noisy features + residual = self.gate(x) + x = elu(self.lin1(x)) # ELU is standard for TFT + x = self.dropout(self.lin2(x)) + gate = torch.sigmoid(residual) + return self.ln(residual + (x * gate)) + +class VariableSelectionNetwork(nn.Module): + def __init__(self, input_size: int, hidden_size: int, dropout: float) -> Tensor: + super().__init__() + self.input_size 
= input_size + self.hidden_size = hidden_size + + # Correct shape for broadcasting: (1, 1, F, H) + # This allows it to skip B and T and multiply directly against F + self.feature_weights = nn.Parameter(torch.randn(1, 1, input_size, hidden_size)) + self.feature_bias = nn.Parameter(torch.zeros(1, 1, input_size, hidden_size)) + + self.selector_grn = GatedResidualNetwork( + input_size * hidden_size, + hidden_size, + input_size, + dropout + ) + + def forward(self, x: Tensor) -> Tensor: + # x: (B, T, F) -> (B, 120, 251) + b, t, f = x.shape + + # 1. Project each feature to hidden_size + # x.unsqueeze(-1) is (B, T, F, 1) + # Multiplication now aligns correctly: (B, T, 251, 1) * (1, 1, 251, H) + var_outputs = x.unsqueeze(-1) * self.feature_weights + self.feature_bias # (B, T, F, H) + + # 2. Variable Selection Weights + flattened = var_outputs.view(b, t, -1) # (B, T, F*H) + + # selector_grn returns (B, T, F) + sparse_weights = torch.softmax(self.selector_grn(flattened), dim=-1) + sparse_weights = sparse_weights.unsqueeze(-1) # (B, T, F, 1) + + # 3. 
Weighted Sum across the Feature dimension + # (B, T, F, 1) * (B, T, F, H) -> sum over F -> (B, T, H) + return torch.sum(sparse_weights * var_outputs, dim=-2) \ No newline at end of file diff --git a/financial_loss_functions/src/models/lstm.py b/financial_loss_functions/src/models/lstm.py index f31f03ab..a8926dce 100644 --- a/financial_loss_functions/src/models/lstm.py +++ b/financial_loss_functions/src/models/lstm.py @@ -18,7 +18,7 @@ def __init__( hidden_size: int, num_layers: int, num_stocks: int, - dropout: float = 0.2, + dropout: float, equal_prior: bool = False ): """ @@ -41,7 +41,7 @@ def __init__( hidden_size=hidden_size, num_layers=num_layers, batch_first=True, - dropout=dropout, + dropout=dropout if num_layers > 1 else 0 ) self.equal_prior = equal_prior @@ -79,15 +79,6 @@ def forward(self, x: Tensor) -> Tensor: last = self.dropout(last) logits = self.fc(last) # (B, N) - - # if self.equal_prior: - # # Strong equal-weight prior that never goes away - # equal_prior = torch.full_like( - # logits, - # fill_value=np.log(1.0 / logits.shape[-1]), - # device=logits.device - # ) - # logits = logits + equal_prior pf_weights = torch.softmax(logits, dim=-1) return pf_weights @@ -126,7 +117,7 @@ def __init__( hidden_size=hidden_size, num_layers=num_layers, batch_first=True, - dropout=dropout, + dropout=dropout if num_layers > 1 else 0 ) self.equal_prior = equal_prior @@ -182,14 +173,162 @@ def forward(self, x: Tensor) -> Tensor: context = self.dropout(context) logits = self.fc(context) # (B, N) - # if self.equal_prior: - # # Strong equal-weight prior that never goes away - # equal_prior = torch.full_like( - # logits, - # fill_value=np.log(1.0 / logits.shape[-1]), - # device=logits.device - # ) - # logits = logits + equal_prior pf_weights = torch.softmax(logits, dim=-1) - return pf_weights \ No newline at end of file + return pf_weights + +@NNModelLibrary.register(category='lstm') +class InvertedAttentionLSTM(nn.Module): + def __init__( + self, + input_size: int, + 
hidden_size: int, + num_layers: int, + num_stocks: int, + attention_heads: int, + dropout: float, + max_seq_len: int, # Needed for the inverted Attention/Norm layers + ): + super().__init__() + # 1. Temporal Extraction (Standard) + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0 + ) + self.ln_lstm = nn.LayerNorm(hidden_size) + + # 2. THE INVERSION: Attention now operates on the Time dimension (max_seq_len) + # We treat each hidden node as a token, and its sequence over time as the 'embedding' + self.attn = nn.MultiheadAttention( + embed_dim=max_seq_len, + num_heads=attention_heads, + batch_first=True + ) + self.ln_attn = nn.LayerNorm(max_seq_len) + + self.dropout = nn.Dropout(dropout) + self.fc = nn.Linear(hidden_size, num_stocks) + + # 3. Decision Space (Expansion FFN) + # After inversion and pooling, we go from hidden_size -> stocks + # self.ffn = nn.Sequential( + # nn.Linear(hidden_size, hidden_size * expansion_factor), + # nn.GELU(), + # nn.Dropout(dropout), + # nn.Linear(hidden_size * expansion_factor, num_stocks) + # ) + + def forward(self, x: Tensor) -> Tensor: + # Step 1: Standard LSTM processing + # x shape: (Batch, Time, Features) -> (B, 120, 251) + out, _ = self.lstm(x) # (B, 120, hidden_size) + out = self.ln_lstm(out) + out = torch.relu(out) + out = self.dropout(out) + + # Step 2: INVERT (Transpose) + # Swap Time (120) and Hidden (32) + # New shape: (Batch, hidden_size, Time) -> (B, 16, 120) + out_inverted = out.transpose(1, 2) + + # Step 3: Feature-wise Attention + # The model asks: "How do these hidden features correlate across the whole window?" 
+ attn_out, _ = self.attn(out_inverted, out_inverted, out_inverted) + + # Residual Connection on the inverted shape + out_inverted = out_inverted + attn_out + out_inverted = self.ln_attn(out_inverted) + + # Step 4: Pooling across the temporal "embeddings" + # We mean-pool the time dimension (dim=2) to get one vector per hidden feature + context = out_inverted.mean(dim=-1) # (B, hidden_size) + context = self.dropout(context) + + # Step 5: Final Portfolio Weights + logits = self.fc(context) + return torch.softmax(logits, dim=-1) + +@NNModelLibrary.register(category='lstm') +class LSTMTransformer(nn.Module): + """ + Hybrid Model: LSTM for local temporal features + Transformer for global attention. + """ + def __init__( + self, + input_size: int, # 251 features + hidden_size: int, # Embedding dimension + num_layers: int, # LSTM layers + num_stocks: int, # 50 stocks + attention_heads: int, + dropout: float, + expansion_factor: int, + max_seq_len: int + ): + super().__init__() + + # 1. Feature Projection (Initial step to clean up features) + self.feature_proj = nn.Linear(input_size, hidden_size) + + # 2. LSTM Layer (Local Temporal Smoothing) + self.lstm = nn.LSTM( + input_size=hidden_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0 + ) + + # 3. Position Encoding (Crucial for the Transformer part) + self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_size)) + nn.init.trunc_normal_(self.pos_embedding, std=0.02) + + # 4. Transformer Block (Global Context) + # Replacing simple Attention with a full Encoder Layer (includes FFN + Norms) + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=attention_heads, + dim_feedforward=hidden_size * expansion_factor, + dropout=dropout, + batch_first=True, + activation='gelu' + ) + self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1) + + # 5. 
Output Head + self.ln_final = nn.LayerNorm(hidden_size) + self.dropout = nn.Dropout(dropout) + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x: (B, T, 251) + + # Initial Projection + x = self.feature_proj(x) + + # Step 1: LSTM local processing + # This helps the Transformer 'see' the sequence as a flow + x, _ = self.lstm(x) # (B, T, H) + # x = torch.relu(x) + x = nn.functional.gelu(x) + x = self.dropout(x) + + # Step 2: Add Positional Information + x = x + self.pos_embedding[:, :x.size(1), :] + + # Step 3: Transformer Global Attention + # Every day now looks at every other day through the lens of the LSTM output + x = self.transformer(x) # (B, T, H) + + # Step 4: Pooling + # Mean pooling the context of the whole 120-day window + context = x.mean(dim=1) + context = self.ln_final(context) + context = self.dropout(context) + + # Step 5: Portfolio Allocation + logits = self.fc(context) # (B, N) + return torch.softmax(logits, dim=-1) + diff --git a/financial_loss_functions/src/models/transformer.py b/financial_loss_functions/src/models/transformer.py new file mode 100644 index 00000000..db7c0719 --- /dev/null +++ b/financial_loss_functions/src/models/transformer.py @@ -0,0 +1,179 @@ +import torch +# import numpy as np +import torch.nn as nn +from torch import Tensor +from src.models.registry import NNModelLibrary +from src.models.layers.TFT_vsn import ( + VariableSelectionNetwork, + GatedResidualNetwork +) + + +@NNModelLibrary.register(category='transformer') +class TemporalTransformer(nn.Module): + """ + SOTA-style Transformer for Portfolio Optimization. + Replaces LSTM with Learned Positional Encodings and Transformer Blocks. 
+ """ + def __init__( + self, + input_size: int, # Number of features (251) + hidden_size: int, # d_model + num_layers: int, # Number of Transformer blocks + num_stocks: int, # Output size + attention_heads: int, + dropout: float, + expansion_factor: int, + max_seq_len: int # Length of your lookback window + ): + super().__init__() + + # 1. Feature Projection: Projects 251 features to hidden_size + self.feature_projection = nn.Linear(input_size, hidden_size) + + # 2. Learned Positional Encoding + # Financial data is sequential; the model needs to know 'when' a bar happened + self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_size)) + + # 3. Transformer Encoder Blocks + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=attention_heads, + dim_feedforward=hidden_size * expansion_factor, + dropout=dropout, + batch_first=True, + activation='gelu' # GELU is standard for SOTA Transformers + ) + self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) + + # 4. Global LayerNorm + self.ln_final = nn.LayerNorm(hidden_size) + + # 5. 
Output Head + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x shape: (Batch, Time, Features) + + # Project features to embedding space + x = self.feature_projection(x) # (B, T, H) + + # Add Positional Encoding + x = x + self.pos_embedding[:, :x.size(1), :] + + # Pass through Transformer Blocks + # Self-attention allows every day in the window to look at every other day + out = self.transformer_encoder(x) # (B, T, H) + + # Mean pooling to average context of the whole window + # context = out[:, -1, :] # # Pooling: The last time step's representation makes model sensitive to last day + context = out.mean(dim=1) + context = self.ln_final(context) + + # Generate Portfolio Logits + logits = self.fc(context) # (B, N) + + # Softmax to ensure weights sum to 1 + return torch.softmax(logits, dim=-1) + + +@NNModelLibrary.register(category='transformer') +class TFT(nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int, + num_stocks: int, + attention_heads: int, + dropout: float, + expansion_factor: int, + max_seq_len: int + ): + super().__init__() + + # 1. Feature Selection Layer (VSN) + self.vsn = VariableSelectionNetwork(input_size, hidden_size, dropout) + + # 2. Position Encoding + self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_size)) + nn.init.trunc_normal_(self.pos_embedding, std=0.02) + + # 3. Temporal Self-Attention + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=attention_heads, + dim_feedforward=hidden_size * expansion_factor, + dropout=dropout, + batch_first=True, + activation='gelu' + ) + self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) + + # 4. 
Final Gating & Output + self.post_attention_grn = GatedResidualNetwork(hidden_size, hidden_size, hidden_size, dropout) + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x: (B, T, 251) + + # Filter 251 features down to hidden_size + x = self.vsn(x) + + # Add position context + x = x + self.pos_embedding[:, :x.size(1), :] + + # Temporal context (Attention) + x = self.transformer(x) + + # Regulate attention output with gating + x = self.post_attention_grn(x) + + # Mean Pooling for stability + context = x.mean(dim=1) + + logits = self.fc(context) + return torch.softmax(logits, dim=-1) + + +# @NNModelLibrary.register(category='transformer') +class PatchTST(nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + num_stocks: int, + patch_size: int, + stride: int, + seq_len: int + ): + super().__init__() + self.patch_size = patch_size + self.num_patches = seq_len // stride + + # 1. Patching Linear Layer + # Takes a patch of 5 days across 251 features and projects it + self.patch_projection = nn.Linear(input_size * patch_size, hidden_size) + + # 2. 
Standard Transformer Encoder + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, nhead=4, batch_first=True, dropout=0.2 + ) + self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=3) + + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x: (Batch, 60, 251) + B, T, C = x.shape + + # Create patches: (Batch, Num_Patches, Patch_Size * Features) + x = x.unfold(1, self.patch_size, self.patch_size) # Extract patches + x = x.reshape(B, self.num_patches, -1) + + x = self.patch_projection(x) # (B, 12, hidden_size) + x = self.transformer(x) + + # Pooling: Use the context of the most recent patches + context = x.mean(dim=1) + return torch.softmax(self.fc(context), dim=-1) \ No newline at end of file diff --git a/financial_loss_functions/src/training/loss_functions.py b/financial_loss_functions/src/training/loss_functions.py index b500e29d..aaac86da 100644 --- a/financial_loss_functions/src/training/loss_functions.py +++ b/financial_loss_functions/src/training/loss_functions.py @@ -1244,7 +1244,7 @@ def custom_loss_7( @LossLibrary.register(category='custom') def custom_loss_8( - weights: Tensor, returns: Tensor, lambda1: float, lambda2: float, lambda3: float + weights: Tensor, returns: Tensor, log_ret_lambda: float, cvar_lambda: float, risk_p_lambda: float ) -> Tensor: """ loss = differentiable sharpe + lambda1 * log returns + lambda2 * smooth CVar + lambda3 * risk_parity @@ -1258,7 +1258,11 @@ def custom_loss_8( # print('Sharpe:', sharpe) # print('CVaR:', cvar) # print('RP:', risk_parity) - return sharpe + (lambda1 * log_returns) + (lambda2 * cvar) + (lambda3 * risk_parity) + loss = sharpe + \ + (log_ret_lambda * log_returns) + \ + (cvar_lambda * cvar) + \ + (risk_p_lambda * risk_parity) + return loss @LossLibrary.register(category='custom') def custom_loss_9( @@ -1274,4 +1278,24 @@ def custom_loss_9( # print('Sharpe:', sharpe) # print('CVaR:', cvar) # print('RP:', risk_parity) - return log_sortino + 
(lambda1 * cvar) + (lambda2 * risk_parity) \ No newline at end of file + return log_sortino + (lambda1 * cvar) + (lambda2 * risk_parity) + +@LossLibrary.register(category='custom') +def custom_loss_10( + weights: Tensor, returns: Tensor, cvar_lambda: float, risk_p_lambda: float +) -> Tensor: + """ + loss = differentiable sharpe + lambda1 * log returns + lambda2 * smooth CVar + lambda3 * risk_parity + """ + ### 2nd Best + sharpe = smooth_neglog_sharpe_loss(weights, returns) + cvar = smooth_rockafellar_cvar_regularizer(weights, returns) + risk_parity = risk_parity_regularizer(weights, returns) + + # print('Sharpe:', sharpe) + # print('CVaR:', cvar) + # print('RP:', risk_parity) + loss = sharpe + \ + (cvar_lambda * cvar) + \ + (risk_p_lambda * risk_parity) + return loss \ No newline at end of file diff --git a/financial_loss_functions/src/training/pipeline.py b/financial_loss_functions/src/training/pipeline.py index c7231e8c..e4eb2c1b 100644 --- a/financial_loss_functions/src/training/pipeline.py +++ b/financial_loss_functions/src/training/pipeline.py @@ -1,11 +1,13 @@ import time +import torch +import numpy as np import pandas as pd from torch import optim from pathlib import Path from src.utils.io import create_directory -from src.utils.device import get_best_device from src.evaluation.evaluator import Evaluator from src.data_processing.loading import load_csv_files +from src.utils.device import get_best_device, set_seed from src.data_processing.dataset import ( Reshaper, calc_in_out_idx, @@ -29,13 +31,12 @@ from src.models.registry import NNModelLibrary, TradModelLibrary # TODO: -# Implement early stopping -# Implement Learning Rate Scheduler # Add other NN models # Add Best model ranker # Unit test NCO -def _common_setup(paths_config): +def _common_setup(paths_config, seed_value: int): + set_seed(seed_value) # Global seed for reproducibility # Create plots directory if it doesnt exist plots_dir = (Path(paths_config['artifacts']['plots'])) 
create_directory(plots_dir) @@ -53,6 +54,31 @@ def _common_setup(paths_config): return plots_dir, results_dir, best_device +def _deformtime_device(best_device: torch.device | str) -> torch.device | str: + """ + Variables + • best_device + type: device name or device object + usage: used to store the preferred runtime device chosen by the project + before DeformTime-specific compatibility checks are applied + + This function helps in downgrading DeformTime to CPU when the selected device is + MPS and its unsupported backward operators would otherwise break training. + @author: Atharva Vaidya + """ + # Check whether the selected device is an MPS device object that DeformTime should avoid. + if isinstance(best_device, torch.device) and best_device.type == 'mps': + # Return CPU so DeformTime avoids unsupported MPS backward operations. + print('DeformTime uses CPU because its backward pass requires ops unsupported on MPS.') + return torch.device('cpu') + # Check whether the selected device is the string form of MPS from the surrounding runtime. + if isinstance(best_device, str) and best_device == 'mps': + # Return CPU in string form so the Trainer receives a safe runtime device. + print('DeformTime uses CPU because its backward pass requires ops unsupported on MPS.') + return 'cpu' + # Keep the originally selected device when no DeformTime-specific MPS workaround is needed. 
+ return best_device + def _load_processed_data(paths_config: dict) -> tuple: processed_files = { @@ -142,7 +168,7 @@ def _print_evaludation_info(in_win_date_cols, out_win_date_cols, **kwargs): for metric, df in kwargs.items(): # Cleaning up the metric name title = metric.replace('_', ' ').upper() - print(f'\n{title} for each window:\n', df) + print(f'\n{title} summary for each window:\n', df) def run_training_pipeline( paths_config: dict, @@ -166,7 +192,9 @@ def run_training_pipeline( print('\n', '=' * 40, ' Training Grid Pipeline ', '=' * 40) start_time = time.time() - plots_dir, results_dir, best_device = _common_setup(paths_config) + plots_dir, results_dir, best_device = _common_setup( + paths_config, hparams_config['seed'] + ) # -------------------- Loading Processed Data -------------------- # train_data, returns_train, val_data, returns_val = _load_processed_data(paths_config) @@ -268,16 +296,14 @@ def run_training_pipeline( # Adding s&p500 returns to the evaluator as a benchmark evaluator.add_benchmark_rets('S&P500', sp500_rets_winds) - plot_windowed_comparison( - evaluator.get_all_daily_returns(), - out_win_date_cols, - plots_dir / (f'Daily Returns' + '.png') - ) + # plot_windowed_comparison( + # evaluator.get_all_daily_returns(), + # out_win_date_cols, + # plots_dir / (f'Daily Returns' + '.png') + # ) total_returns = evaluator.calc_total_performance('returns') - total_returns.to_csv(results_dir / 'total_returns.csv', sep=',') total_sharpes = evaluator.calc_total_performance('sharpe') - total_sharpes.to_csv(results_dir / 'total_sharpes.csv', sep=',') plot_models_comparison( total_sharpes, @@ -285,6 +311,11 @@ def run_training_pipeline( plots_dir / f'Sharpe Comprison.png' ) + total_returns = total_returns.describe().T + total_returns.to_csv(results_dir / 'total_returns.csv', sep=',') + total_sharpes = total_sharpes.describe().T + total_sharpes.to_csv(results_dir / 'total_sharpes.csv', sep=',') + _print_evaludation_info( in_win_date_cols, out_win_date_cols, 
@@ -319,7 +350,13 @@ def run_training_one_model( if loss_cat not in ['objectives', 'custom']: raise ValueError('Loss category must be `objectives` or `custom`.') - plots_dir, results_dir, best_device = _common_setup(paths_config) + plots_dir, results_dir, best_device = _common_setup( + paths_config, hparams_config['seed'] + ) + # @author: Atharva Vaidya - Apply the DeformTime-specific device workaround before trainer construction. + if model_name == 'DeformTime': + # Move DeformTime off MPS so unsupported backward operators do not stop training. + best_device = _deformtime_device(best_device) # -------------------- Model and loss search -------------------- # model_cls = NNModelLibrary.get(model_cat, model_name) @@ -348,18 +385,18 @@ def run_training_one_model( evaluator = Evaluator(y_val) # -------------------- Training Tradional Models -------------------- # - trad_grid = TradModelsTrainer(TradModelLibrary.items(), hparams_config) - trad_alloc_weights = trad_grid.train_all( - in_wind_idxs, - out_wind_idxs, - returns_train, - returns_val - ) - - for trad_model_name, alloc_weights in trad_alloc_weights.items(): - evaluator.calc_pf_daily_rets(alloc_weights, trad_model_name) + # trad_grid = TradModelsTrainer(TradModelLibrary.items(), hparams_config) + # trad_alloc_weights = trad_grid.train_all( + # in_wind_idxs, + # out_wind_idxs, + # returns_train, + # returns_val + # ) + + # for trad_model_name, alloc_weights in trad_alloc_weights.items(): + # evaluator.calc_pf_daily_rets(alloc_weights, trad_model_name) - del trad_grid + # del trad_grid # -------------------- Training Neural Network -------------------- # print('\n', '-'*10, f' Training {model_name}-{loss_name} ', '-'*10) @@ -373,6 +410,7 @@ def run_training_one_model( train_hparams=hparams_config['nn_models'][model_name]['train'], in_size=X_train.shape[2], num_stocks=y_train.shape[2], + scheduler_hparams=hparams_config['nn_models'][model_name]['scheduler'], loss_hparams=hparams_config['losses'].get(loss_name), 
device=best_device ) @@ -425,21 +463,15 @@ def run_training_one_model( # Adding s&p500 returns to the evaluator as a benchmark evaluator.add_benchmark_rets('S&P500', sp500_rets_winds) - plot_windowed_comparison( - evaluator.get_all_daily_returns(), - out_win_date_cols, - plots_dir / - (f'Daily Returns_{model_name}-{loss_name}' + '.png') - ) + # plot_windowed_comparison( + # evaluator.get_all_daily_returns(), + # out_win_date_cols, + # plots_dir / + # (f'Daily Returns_{model_name}-{loss_name}' + '.png') + # ) total_returns = evaluator.calc_total_performance('returns') - total_returns.to_csv( - results_dir / f'total_returns_{model_name}-{loss_name}.csv', sep=',' - ) total_sharpes = evaluator.calc_total_performance('sharpe') - total_sharpes.to_csv( - results_dir / f'total_sharpes_{model_name}-{loss_name}.csv', sep=',' - ) plot_models_comparison( total_sharpes, @@ -447,6 +479,15 @@ def run_training_one_model( plots_dir / f'Sharpe Comprison_{model_name}-{loss_name}.png' ) + total_returns = total_returns.describe().T + total_returns.to_csv( + results_dir / f'total_returns_{model_name}-{loss_name}.csv', sep=',' + ) + total_sharpes = total_sharpes.describe().T + total_sharpes.to_csv( + results_dir / f'total_sharpes_{model_name}-{loss_name}.csv', sep=',' + ) + _print_evaludation_info( in_win_date_cols, out_win_date_cols, @@ -461,4 +502,4 @@ def run_training_one_model( raise ValueError(f'Model {model_name} of {model_cat} not found.') else: - raise ValueError(f'Loss Function {loss_name} not found.') \ No newline at end of file + raise ValueError(f'Loss Function {loss_name} not found.') diff --git a/financial_loss_functions/src/training/train.py b/financial_loss_functions/src/training/train.py index 83421113..07db888f 100644 --- a/financial_loss_functions/src/training/train.py +++ b/financial_loss_functions/src/training/train.py @@ -1,11 +1,14 @@ import gc import os import time +import copy import torch import psutil import inspect import numpy as np import pandas as pd +from 
tqdm import tqdm +from src.utils.device import set_seed from torch.utils.data import DataLoader from src.data_processing.dataset import build_dataset from src.data_processing.preprocess_crsp import preprocessor2 @@ -15,7 +18,6 @@ if TYPE_CHECKING: from src.data_processing.dataset import WindowDataset - class Trainer: """ Class to train provided models with provided hyperparameters. @@ -31,6 +33,7 @@ def __init__( in_size: int, num_stocks: int, device: torch.device | str, + scheduler_hparams: dict[str, Any] | None = None, loss_hparams: dict[str, Any] | None = None ): """ @@ -68,6 +71,7 @@ def __init__( print('Model hyperparameters:\n', model_hparams) print('Optimizer hyperparameters:\n', optimizer_hparams) print('Training hyperparameters:\n', train_hparams) + print('Scheduler hyperparameters:\n', scheduler_hparams) print('Loss Function hyperparameters:\n', loss_hparams) # Initialize model with its specific hyperparameters @@ -82,6 +86,19 @@ def __init__( self.model.parameters(), **optimizer_hparams ) + + if scheduler_hparams: + # 2. 
Initialize Scheduler + self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + self.optimizer, + mode='min', # We want to minimize loss + **scheduler_hparams + ) + + self.lr_schedule = True + else: + self.lr_schedule = False + self.loss = loss self.train_hparams = train_hparams @@ -96,6 +113,11 @@ def __init__( self.avg_eval_loss = None self.eval_alloc_weights = [] + + # For Early Stopping + self.best_val_loss = float('inf') + self.best_model_state = None + self.patience_counter = 0 def train( self, train_ds: 'WindowDataset', val_ds: Optional['WindowDataset'] = None @@ -107,10 +129,16 @@ def train( Training data split converted to windowed dataset tensors """ start_time = time.time() + + # Pull hyperparameters with sensible defaults + patience = self.train_hparams.get('early_stop_patience', 20) + min_delta = self.train_hparams.get('early_stop_min_delta', 1e-3) + early_stopping = self.train_hparams.get('early_stopping', True) + train_loader = DataLoader( train_ds, batch_size=self.train_hparams['train_batch_size'], - shuffle=False + shuffle=True ) for epoch in range(self.train_hparams['epochs']): @@ -141,20 +169,47 @@ def train( epoch_avg_loss = total_loss_sum / total_samples self.train_losses.append(epoch_avg_loss) - epoch_losses_print = f'Epoch {epoch} | Train Loss: {epoch_avg_loss:.4f}' + status_msg = f'Epoch {epoch} | Train Loss: {epoch_avg_loss:.4f}' + self.avg_train_loss = epoch_avg_loss + # --- Validation & Early Stopping Logic --- if val_ds is not None: avg_val_loss = self.validate(val_ds) self.val_losses.append(avg_val_loss) - epoch_losses_print = epoch_losses_print + f' | Val Loss: {avg_val_loss:.4f}' + + # --- STEP THE SCHEDULER HERE --- + # It takes the current validation loss to decide if it should drop the LR + if self.lr_schedule: + self.scheduler.step(avg_val_loss) + + if early_stopping: + # Check for improvement + if avg_val_loss < (self.best_val_loss - min_delta): + self.best_val_loss = avg_val_loss + self.patience_counter = 0 + # Deep copy 
the weights so we can return to this point later + self.best_model_state = copy.deepcopy(self.model.state_dict()) + else: + self.patience_counter += 1 + + status_msg = status_msg + f' | Val Loss: {avg_val_loss:.4f}' + + if self.patience_counter >= patience: + print(f'\n--- Early Stopping Triggered at Epoch {epoch} ---') + # Load the "Best" weights back into the model + self.model.load_state_dict(self.best_model_state) + break + + else: + status_msg = status_msg + f' | Val Loss: {avg_val_loss:.4f}' - self.avg_train_loss = epoch_avg_loss + print(status_msg + f' | Time: {round(time.time() - epoch_start, 3)}s') + + if self.best_model_state is not None: + self.model.load_state_dict(self.best_model_state) - epoch_end = time.time() - epoch_time = round(epoch_end - epoch_start, 3) + print(f'Training Complete. Best Val Loss: {self.best_val_loss:.4f}') - print(epoch_losses_print + f' | Time Taken: {epoch_time}s') - end_time = time.time() time_taken = round(end_time - start_time, 3) print(f'Average Train Loss: {self.avg_train_loss:.4f}, Time Taken: {time_taken}s') @@ -319,6 +374,8 @@ def _train_eval_helper( ) -> np.ndarray: #### Hyperparamater searching can be done here #### + set_seed(self.hparams_config['seed']) # Per model seed for fair comparison + if self.enable_diagnostics: print(f'\n[Before training {model_name} with {loss_name}]') self._memory_diagnostics() @@ -338,8 +395,11 @@ def _train_eval_helper( ][model_name]['train'], in_size=X_train_shape[2], num_stocks=y_train_shape[2], + scheduler_hparams=self.hparams_config[ + self.models_hparams + ][model_name]['scheduler'], loss_hparams=self.hparams_config[self.losses_hparams].get(loss_name), - device=self.torch_device + device=self.torch_device if not model_name == 'DeformTime' else torch.device('cpu') ) trainer.train(train_ds, val_ds) trainer.evaluate(val_ds) @@ -726,7 +786,7 @@ def _train_one_model( # Get hyperparameters of the current model current_hparams = self.hparams_config[self.models_hparams].get(model_name) or 
{} - print('Model hyperparameters:\n', current_hparams) + # print('Model hyperparameters:\n', current_hparams) model_obj = model_class(**current_hparams) alloc_weights = model_obj.calculate_weights(**filtered_kwargs) @@ -746,7 +806,7 @@ def _process_train_1_ds(self, returns_is: pd.DataFrame): # Loop over every model for model_name, model_class in self.model_lib.items(): - print('\n', '-'*10, f' Training {model_name} ', '-'*10) + # print('\n', '-'*10, f' Training {model_name} ', '-'*10) self.all_alloc_weights.setdefault(model_name, []) @@ -789,12 +849,17 @@ def train_all( returns_val: pd.DataFrame, returns_test: pd.DataFrame | None = None ) -> dict[str, list[pd.Series | np.ndarray]]: - + + num_slices = len(in_sample_indexes) if returns_test is None: # To use Validation Set (Combines Train + in-sample Val) # Calculate indexes for in-sample and out-of-sample to match the neural networks # Loop over dataset slices - for i in range(len(in_sample_indexes)): # len(in-sample) = len(out-of-sample) + for i in tqdm( + range(num_slices), + desc=f'Training tradional models on {num_slices} slices', + unit='slice' + ): # len(in-sample) = len(out-of-sample) returns_is, _ = build_dataset( in_sample_indexes[i], out_sample_indexes[i], @@ -802,13 +867,15 @@ def train_all( returns_val ) - print(f'\nTraining all models on slice {i+1} of the data...') - self._process_train_1_ds(returns_is) else: # To use Test Set (Combines Train + Val + in-sample Test) - for i in range(len(in_sample_indexes)): + for i in tqdm( + range(num_slices), + desc=f'Training tradional models on {num_slices} slices', + unit='slice' + ): returns_is, _ = build_dataset( in_sample_indexes[i], out_sample_indexes[i], diff --git a/financial_loss_functions/src/utils/device.py b/financial_loss_functions/src/utils/device.py index 5cb3b6fc..aca11e33 100644 --- a/financial_loss_functions/src/utils/device.py +++ b/financial_loss_functions/src/utils/device.py @@ -1,4 +1,5 @@ import torch +import numpy as np def get_best_device() 
-> torch.device: if torch.backends.mps.is_available(): @@ -11,4 +12,30 @@ def get_best_device() -> torch.device: else: print('No GPU acceleration. Using CPU.') - return torch.device('cpu') \ No newline at end of file + return torch.device('cpu') + +import random + +def set_seed(seed=50): + # Basic Python and Numpy seeds + random.seed(seed) + np.random.seed(seed) + # os.environ['PYTHONHASHSEED'] = str(seed) + + # Basic PyTorch seed (covers CPU) + torch.manual_seed(seed) + + # NVIDIA CUDA Specifics + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # for multi-GPU + # These two ensure deterministic behavior but may slow down training slightly + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Apple Silicon (MPS) Specifics + if hasattr(torch, 'mps') and torch.backends.mps.is_available(): + torch.mps.manual_seed(seed) + # Note: MPS is still maturing; some operations might not be 100% deterministic yet + + print(f'Seeds set to {seed} across all available backends.') diff --git a/financial_loss_functions/src/visualization/plots.py b/financial_loss_functions/src/visualization/plots.py index ae4c466d..d044d64c 100644 --- a/financial_loss_functions/src/visualization/plots.py +++ b/financial_loss_functions/src/visualization/plots.py @@ -38,10 +38,10 @@ def train_val_losses_plot( bars = ax2.bar(window_labels, eval_losses) # Add value labels on top of the bars - for bar in bars: - height = bar.get_height() - ax2.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.4f}', ha='center', va='bottom' if height > 0 else 'top', fontsize=10) + # for bar in bars: + # height = bar.get_height() + # ax2.text(bar.get_x() + bar.get_width()/2., height, + # f'{height:.4f}', ha='center', va='bottom' if height > 0 else 'top', fontsize=10) ax2.set_ylabel('Loss') ax2.set_title(f'Evaluation Performance (N={num_windows} Windows)') @@ -148,7 +148,7 @@ def plot_models_comparison( ): 
eval_metrics.boxplot(figsize=(12, 6)) - plt.xticks(rotation=45) + plt.xticks(rotation=90) plt.title(title) plt.tight_layout() # Ensures labels aren't cut off