diff --git a/financial_loss_functions/config/hparams.json b/financial_loss_functions/config/hparams.json index d7d4ef90..1878b434 100644 --- a/financial_loss_functions/config/hparams.json +++ b/financial_loss_functions/config/hparams.json @@ -1,45 +1,205 @@ -{ +{ + "seed": 50, "rolling_windows": { - "in_size": 200, - "out_size": 50, + "in_size": 120, + "out_size": 60, "stride": 1 }, "nn_models": { "BaseLSTM": { "model": { - "hidden_size": 256, - "num_layers": 2, + "hidden_size": 16, + "num_layers": 4, "dropout": 0.2, - "equal_prior": true + "equal_prior": false }, "optimizer": { "lr": 1e-4, - "weight_decay": 1e-5 + "weight_decay": 1e-2 }, "train" : { - "train_batch_size": 256, - "val_batch_size": 2, + "train_batch_size": 64, + "val_batch_size": 64, "clip_grad_norm": 0.5, - "epochs": 100 + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 } }, "AttentionLSTM": { "model": { - "hidden_size": 128, - "num_layers": 3, + "hidden_size": 16, + "num_layers": 4, + "attention_heads": 2, + "dropout": 0.2, + "equal_prior": false + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-2 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "TemporalTransformer": { + "model": { + "hidden_size": 16, + "num_layers": 2, "attention_heads": 2, "dropout": 0.2, - "equal_prior": true + "expansion_factor": 4, + "max_seq_len": 120 }, "optimizer": { "lr": 1e-4, - "weight_decay": 1e-4 + "weight_decay": 1e-2 }, "train": { - "train_batch_size": 256, - "val_batch_size": 2, + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.1, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + 
}, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "TFT": { + "model": { + "hidden_size": 16, + "num_layers": 1, + "attention_heads": 2, + "dropout": 0.4, + "expansion_factor": 2, + "max_seq_len": 120 + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-2 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, "clip_grad_norm": 0.5, - "epochs": 100 + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "LSTMTransformer": { + "model": { + "hidden_size": 32, + "num_layers": 2, + "attention_heads": 2, + "dropout": 0.5, + "expansion_factor": 4, + "max_seq_len": 120 + }, + "optimizer": { + "lr": 1e-5, + "weight_decay": 1e-3 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-8 + } + }, + "InvertedAttentionLSTM": { + "model": { + "hidden_size": 16, + "num_layers": 4, + "attention_heads": 8, + "dropout": 0.5, + "max_seq_len": 120 + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-3 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + "early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 + } + }, + "DeformTime": { + "model": { + "seq_len": 120, + "e_layers": 2, + "d_layers": 2, + "d_model": 16, + "attention_heads": 4, + "kernel_size": 4, + "dropout": 0.2, + "n_reshape": 12, + "patch_len": 6, + "stride": 6 + }, + "optimizer": { + "lr": 1e-4, + "weight_decay": 1e-3 + }, + "train": { + "train_batch_size": 64, + "val_batch_size": 64, + "clip_grad_norm": 0.5, + "epochs": 200, + "early_stopping": true, + 
"early_stop_patience": 20, + "early_stop_min_delta": 1e-5 + }, + "scheduler": { + "factor": 0.5, + "patience": 10, + "min_lr": 1e-6 } } }, @@ -68,13 +228,17 @@ "lambda2": 0.1 }, "custom_loss_8" : { - "lambda1": 0.01, - "lambda2": 0.01, - "lambda3": 0.1 + "log_ret_lambda": 0.01, + "cvar_lambda": 0.1, + "risk_p_lambda": 0.1 }, "custom_loss_9" : { "lambda1": 0.01, "lambda2": 0.1 + }, + "custom_loss_10" : { + "cvar_lambda": 0.1, + "risk_p_lambda": 0.1 } }, "trad_models": { diff --git a/financial_loss_functions/config/paths.json b/financial_loss_functions/config/paths.json index 38c76bac..e210c150 100644 --- a/financial_loss_functions/config/paths.json +++ b/financial_loss_functions/config/paths.json @@ -7,9 +7,9 @@ "crsp_dir": "" }, "raw_files": { - "train": "combined_predictors_train.csv", - "val": "combined_predictors_validation.csv", - "test": "combined_predictors_test.csv" + "train": "crsp_train_2019.csv", + "val": "crsp_val_2021.csv", + "test": "crsp_test_2023.csv" }, "processed_paths": { "returns_train": "data/processed/ret_train.csv", diff --git a/financial_loss_functions/data/raw/sample/combined_predictors_test.csv b/financial_loss_functions/data/raw/sample/crsp_test_2023.csv similarity index 100% rename from financial_loss_functions/data/raw/sample/combined_predictors_test.csv rename to financial_loss_functions/data/raw/sample/crsp_test_2023.csv diff --git a/financial_loss_functions/data/raw/sample/combined_predictors_train.csv b/financial_loss_functions/data/raw/sample/crsp_train_2019.csv similarity index 100% rename from financial_loss_functions/data/raw/sample/combined_predictors_train.csv rename to financial_loss_functions/data/raw/sample/crsp_train_2019.csv diff --git a/financial_loss_functions/data/raw/sample/combined_predictors_validation.csv b/financial_loss_functions/data/raw/sample/crsp_val_2021.csv similarity index 100% rename from financial_loss_functions/data/raw/sample/combined_predictors_validation.csv rename to 
financial_loss_functions/data/raw/sample/crsp_val_2021.csv diff --git a/financial_loss_functions/requirements.txt b/financial_loss_functions/requirements.txt index bba07a16..43dc8934 100644 --- a/financial_loss_functions/requirements.txt +++ b/financial_loss_functions/requirements.txt @@ -12,4 +12,7 @@ seaborn==0.13.0 statsmodels==0.14.5 torch==2.9.1 torchvision==0.24.1 -optuna==4.6.0 \ No newline at end of file +optuna==4.6.0 +tqdm==4.67.1 +einops==0.8.2 +timm==1.0.25 \ No newline at end of file diff --git a/financial_loss_functions/scripts/run_training_one.py b/financial_loss_functions/scripts/run_training_one.py index 924b890f..0abac271 100644 --- a/financial_loss_functions/scripts/run_training_one.py +++ b/financial_loss_functions/scripts/run_training_one.py @@ -3,6 +3,10 @@ import signal import argparse from src.utils.io import load_path_config, load_config + +# @author: Atharva Vaidya - This fallback helps in allowing unsupported MPS ops to run through CPU when DeformTime triggers them. 
+os.environ.setdefault('PYTORCH_ENABLE_MPS_FALLBACK', '1') + from src.training.pipeline import run_training_one_model _interrupted = False @@ -123,4 +127,4 @@ def cleanup_on_interrupt(): print(f"\nPipeline failed with error: {e}") import traceback traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/financial_loss_functions/src/models/DeformTime/DeformTime.py b/financial_loss_functions/src/models/DeformTime/DeformTime.py new file mode 100644 index 00000000..0c2bf0df --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/DeformTime.py @@ -0,0 +1,189 @@ +import torch +from torch import nn +from src.models.registry import NNModelLibrary + +from src.models.DeformTime.layers.TemporalDeformAttention import Encoder, CrossDeformAttn +from src.models.DeformTime.layers.Embed import Deform_Temporal_Embedding, Local_Temporal_Embedding +from math import ceil + +class Layernorm(nn.Module): + def __init__(self, dim): + super(Layernorm, self).__init__() + self.layernorm = nn.LayerNorm(dim) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +@NNModelLibrary.register(category='transformer') +class DeformTime(nn.Module): + def __init__( + self, + input_size: int, + num_stocks: int, + seq_len: int, + e_layers: int, + d_layers: int, + d_model: int, + attention_heads: int, + kernel_size: int, + dropout: float, + n_reshape: int, + patch_len: int, + stride: int + ) -> None: + super().__init__() + + self.input_size = input_size + self.num_stocks = num_stocks + + self.d_layers = d_layers + self.d_model = d_model + + # Embedding + if self.input_size == 1: + self.enc_value_embedding = Deform_Temporal_Embedding(self.input_size, self.d_model, freq='d') + else: + self.s_group = 4 + assert self.d_model % self.s_group == 0 + # Embedding local patches + self.pad_in_len = ceil(1.0 * self.input_size / self.s_group) * self.s_group + 
self.enc_value_embedding = Local_Temporal_Embedding(self.pad_in_len//self.s_group, self.d_model, self.pad_in_len-self.input_size, self.s_group) + + self.pre_norm = nn.LayerNorm(self.d_model) + # Encoder + n_days = [1,n_reshape,n_reshape] + assert len(n_days) > e_layers-1 + drop_path_rate=dropout + dpr = [x.item() for x in torch.linspace(drop_path_rate, drop_path_rate, e_layers)] + self.encoder = Encoder( + [ + CrossDeformAttn(seq_len=seq_len, + d_model=self.d_model, + n_heads=attention_heads, + dropout=dropout, + droprate=dpr[l], + n_days=n_days[l], + window_size=kernel_size, + patch_len=patch_len, + stride=stride) for l in range(e_layers) + ], + norm_layer=Layernorm(self.d_model) + ) + + # GRU layers + self.gru = torch.nn.GRU( + self.d_model, self.d_model, self.d_layers, batch_first=True, dropout=dropout + ) + + # @author: Atharva Vaidya - This head helps in converting the encoded DeformTime context into portfolio logits expected by the loss pipeline. + self.fc = nn.Sequential( + nn.Linear(self.d_model, self.d_model), + nn.LeakyReLU(), + nn.Linear(self.d_model, self.num_stocks) + ) + + def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + """ + Variables + • x_enc + type: tensor of numbers + usage: used to store the input feature window that is passed from the + trainer for DeformTime encoding + • x_mark_enc + type: tensor of numbers or empty value + usage: reserved encoder marker input that is accepted for interface + compatibility but is not used in this runtime path + • x_dec + type: tensor of numbers or empty value + usage: reserved decoder input kept for compatibility with the + surrounding model interface + • x_mark_dec + type: tensor of numbers or empty value + usage: reserved decoder marker input kept for compatibility with the + surrounding model interface + • mean_enc + type: tensor of numbers + usage: used to store the per-window mean so the input can be + stationarized before attention is applied + • std_enc + type: tensor of numbers + usage: 
used to store the per-window standard deviation so the input + scaling remains numerically stable + • enc_out + type: tensor of numbers + usage: used to store the encoded deformable-attention representation + produced by the encoder + • h0 + type: tensor of numbers + usage: used to store the initial hidden state passed into the GRU + • out + type: tensor of numbers + usage: used to store the GRU sequence output before collapsing it to + a single allocation context + • context + type: tensor of numbers + usage: used to store the last GRU state which becomes the compact + portfolio-allocation context for the final head + + forecast now extracts a portfolio-allocation context from the encoded sequence + instead of reconstructing the original feature space. @author: Atharva Vaidya + """ + assert x_enc.shape[-1] == self.input_size + + # Series Stationarization adopted from NSformer, optional + mean_enc = x_enc.mean(1, keepdim=True).detach() # B x 1 x E + x_enc = x_enc - mean_enc + std_enc = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach() + x_enc = x_enc / std_enc + + x_enc = self.enc_value_embedding(x_enc) + x_enc = self.pre_norm(x_enc) + + # Deformed attention + enc_out, _ = self.encoder(x_enc) + + # Decoder + h0 = torch.zeros(self.d_layers, x_enc.size(0), self.d_model).requires_grad_().to(x_enc.device) + out, _ = self.gru(enc_out, h0.detach()) + # Extract the final GRU state so one context vector represents the allocation decision. + context = out[:, -1, :] + # Apply the portfolio head to convert the context vector into allocation logits. 
+ return self.fc(context) + + def forward(self, x_enc, x_mark_enc=None, x_dec=None, x_mark_dec=None, mask=None): + """ + Variables + • x_enc + type: tensor of numbers + usage: used to store the encoded input window passed from the trainer + for a portfolio-weight prediction + • x_mark_enc + type: tensor of numbers or empty value + usage: reserved encoder marker input that is kept for interface + compatibility + • x_dec + type: tensor of numbers or empty value + usage: reserved decoder input that is kept for interface compatibility + • x_mark_dec + type: tensor of numbers or empty value + usage: reserved decoder marker input that is kept for interface + compatibility + • mask + type: tensor of numbers or empty value + usage: reserved mask input accepted for compatibility with the model + interface + • logits + type: tensor of numbers + usage: used to store the raw portfolio scores returned from forecast + before normalization + + forward normalizes the DeformTime logits into portfolio weights expected by + the training and loss pipeline. @author: Atharva Vaidya + """ + logits = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + # Normalize the logits so the downstream losses receive portfolio weights. + return torch.softmax(logits, dim=-1) + diff --git a/financial_loss_functions/src/models/DeformTime/__init__.py b/financial_loss_functions/src/models/DeformTime/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/financial_loss_functions/src/models/DeformTime/layers/Embed.py b/financial_loss_functions/src/models/DeformTime/layers/Embed.py new file mode 100644 index 00000000..fd754c02 --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/layers/Embed.py @@ -0,0 +1,231 @@ +import torch +import torch.nn as nn +import math +from einops import rearrange + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
+ pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + 
self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. + hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type='timeF', freq='h'): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {'h': 4, 't': 5, 's': 6, + 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + self.position_embedding(x) + else: + x = self.value_embedding( + x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + return self.dropout(x) + + +class DataEmbedding_inverted(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_inverted, self).__init__() + self.value_embedding = nn.Linear(c_in, d_model) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = x.permute(0, 2, 1) + # x: [Batch Variate Time] + if x_mark is 
None: + x = self.value_embedding(x) + else: + x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) + # x: [Batch Variate d_model] + return self.dropout(x) + + +class DataEmbedding_wo_pos(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_wo_pos, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + else: + x = self.value_embedding(x) + self.temporal_embedding(x_mark) + return self.dropout(x) + + +class PatchEmbedding(nn.Module): + def __init__(self, d_model, patch_len, stride, padding, dropout): + super(PatchEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + self.stride = stride + self.padding_patch_layer = nn.ReplicationPad1d((0, padding)) + + # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = nn.Linear(patch_len, d_model, bias=False) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars + + +class Deform_Temporal_Embedding(nn.Module): + def __init__(self, d_inp, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(Deform_Temporal_Embedding, 
self).__init__() + + self.value_embedding = nn.Linear(d_inp, d_model, bias=False) + self.position_embedding = PositionalEmbedding(d_model) + + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x): + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x) + +class Local_Temporal_Embedding(nn.Module): + def __init__(self, d_inp, d_model, padding, sub_groups=8, dropout=0.1): + super(Local_Temporal_Embedding, self).__init__() + + d_out = d_model // sub_groups if d_model % sub_groups == 0 else d_model // sub_groups + 1 + self.sub_seqlen = d_inp + self.padding_patch_layer = nn.ReplicationPad1d((0, padding)) + self.value_embedding = nn.Linear(d_inp, d_out, bias=False) + + self.position_embedding = PositionalEmbedding(d_model) + self.dropout = nn.Dropout(p=dropout) + self.d_model = d_model + + def forward(self, x): + # The in_channel is still fully conv with the out channel + # the number of output channels (out_channels) determines + # the number of filters applied to the input, and each filter + # processes the input across all input channels + B, L, C = x.shape + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.sub_seqlen, step=self.sub_seqlen) + x = rearrange(x, 'b l g c -> (b g) l c') + # x = x.permute(0, 2, 1) + x = self.value_embedding(x) + # .permute(0, 2, 1) + x = rearrange(x, '(b g) l c -> b l (g c)', b = B)[:,:,:self.d_model] + x = x + self.position_embedding(x) + return self.dropout(x) \ No newline at end of file diff --git a/financial_loss_functions/src/models/DeformTime/layers/MLP.py b/financial_loss_functions/src/models/DeformTime/layers/MLP.py new file mode 100644 index 00000000..f971bc89 --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/layers/MLP.py @@ -0,0 +1,132 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +activation_functions = { + 'tanh': nn.Tanh(), + 'relu': nn.ReLU(), + 'elu': nn.ELU(), + 'sigmoid': nn.Sigmoid(), + 'gelu':nn.GELU() + } + +class 
LipSwish(torch.nn.Module): + def forward(self, x): + return 0.909 * F.silu(x) + +class MLPLipSwish(torch.nn.Module): + def __init__(self, in_size, out_size, mlp_size, num_layers, tanh): + super().__init__() + + model = [torch.nn.Linear(in_size, mlp_size), + LipSwish()] + for _ in range(num_layers - 1): + model.append(torch.nn.Linear(mlp_size, mlp_size)) + ################### + # LipSwish activations are useful to constrain the Lipschitz constant of the discriminator. + # (For simplicity we additionally use them in the generator, but that's less important.) + ################### + model.append(LipSwish()) + model.append(torch.nn.Linear(mlp_size, out_size)) + if tanh: + model.append(torch.nn.Tanh()) + self._model = torch.nn.Sequential(*model) + + def forward(self, x): + return self._model(x) + +class MLP(nn.Module): + # layer_sizes[0] is the dimension of the input + # layer_sizes[-1] is the dimension of the output + def __init__(self, layer_sizes, final_relu=False, drop_out=0.7): + super().__init__() + layer_list = [] + layer_sizes = [int(x) for x in layer_sizes] + num_layers = len(layer_sizes) - 1 + final_relu_layer = num_layers if final_relu else num_layers - 1 + for i in range(len(layer_sizes) - 1): + input_size = layer_sizes[i] + curr_size = layer_sizes[i + 1] + if i < final_relu_layer: + layer_list.append(nn.ReLU(inplace=False)) + if drop_out != 0: + layer_list.append(nn.Dropout(drop_out)) + layer_list.append(nn.Linear(input_size, curr_size)) + self.net = nn.Sequential(*layer_list) + self.last_linear = self.net[-1] + + def forward(self, x): + return self.net(x) + +class FreqMLP(nn.Module): + def __init__(self, layer_sizes, final_relu=False, drop_out=0.7) -> None: + super().__init__() + layer_list = [] + layer_sizes = [int(x) for x in layer_sizes] + num_layers = len(layer_sizes) - 1 + final_relu_layer = num_layers if final_relu else num_layers - 1 + for i in range(len(layer_sizes) - 1): + input_size = layer_sizes[i] + curr_size = layer_sizes[i + 1] + if i < 
final_relu_layer: + layer_list.append(nn.ReLU(inplace=False)) + if drop_out != 0: + layer_list.append(nn.Dropout(drop_out)) + layer_list.append(nn.Linear(input_size, curr_size)) + self.net = nn.Sequential(*layer_list) + self.last_linear = self.net[-1] + + self.r1 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.i1 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.rb1 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + self.ib1 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + self.r2 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.i2 = nn.Parameter(self.scale * torch.randn(self.embed_size, self.embed_size)) + self.rb2 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + self.ib2 = nn.Parameter(self.scale * torch.randn(self.embed_size)) + + # frequency temporal learner + def MLP_temporal(self, x, B, N, L): + # [B, N, T, D] + x = torch.fft.rfft(x, dim=2, norm='ortho') # FFT on L dimension + y = self.FreMLP(B, N, L, x, self.r2, self.i2, self.rb2, self.ib2) + x = torch.fft.irfft(y, n=self.seq_len, dim=2, norm="ortho") + return x + + # frequency channel learner + def MLP_channel(self, x, B, N, L): + # [B, N, T, D] + x = x.permute(0, 2, 1, 3) + # [B, T, N, D] + x = torch.fft.rfft(x, dim=2, norm='ortho') # FFT on N dimension + y = self.FreMLP(B, L, N, x, self.r1, self.i1, self.rb1, self.ib1) + x = torch.fft.irfft(y, n=self.feature_size, dim=2, norm="ortho") + x = x.permute(0, 2, 1, 3) + # [B, N, T, D] + return x + + # frequency-domain MLPs + # dimension: FFT along the dimension, r: the real part of weights, i: the imaginary part of weights + # rb: the real part of bias, ib: the imaginary part of bias + def FreMLP(self, B, nd, dimension, x, r, i, rb, ib): + o1_real = torch.zeros([B, nd, dimension // 2 + 1, self.embed_size], + device=x.device) + o1_imag = torch.zeros([B, nd, dimension // 2 + 1, self.embed_size], + device=x.device) + + o1_real = F.relu( 
+ torch.einsum('bijd,dd->bijd', x.real, r) - \ + torch.einsum('bijd,dd->bijd', x.imag, i) + \ + rb + ) + + o1_imag = F.relu( + torch.einsum('bijd,dd->bijd', x.imag, r) + \ + torch.einsum('bijd,dd->bijd', x.real, i) + \ + ib + ) + + y = torch.stack([o1_real, o1_imag], dim=-1) + y = F.softshrink(y, lambd=self.sparsity_threshold) + y = torch.view_as_complex(y) + return y \ No newline at end of file diff --git a/financial_loss_functions/src/models/DeformTime/layers/TemporalDeformAttention.py b/financial_loss_functions/src/models/DeformTime/layers/TemporalDeformAttention.py new file mode 100644 index 00000000..61b27b3c --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/layers/TemporalDeformAttention.py @@ -0,0 +1,457 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from timm.layers import trunc_normal_ +from src.models.DeformTime.layers.MLP import MLP +from src.models.DeformTime.utils.functions import num_patches + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + + def __init__(self, kernel_size): + super(series_decomp, self).__init__() + self.moving_avg = moving_avg(kernel_size, stride=1) + + def forward(self, x): + moving_mean = self.moving_avg(x) + res = x - moving_mean + return res, moving_mean + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, 
self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """ + From: https://github.com/huggingface/pytorch-image-models + + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + DropPath is dropping an entire sample from the batch while Dropout is dropping random values + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + +class LayerScale(nn.Module): + def __init__(self, + dim: int, + inplace: bool = False, + init_values: float = 1e-5): + super().__init__() + self.inplace = inplace + self.weight = nn.Parameter(torch.ones(dim) * init_values) + + def forward(self, x): + if self.inplace: + return x.mul_(self.weight.view(-1, 1, 1)) + else: + return x * self.weight.view(-1, 1, 1) + +class LayerNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + x = self.norm(x) + return x + +class LayerNormProxy(nn.Module): + def __init__(self, dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + x = rearrange(x, 'b c l -> b l c') + x = self.norm(x) + return rearrange(x, 'b l c -> b c l') + + +class LayerNormProxy2D(nn.Module): + def __init__(self, dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + x = rearrange(x, 'b c h w -> b h w c') + x = self.norm(x) + return rearrange(x, 'b h w c -> b c h w') + + +class Encoder(nn.Module): + def __init__(self, attn_layers, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.norm = norm_layer + + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + return x, attns + + +class DeformAtten1D(nn.Module): + ''' + max_offset (int): The maximum magnitude of the offset residue. Default: 14. 
+ ''' + def __init__(self, seq_len, d_model, n_heads, dropout, kernel=5, n_groups=4, no_off=False, rpb=True) -> None: + super().__init__() + self.offset_range_factor = kernel + self.no_off = no_off + self.seq_len = seq_len + self.d_model = d_model + self.n_groups = n_groups + self.n_group_channels = self.d_model // self.n_groups + self.n_heads = n_heads + self.n_head_channels = self.d_model // self.n_heads + self.n_group_heads = self.n_heads // self.n_groups + self.scale = self.n_head_channels ** -0.5 + self.rpb = rpb + + self.proj_q = nn.Conv1d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_k = nn.Conv1d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_v = nn.Conv1d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Linear(self.d_model, self.d_model) + kernel_size = kernel + self.stride = 1 + pad_size = kernel_size // 2 if kernel_size != self.stride else 0 + self.proj_offset = nn.Sequential( + nn.Conv1d(self.n_group_channels, self.n_group_channels, kernel_size=kernel_size, stride=self.stride, padding=pad_size), + nn.Conv1d(self.n_group_channels, 1, kernel_size=1, stride=self.stride, padding=pad_size), + ) + + self.scale_factor = self.d_model ** -0.5 # 1/np.sqrt(dim) + + if self.rpb: + self.relative_position_bias_table = nn.Parameter( + torch.zeros(1, self.d_model, self.seq_len)) + trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self, x, mask=None): + B, L, C = x.shape + dtype, device = x.dtype, x.device + x = x.permute(0,2,1) # B, C, L + + q = self.proj_q(x) # B, C, L + + group = lambda t: rearrange(t, 'b (g d) n -> (b g) d n', g = self.n_groups) + + grouped_queries = group(q) + + offset = self.proj_offset(grouped_queries) # B * g 1 Lg + offset = rearrange(offset, 'b 1 n -> b n') + + def grid_sample_1d(feats, grid, *args, **kwargs): + # does 1d grid sample by reshaping it to 2d + grid = rearrange(grid, '... -> ... 
1 1') + grid = F.pad(grid, (1, 0), value = 0.) + feats = rearrange(feats, '... -> ... 1') + # the backward of F.grid_sample is non-deterministic + # See for details: https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html + out = F.grid_sample(feats, grid, **kwargs) + return rearrange(out, '... 1 -> ...') + + def normalize_grid(arange, dim = 1, out_dim = -1): + # normalizes 1d sequence to range of -1 to 1 + n = arange.shape[-1] + return 2.0 * arange / max(n - 1, 1) - 1.0 + + if self.offset_range_factor >= 0 and not self.no_off: + offset = offset.tanh().mul(self.offset_range_factor) + + if self.no_off: + x_sampled = F.avg_pool1d(x, kernel_size=self.stride, stride=self.stride) + else: + grid = torch.arange(offset.shape[-1], device = device) + vgrid = grid + offset + vgrid_scaled = normalize_grid(vgrid) + + x_sampled = grid_sample_1d( + group(x), + vgrid_scaled, + mode = 'bilinear', padding_mode = 'zeros', align_corners = False)[:,:,:L] + + if not self.no_off: + x_sampled = rearrange(x_sampled,'(b g) d n -> b (g d) n', g = self.n_groups) + q = q.reshape(B * self.n_heads, self.n_head_channels, L) + k = self.proj_k(x_sampled).reshape(B * self.n_heads, self.n_head_channels, L) + if self.rpb: + v = self.proj_v(x_sampled) + v = (v + self.relative_position_bias_table).reshape(B * self.n_heads, self.n_head_channels, L) + else: + v = self.proj_v(x_sampled).reshape(B * self.n_heads, self.n_head_channels, L) + + scaled_dot_prod = torch.einsum('b i d , b j d -> b i j', q, k) * self.scale_factor + + if mask is not None: + assert mask.shape == scaled_dot_prod.shape[1:] + scaled_dot_prod = scaled_dot_prod.masked_fill(mask, -np.inf) + + attention = torch.softmax(scaled_dot_prod, dim=-1) # softmax: attention[0,0,:].sum() = 1 + + out = torch.einsum('b i j , b j d -> b i d', attention, v) + + return self.proj_out(rearrange(out, '(b g) l c -> b c (g l)', b=B)) + + +class DeformAtten2D(nn.Module): + ''' + max_offset (int): The maximum magnitude of the offset 
residue. Default: 14. + ''' + def __init__(self, seq_len, d_model, n_heads, dropout, kernel=5, n_groups=4, no_off=False, rpb=True) -> None: + super().__init__() + self.offset_range_factor = kernel + self.no_off = no_off + self.f_sample = False + self.seq_len = seq_len + self.d_model = d_model # (512) + self.n_groups = n_groups + self.n_group_channels = self.d_model // self.n_groups + self.n_heads = n_heads + self.n_head_channels = self.d_model // self.n_heads + self.n_group_heads = self.n_heads // self.n_groups + self.scale = self.n_head_channels ** -0.5 + self.rpb = rpb + + self.proj_q = nn.Conv2d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_k = nn.Conv2d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_v = nn.Conv2d(self.d_model, self.d_model, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Linear(self.d_model, self.d_model) + kernel_size = kernel + self.stride = 1 + pad_size = kernel_size // 2 if kernel_size != self.stride else 0 + self.proj_offset = nn.Sequential( + nn.Conv2d(self.n_group_channels, self.n_group_channels, kernel_size=kernel_size, stride=self.stride, padding=pad_size), + nn.Conv2d(self.n_group_channels, 2, kernel_size=1, stride=1, padding=0, bias=False) + ) + + self.scale_factor = self.d_model ** -0.5 # 1/np.sqrt(dim) + + if self.rpb: + self.relative_position_bias_table = nn.Parameter( + torch.zeros(1, self.d_model, self.seq_len, 1)) + trunc_normal_(self.relative_position_bias_table, std=.02) + + + def forward(self, x, mask=None): + B, H, W, C = x.shape + x = x.permute(0, 3, 1, 2) # B, C, H, W + + q = self.proj_q(x) # B, 1, H, W + + offset = self.proj_offset(q) # B, 2, H, W + + if self.offset_range_factor >= 0 and not self.no_off: + offset = offset.tanh().mul(self.offset_range_factor) + + def create_grid_like(t, dim = 0): + h, w, device = *t.shape[-2:], t.device + + grid = torch.stack(torch.meshgrid( + torch.arange(w, device = device), + torch.arange(h, device = device), + 
indexing = 'xy'), dim = dim) + + grid.requires_grad = False + grid = grid.type_as(t) + return grid + + def normalize_grid(grid, dim = 1, out_dim = -1): + # normalizes a grid to range from -1 to 1 + h, w = grid.shape[-2:] + grid_h, grid_w = grid.unbind(dim = dim) + + grid_h = 2.0 * grid_h / max(h - 1, 1) - 1.0 + grid_w = 2.0 * grid_w / max(w - 1, 1) - 1.0 + + return torch.stack((grid_h, grid_w), dim = out_dim) + + if self.no_off: + x_sampled = F.avg_pool2d(x, kernel_size=self.stride, stride=self.stride) + else: + grid =create_grid_like(offset) + vgrid = grid + offset + vgrid_scaled = normalize_grid(vgrid) + # the backward of F.grid_sample is non-deterministic + x_sampled = F.grid_sample( + x, + vgrid_scaled, + mode = 'bilinear', padding_mode = 'zeros', align_corners = False)[:,:,:H,:W] + + if not self.no_off: + x_sampled = rearrange(x_sampled, '(b g) c h w -> b (g c) h w', g=self.n_groups) + q = q.reshape(B * self.n_heads, H, W) + k = self.proj_k(x_sampled).reshape(B * self.n_heads, H, W) + if self.rpb: + v = self.proj_v(x_sampled) + v = (v + self.relative_position_bias_table).reshape(B * self.n_heads, H, W) + else: + v = self.proj_v(x_sampled).reshape(B * self.n_heads, H, W) + + scaled_dot_prod = torch.einsum('b i d , b j d -> b i j', q, k) * self.scale_factor + + if mask is not None: + assert mask.shape == scaled_dot_prod.shape[1:] + scaled_dot_prod = scaled_dot_prod.masked_fill(mask, -np.inf) + + attention = torch.softmax(scaled_dot_prod, dim=-1) + + out = torch.einsum('b i j , b j d -> b i d', attention, v) + + return self.proj_out(out.reshape(B, H, W, C)) + + +class CrossDeformAttn(nn.Module): + def __init__(self, seq_len, d_model, n_heads, dropout, droprate, + n_days=1, window_size=4, patch_len=7, stride=3, no_off=False) -> None: + super().__init__() + self.n_days = n_days + self.seq_len = seq_len + # 1d size: B*n_days, subseq_len, C + # 2d size: B*num_patches, 1, patch_len, C + self.subseq_len = seq_len // n_days + (1 if seq_len % n_days != 0 else 0) + 
self.patch_len = patch_len + self.stride = stride + self.num_patches = num_patches(self.seq_len, self.patch_len, self.stride) + + self.layer_norm = LayerNorm(d_model) + + # 1D + self.ff1 = nn.Linear(d_model, d_model, bias=True) + self.ff2 = nn.Linear(self.subseq_len, self.subseq_len, bias=True) + # Deform attention + self.deform_attn = DeformAtten1D(self.subseq_len, d_model, n_heads, dropout, kernel=window_size, no_off=no_off) + self.attn_layers1d = nn.ModuleList([self.deform_attn]) + + self.mlps1d = nn.ModuleList( + [ + MLP([d_model, d_model], final_relu=True, drop_out=0.0) for _ in range(len(self.attn_layers1d)) + ] + ) + self.drop_path1d = nn.ModuleList( + [ + DropPath(droprate) if droprate > 0.0 else nn.Identity() for _ in range(len(self.attn_layers1d)) + ] + ) + ####################################### + # 2D + d_route = 1 + self.conv_in = nn.Conv2d(1, d_route, kernel_size=1, bias=True) + self.conv_out = nn.Conv2d(d_route, 1, kernel_size=1, bias=True) + self.deform_attn2d = DeformAtten2D(self.patch_len, d_route, n_heads=1, dropout=dropout, kernel=window_size, n_groups=1, no_off=no_off) + self.write_out = nn.Linear(self.num_patches*self.patch_len, self.seq_len) + + self.attn_layers2d = nn.ModuleList([self.deform_attn2d]) + + self.mlps2d = nn.ModuleList( + [ + MLP([d_model, d_model], final_relu=True, drop_out=0.0) for _ in range(len(self.attn_layers2d)) + ] + ) + self.drop_path2d = nn.ModuleList( + [ + DropPath(droprate) if droprate > 0.0 else nn.Identity() for _ in range(len(self.attn_layers2d)) + ] + ) + + self.fc = nn.Linear(2*d_model, d_model) + + def forward(self, x, attn_mask=None, tau=None, delta=None): + n_day = self.n_days + B, L, C = x.shape + + x = self.layer_norm(x) + + padding_len = (n_day - (L % n_day)) % n_day + x_padded = torch.cat((x, x[:, [0], :].expand(-1, padding_len, -1)), dim=1) + x_1d = rearrange(x_padded, 'b (seg_num ts_d) d_model -> (b ts_d) seg_num d_model', ts_d=n_day) + # attn on 1D + for d, attn_layer in enumerate(self.attn_layers1d): 
+ x0 = x_1d + x_1d = attn_layer(x_1d) + x_1d = self.drop_path1d[d](x_1d) + x0 + x0 = x_1d + x_1d = self.mlps1d[d](self.layer_norm(x_1d)) + x_1d = self.drop_path1d[d](x_1d) + x0 + x_1d = rearrange(x_1d, '(b ts_d) seg_num d_model -> b (seg_num ts_d) d_model', ts_d=n_day)[:,:L,:] + + # Patch attn on 2D + x_unfold = x.unfold(dimension=-2, size=self.patch_len, step=self.stride) + x_2d = rearrange(x_unfold, 'b n c l -> (b n) l c').unsqueeze(-3) + x_2d = rearrange(x_2d, 'b c h w -> b h w c') + for d, attn_layer in enumerate(self.attn_layers2d): + x0 = x_2d + x_2d = attn_layer(x_2d) + x_2d = self.drop_path2d[d](x_2d) + x0 + x0 = x_2d + x_2d = self.mlps2d[d](self.layer_norm(x_2d.permute(0,1,3,2))).permute(0,1,3,2) + x_2d = self.drop_path2d[d](x_2d) + x0 + x_2d = rearrange(x_2d, 'b h w c -> b c h w') + x_2d = rearrange(x_2d, '(b n) 1 l c -> b (n l) c', b=B) + x_2d = self.write_out(x_2d.permute(0,2,1)).permute(0,2,1) + + x = torch.concat([x_1d, x_2d], dim=-1) + x = self.fc(x) + + return x, None + diff --git a/financial_loss_functions/src/models/DeformTime/utils/functions.py b/financial_loss_functions/src/models/DeformTime/utils/functions.py new file mode 100644 index 00000000..6cdc805c --- /dev/null +++ b/financial_loss_functions/src/models/DeformTime/utils/functions.py @@ -0,0 +1,31 @@ +import torch + + +def grid_sample1D(tensor, grid): + """Given an input and a flow-field grid, computes the output using input + values and pixel locations from grid. 
+ + Args: + tensor: (N, C, L_in) tensor + grid: (N, L_out, 2) tensor in the range of [-1, 1] + + Returns: + (N, C, L_out) tensor + + """ + b, c, l_in = tensor.shape + b_, l_out, w_ = grid.shape + assert b == b_ + out = [] + for (t, g) in zip(tensor, grid): + x_ = 0.5 * (l_in - 1) * (g[:, 0] + 1) + ix = torch.floor(x_).to(torch.int32).clamp(0, l_in - 2) + dx = x_ - ix + out.append((1 - dx) * t[..., ix] + dx * t[..., ix + 1]) + return torch.stack(out, dim=0) + +def num_patches(seq_len, patch_len, stride): + return (seq_len - patch_len) // stride + 1 + +print(num_patches(96, 7, 4)) + diff --git a/financial_loss_functions/src/models/__init__.py b/financial_loss_functions/src/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/financial_loss_functions/src/models/layers/TFT_vsn.py b/financial_loss_functions/src/models/layers/TFT_vsn.py new file mode 100644 index 00000000..9473f849 --- /dev/null +++ b/financial_loss_functions/src/models/layers/TFT_vsn.py @@ -0,0 +1,66 @@ +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn.functional import elu + +class GatedResidualNetwork(nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + output_size: int, + dropout: float + ): + super().__init__() + self.lin1 = nn.Linear(input_size, hidden_size) + self.lin2 = nn.Linear(hidden_size, output_size) + self.gate = nn.Linear(input_size, output_size) + self.ln = nn.LayerNorm(output_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, x: Tensor) -> Tensor: + # GLU-style gating: (Residual + (Transformation * Gating)) + # This helps the model "ignore" noisy features + residual = self.gate(x) + x = elu(self.lin1(x)) # ELU is standard for TFT + x = self.dropout(self.lin2(x)) + gate = torch.sigmoid(residual) + return self.ln(residual + (x * gate)) + +class VariableSelectionNetwork(nn.Module): + def __init__(self, input_size: int, hidden_size: int, dropout: float) -> Tensor: + super().__init__() + self.input_size 
= input_size + self.hidden_size = hidden_size + + # Correct shape for broadcasting: (1, 1, F, H) + # This allows it to skip B and T and multiply directly against F + self.feature_weights = nn.Parameter(torch.randn(1, 1, input_size, hidden_size)) + self.feature_bias = nn.Parameter(torch.zeros(1, 1, input_size, hidden_size)) + + self.selector_grn = GatedResidualNetwork( + input_size * hidden_size, + hidden_size, + input_size, + dropout + ) + + def forward(self, x: Tensor) -> Tensor: + # x: (B, T, F) -> (B, 120, 251) + b, t, f = x.shape + + # 1. Project each feature to hidden_size + # x.unsqueeze(-1) is (B, T, F, 1) + # Multiplication now aligns correctly: (B, T, 251, 1) * (1, 1, 251, H) + var_outputs = x.unsqueeze(-1) * self.feature_weights + self.feature_bias # (B, T, F, H) + + # 2. Variable Selection Weights + flattened = var_outputs.view(b, t, -1) # (B, T, F*H) + + # selector_grn returns (B, T, F) + sparse_weights = torch.softmax(self.selector_grn(flattened), dim=-1) + sparse_weights = sparse_weights.unsqueeze(-1) # (B, T, F, 1) + + # 3. 
Weighted Sum across the Feature dimension + # (B, T, F, 1) * (B, T, F, H) -> sum over F -> (B, T, H) + return torch.sum(sparse_weights * var_outputs, dim=-2) \ No newline at end of file diff --git a/financial_loss_functions/src/models/lstm.py b/financial_loss_functions/src/models/lstm.py index f31f03ab..a8926dce 100644 --- a/financial_loss_functions/src/models/lstm.py +++ b/financial_loss_functions/src/models/lstm.py @@ -18,7 +18,7 @@ def __init__( hidden_size: int, num_layers: int, num_stocks: int, - dropout: float = 0.2, + dropout: float, equal_prior: bool = False ): """ @@ -41,7 +41,7 @@ def __init__( hidden_size=hidden_size, num_layers=num_layers, batch_first=True, - dropout=dropout, + dropout=dropout if num_layers > 1 else 0 ) self.equal_prior = equal_prior @@ -79,15 +79,6 @@ def forward(self, x: Tensor) -> Tensor: last = self.dropout(last) logits = self.fc(last) # (B, N) - - # if self.equal_prior: - # # Strong equal-weight prior that never goes away - # equal_prior = torch.full_like( - # logits, - # fill_value=np.log(1.0 / logits.shape[-1]), - # device=logits.device - # ) - # logits = logits + equal_prior pf_weights = torch.softmax(logits, dim=-1) return pf_weights @@ -126,7 +117,7 @@ def __init__( hidden_size=hidden_size, num_layers=num_layers, batch_first=True, - dropout=dropout, + dropout=dropout if num_layers > 1 else 0 ) self.equal_prior = equal_prior @@ -182,14 +173,162 @@ def forward(self, x: Tensor) -> Tensor: context = self.dropout(context) logits = self.fc(context) # (B, N) - # if self.equal_prior: - # # Strong equal-weight prior that never goes away - # equal_prior = torch.full_like( - # logits, - # fill_value=np.log(1.0 / logits.shape[-1]), - # device=logits.device - # ) - # logits = logits + equal_prior pf_weights = torch.softmax(logits, dim=-1) - return pf_weights \ No newline at end of file + return pf_weights + +@NNModelLibrary.register(category='lstm') +class InvertedAttentionLSTM(nn.Module): + def __init__( + self, + input_size: int, + 
hidden_size: int, + num_layers: int, + num_stocks: int, + attention_heads: int, + dropout: float, + max_seq_len: int, # Needed for the inverted Attention/Norm layers + ): + super().__init__() + # 1. Temporal Extraction (Standard) + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0 + ) + self.ln_lstm = nn.LayerNorm(hidden_size) + + # 2. THE INVERSION: Attention now operates on the Time dimension (max_seq_len) + # We treat each hidden node as a token, and its sequence over time as the 'embedding' + self.attn = nn.MultiheadAttention( + embed_dim=max_seq_len, + num_heads=attention_heads, + batch_first=True + ) + self.ln_attn = nn.LayerNorm(max_seq_len) + + self.dropout = nn.Dropout(dropout) + self.fc = nn.Linear(hidden_size, num_stocks) + + # 3. Decision Space (Expansion FFN) + # After inversion and pooling, we go from hidden_size -> stocks + # self.ffn = nn.Sequential( + # nn.Linear(hidden_size, hidden_size * expansion_factor), + # nn.GELU(), + # nn.Dropout(dropout), + # nn.Linear(hidden_size * expansion_factor, num_stocks) + # ) + + def forward(self, x: Tensor) -> Tensor: + # Step 1: Standard LSTM processing + # x shape: (Batch, Time, Features) -> (B, 120, 251) + out, _ = self.lstm(x) # (B, 120, hidden_size) + out = self.ln_lstm(out) + out = torch.relu(out) + out = self.dropout(out) + + # Step 2: INVERT (Transpose) + # Swap Time (120) and Hidden (32) + # New shape: (Batch, hidden_size, Time) -> (B, 16, 120) + out_inverted = out.transpose(1, 2) + + # Step 3: Feature-wise Attention + # The model asks: "How do these hidden features correlate across the whole window?" 
+ attn_out, _ = self.attn(out_inverted, out_inverted, out_inverted) + + # Residual Connection on the inverted shape + out_inverted = out_inverted + attn_out + out_inverted = self.ln_attn(out_inverted) + + # Step 4: Pooling across the temporal "embeddings" + # We mean-pool the time dimension (dim=2) to get one vector per hidden feature + context = out_inverted.mean(dim=-1) # (B, hidden_size) + context = self.dropout(context) + + # Step 5: Final Portfolio Weights + logits = self.fc(context) + return torch.softmax(logits, dim=-1) + +@NNModelLibrary.register(category='lstm') +class LSTMTransformer(nn.Module): + """ + Hybrid Model: LSTM for local temporal features + Transformer for global attention. + """ + def __init__( + self, + input_size: int, # 251 features + hidden_size: int, # Embedding dimension + num_layers: int, # LSTM layers + num_stocks: int, # 50 stocks + attention_heads: int, + dropout: float, + expansion_factor: int, + max_seq_len: int + ): + super().__init__() + + # 1. Feature Projection (Initial step to clean up features) + self.feature_proj = nn.Linear(input_size, hidden_size) + + # 2. LSTM Layer (Local Temporal Smoothing) + self.lstm = nn.LSTM( + input_size=hidden_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0 + ) + + # 3. Position Encoding (Crucial for the Transformer part) + self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_size)) + nn.init.trunc_normal_(self.pos_embedding, std=0.02) + + # 4. Transformer Block (Global Context) + # Replacing simple Attention with a full Encoder Layer (includes FFN + Norms) + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=attention_heads, + dim_feedforward=hidden_size * expansion_factor, + dropout=dropout, + batch_first=True, + activation='gelu' + ) + self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1) + + # 5. 
Output Head + self.ln_final = nn.LayerNorm(hidden_size) + self.dropout = nn.Dropout(dropout) + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x: (B, T, 251) + + # Initial Projection + x = self.feature_proj(x) + + # Step 1: LSTM local processing + # This helps the Transformer 'see' the sequence as a flow + x, _ = self.lstm(x) # (B, T, H) + # x = torch.relu(x) + x = nn.functional.gelu(x) + x = self.dropout(x) + + # Step 2: Add Positional Information + x = x + self.pos_embedding[:, :x.size(1), :] + + # Step 3: Transformer Global Attention + # Every day now looks at every other day through the lens of the LSTM output + x = self.transformer(x) # (B, T, H) + + # Step 4: Pooling + # Mean pooling the context of the whole 120-day window + context = x.mean(dim=1) + context = self.ln_final(context) + context = self.dropout(context) + + # Step 5: Portfolio Allocation + logits = self.fc(context) # (B, N) + return torch.softmax(logits, dim=-1) + diff --git a/financial_loss_functions/src/models/transformer.py b/financial_loss_functions/src/models/transformer.py new file mode 100644 index 00000000..db7c0719 --- /dev/null +++ b/financial_loss_functions/src/models/transformer.py @@ -0,0 +1,179 @@ +import torch +# import numpy as np +import torch.nn as nn +from torch import Tensor +from src.models.registry import NNModelLibrary +from src.models.layers.TFT_vsn import ( + VariableSelectionNetwork, + GatedResidualNetwork +) + + +@NNModelLibrary.register(category='transformer') +class TemporalTransformer(nn.Module): + """ + SOTA-style Transformer for Portfolio Optimization. + Replaces LSTM with Learned Positional Encodings and Transformer Blocks. 
+ """ + def __init__( + self, + input_size: int, # Number of features (251) + hidden_size: int, # d_model + num_layers: int, # Number of Transformer blocks + num_stocks: int, # Output size + attention_heads: int, + dropout: float, + expansion_factor: int, + max_seq_len: int # Length of your lookback window + ): + super().__init__() + + # 1. Feature Projection: Projects 251 features to hidden_size + self.feature_projection = nn.Linear(input_size, hidden_size) + + # 2. Learned Positional Encoding + # Financial data is sequential; the model needs to know 'when' a bar happened + self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_size)) + + # 3. Transformer Encoder Blocks + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=attention_heads, + dim_feedforward=hidden_size * expansion_factor, + dropout=dropout, + batch_first=True, + activation='gelu' # GELU is standard for SOTA Transformers + ) + self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) + + # 4. Global LayerNorm + self.ln_final = nn.LayerNorm(hidden_size) + + # 5. 
Output Head + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x shape: (Batch, Time, Features) + + # Project features to embedding space + x = self.feature_projection(x) # (B, T, H) + + # Add Positional Encoding + x = x + self.pos_embedding[:, :x.size(1), :] + + # Pass through Transformer Blocks + # Self-attention allows every day in the window to look at every other day + out = self.transformer_encoder(x) # (B, T, H) + + # Mean pooling to average context of the whole window + # context = out[:, -1, :] # # Pooling: The last time step's representation makes model sensitive to last day + context = out.mean(dim=1) + context = self.ln_final(context) + + # Generate Portfolio Logits + logits = self.fc(context) # (B, N) + + # Softmax to ensure weights sum to 1 + return torch.softmax(logits, dim=-1) + + +@NNModelLibrary.register(category='transformer') +class TFT(nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int, + num_stocks: int, + attention_heads: int, + dropout: float, + expansion_factor: int, + max_seq_len: int + ): + super().__init__() + + # 1. Feature Selection Layer (VSN) + self.vsn = VariableSelectionNetwork(input_size, hidden_size, dropout) + + # 2. Position Encoding + self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_size)) + nn.init.trunc_normal_(self.pos_embedding, std=0.02) + + # 3. Temporal Self-Attention + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=attention_heads, + dim_feedforward=hidden_size * expansion_factor, + dropout=dropout, + batch_first=True, + activation='gelu' + ) + self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) + + # 4. 
Final Gating & Output + self.post_attention_grn = GatedResidualNetwork(hidden_size, hidden_size, hidden_size, dropout) + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x: (B, T, 251) + + # Filter 251 features down to hidden_size + x = self.vsn(x) + + # Add position context + x = x + self.pos_embedding[:, :x.size(1), :] + + # Temporal context (Attention) + x = self.transformer(x) + + # Regulate attention output with gating + x = self.post_attention_grn(x) + + # Mean Pooling for stability + context = x.mean(dim=1) + + logits = self.fc(context) + return torch.softmax(logits, dim=-1) + + +# @NNModelLibrary.register(category='transformer') +class PatchTST(nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + num_stocks: int, + patch_size: int, + stride: int, + seq_len: int + ): + super().__init__() + self.patch_size = patch_size + self.num_patches = seq_len // stride + + # 1. Patching Linear Layer + # Takes a patch of 5 days across 251 features and projects it + self.patch_projection = nn.Linear(input_size * patch_size, hidden_size) + + # 2. 
Standard Transformer Encoder + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_size, nhead=4, batch_first=True, dropout=0.2 + ) + self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=3) + + self.fc = nn.Linear(hidden_size, num_stocks) + + def forward(self, x: Tensor) -> Tensor: + # x: (Batch, 60, 251) + B, T, C = x.shape + + # Create patches: (Batch, Num_Patches, Patch_Size * Features) + x = x.unfold(1, self.patch_size, self.patch_size) # Extract patches + x = x.reshape(B, self.num_patches, -1) + + x = self.patch_projection(x) # (B, 12, hidden_size) + x = self.transformer(x) + + # Pooling: Use the context of the most recent patches + context = x.mean(dim=1) + return torch.softmax(self.fc(context), dim=-1) \ No newline at end of file diff --git a/financial_loss_functions/src/training/loss_functions.py b/financial_loss_functions/src/training/loss_functions.py index b500e29d..aaac86da 100644 --- a/financial_loss_functions/src/training/loss_functions.py +++ b/financial_loss_functions/src/training/loss_functions.py @@ -1244,7 +1244,7 @@ def custom_loss_7( @LossLibrary.register(category='custom') def custom_loss_8( - weights: Tensor, returns: Tensor, lambda1: float, lambda2: float, lambda3: float + weights: Tensor, returns: Tensor, log_ret_lambda: float, cvar_lambda: float, risk_p_lambda: float ) -> Tensor: """ loss = differentiable sharpe + lambda1 * log returns + lambda2 * smooth CVar + lambda3 * risk_parity @@ -1258,7 +1258,11 @@ def custom_loss_8( # print('Sharpe:', sharpe) # print('CVaR:', cvar) # print('RP:', risk_parity) - return sharpe + (lambda1 * log_returns) + (lambda2 * cvar) + (lambda3 * risk_parity) + loss = sharpe + \ + (log_ret_lambda * log_returns) + \ + (cvar_lambda * cvar) + \ + (risk_p_lambda * risk_parity) + return loss @LossLibrary.register(category='custom') def custom_loss_9( @@ -1274,4 +1278,24 @@ def custom_loss_9( # print('Sharpe:', sharpe) # print('CVaR:', cvar) # print('RP:', risk_parity) - return log_sortino + 
(lambda1 * cvar) + (lambda2 * risk_parity) \ No newline at end of file + return log_sortino + (lambda1 * cvar) + (lambda2 * risk_parity) + +@LossLibrary.register(category='custom') +def custom_loss_10( + weights: Tensor, returns: Tensor, cvar_lambda: float, risk_p_lambda: float +) -> Tensor: + """ + loss = differentiable sharpe + lambda1 * log returns + lambda2 * smooth CVar + lambda3 * risk_parity + """ + ### 2nd Best + sharpe = smooth_neglog_sharpe_loss(weights, returns) + cvar = smooth_rockafellar_cvar_regularizer(weights, returns) + risk_parity = risk_parity_regularizer(weights, returns) + + # print('Sharpe:', sharpe) + # print('CVaR:', cvar) + # print('RP:', risk_parity) + loss = sharpe + \ + (cvar_lambda * cvar) + \ + (risk_p_lambda * risk_parity) + return loss \ No newline at end of file diff --git a/financial_loss_functions/src/training/pipeline.py b/financial_loss_functions/src/training/pipeline.py index c7231e8c..e4eb2c1b 100644 --- a/financial_loss_functions/src/training/pipeline.py +++ b/financial_loss_functions/src/training/pipeline.py @@ -1,11 +1,13 @@ import time +import torch +import numpy as np import pandas as pd from torch import optim from pathlib import Path from src.utils.io import create_directory -from src.utils.device import get_best_device from src.evaluation.evaluator import Evaluator from src.data_processing.loading import load_csv_files +from src.utils.device import get_best_device, set_seed from src.data_processing.dataset import ( Reshaper, calc_in_out_idx, @@ -29,13 +31,12 @@ from src.models.registry import NNModelLibrary, TradModelLibrary # TODO: -# Implement early stopping -# Implement Learning Rate Scheduler # Add other NN models # Add Best model ranker # Unit test NCO -def _common_setup(paths_config): +def _common_setup(paths_config, seed_value: int): + set_seed(seed_value) # Global seed for reproducibility # Create plots directory if it doesnt exist plots_dir = (Path(paths_config['artifacts']['plots'])) 
create_directory(plots_dir) @@ -53,6 +54,31 @@ def _common_setup(paths_config): return plots_dir, results_dir, best_device +def _deformtime_device(best_device: torch.device | str) -> torch.device | str: + """ + Variables + • best_device + type: device name or device object + usage: used to store the preferred runtime device chosen by the project + before DeformTime-specific compatibility checks are applied + + This function helps in downgrading DeformTime to CPU when the selected device is + MPS and its unsupported backward operators would otherwise break training. + @author: Atharva Vaidya + """ + # Check whether the selected device is an MPS device object that DeformTime should avoid. + if isinstance(best_device, torch.device) and best_device.type == 'mps': + # Return CPU so DeformTime avoids unsupported MPS backward operations. + print('DeformTime uses CPU because its backward pass requires ops unsupported on MPS.') + return torch.device('cpu') + # Check whether the selected device is the string form of MPS from the surrounding runtime. + if isinstance(best_device, str) and best_device == 'mps': + # Return CPU in string form so the Trainer receives a safe runtime device. + print('DeformTime uses CPU because its backward pass requires ops unsupported on MPS.') + return 'cpu' + # Keep the originally selected device when no DeformTime-specific MPS workaround is needed. 
+ return best_device + def _load_processed_data(paths_config: dict) -> tuple: processed_files = { @@ -142,7 +168,7 @@ def _print_evaludation_info(in_win_date_cols, out_win_date_cols, **kwargs): for metric, df in kwargs.items(): # Cleaning up the metric name title = metric.replace('_', ' ').upper() - print(f'\n{title} for each window:\n', df) + print(f'\n{title} summary for each window:\n', df) def run_training_pipeline( paths_config: dict, @@ -166,7 +192,9 @@ def run_training_pipeline( print('\n', '=' * 40, ' Training Grid Pipeline ', '=' * 40) start_time = time.time() - plots_dir, results_dir, best_device = _common_setup(paths_config) + plots_dir, results_dir, best_device = _common_setup( + paths_config, hparams_config['seed'] + ) # -------------------- Loading Processed Data -------------------- # train_data, returns_train, val_data, returns_val = _load_processed_data(paths_config) @@ -268,16 +296,14 @@ def run_training_pipeline( # Adding s&p500 returns to the evaluator as a benchmark evaluator.add_benchmark_rets('S&P500', sp500_rets_winds) - plot_windowed_comparison( - evaluator.get_all_daily_returns(), - out_win_date_cols, - plots_dir / (f'Daily Returns' + '.png') - ) + # plot_windowed_comparison( + # evaluator.get_all_daily_returns(), + # out_win_date_cols, + # plots_dir / (f'Daily Returns' + '.png') + # ) total_returns = evaluator.calc_total_performance('returns') - total_returns.to_csv(results_dir / 'total_returns.csv', sep=',') total_sharpes = evaluator.calc_total_performance('sharpe') - total_sharpes.to_csv(results_dir / 'total_sharpes.csv', sep=',') plot_models_comparison( total_sharpes, @@ -285,6 +311,11 @@ def run_training_pipeline( plots_dir / f'Sharpe Comprison.png' ) + total_returns = total_returns.describe().T + total_returns.to_csv(results_dir / 'total_returns.csv', sep=',') + total_sharpes = total_sharpes.describe().T + total_sharpes.to_csv(results_dir / 'total_sharpes.csv', sep=',') + _print_evaludation_info( in_win_date_cols, out_win_date_cols, 
@@ -319,7 +350,13 @@ def run_training_one_model( if loss_cat not in ['objectives', 'custom']: raise ValueError('Loss category must be `objectives` or `custom`.') - plots_dir, results_dir, best_device = _common_setup(paths_config) + plots_dir, results_dir, best_device = _common_setup( + paths_config, hparams_config['seed'] + ) + # @author: Atharva Vaidya - Apply the DeformTime-specific device workaround before trainer construction. + if model_name == 'DeformTime': + # Move DeformTime off MPS so unsupported backward operators do not stop training. + best_device = _deformtime_device(best_device) # -------------------- Model and loss search -------------------- # model_cls = NNModelLibrary.get(model_cat, model_name) @@ -348,18 +385,18 @@ def run_training_one_model( evaluator = Evaluator(y_val) # -------------------- Training Tradional Models -------------------- # - trad_grid = TradModelsTrainer(TradModelLibrary.items(), hparams_config) - trad_alloc_weights = trad_grid.train_all( - in_wind_idxs, - out_wind_idxs, - returns_train, - returns_val - ) - - for trad_model_name, alloc_weights in trad_alloc_weights.items(): - evaluator.calc_pf_daily_rets(alloc_weights, trad_model_name) + # trad_grid = TradModelsTrainer(TradModelLibrary.items(), hparams_config) + # trad_alloc_weights = trad_grid.train_all( + # in_wind_idxs, + # out_wind_idxs, + # returns_train, + # returns_val + # ) + + # for trad_model_name, alloc_weights in trad_alloc_weights.items(): + # evaluator.calc_pf_daily_rets(alloc_weights, trad_model_name) - del trad_grid + # del trad_grid # -------------------- Training Neural Network -------------------- # print('\n', '-'*10, f' Training {model_name}-{loss_name} ', '-'*10) @@ -373,6 +410,7 @@ def run_training_one_model( train_hparams=hparams_config['nn_models'][model_name]['train'], in_size=X_train.shape[2], num_stocks=y_train.shape[2], + scheduler_hparams=hparams_config['nn_models'][model_name]['scheduler'], loss_hparams=hparams_config['losses'].get(loss_name), 
device=best_device ) @@ -425,21 +463,15 @@ def run_training_one_model( # Adding s&p500 returns to the evaluator as a benchmark evaluator.add_benchmark_rets('S&P500', sp500_rets_winds) - plot_windowed_comparison( - evaluator.get_all_daily_returns(), - out_win_date_cols, - plots_dir / - (f'Daily Returns_{model_name}-{loss_name}' + '.png') - ) + # plot_windowed_comparison( + # evaluator.get_all_daily_returns(), + # out_win_date_cols, + # plots_dir / + # (f'Daily Returns_{model_name}-{loss_name}' + '.png') + # ) total_returns = evaluator.calc_total_performance('returns') - total_returns.to_csv( - results_dir / f'total_returns_{model_name}-{loss_name}.csv', sep=',' - ) total_sharpes = evaluator.calc_total_performance('sharpe') - total_sharpes.to_csv( - results_dir / f'total_sharpes_{model_name}-{loss_name}.csv', sep=',' - ) plot_models_comparison( total_sharpes, @@ -447,6 +479,15 @@ def run_training_one_model( plots_dir / f'Sharpe Comprison_{model_name}-{loss_name}.png' ) + total_returns = total_returns.describe().T + total_returns.to_csv( + results_dir / f'total_returns_{model_name}-{loss_name}.csv', sep=',' + ) + total_sharpes = total_sharpes.describe().T + total_sharpes.to_csv( + results_dir / f'total_sharpes_{model_name}-{loss_name}.csv', sep=',' + ) + _print_evaludation_info( in_win_date_cols, out_win_date_cols, @@ -461,4 +502,4 @@ def run_training_one_model( raise ValueError(f'Model {model_name} of {model_cat} not found.') else: - raise ValueError(f'Loss Function {loss_name} not found.') \ No newline at end of file + raise ValueError(f'Loss Function {loss_name} not found.') diff --git a/financial_loss_functions/src/training/train.py b/financial_loss_functions/src/training/train.py index 83421113..07db888f 100644 --- a/financial_loss_functions/src/training/train.py +++ b/financial_loss_functions/src/training/train.py @@ -1,11 +1,14 @@ import gc import os import time +import copy import torch import psutil import inspect import numpy as np import pandas as pd +from 
tqdm import tqdm +from src.utils.device import set_seed from torch.utils.data import DataLoader from src.data_processing.dataset import build_dataset from src.data_processing.preprocess_crsp import preprocessor2 @@ -15,7 +18,6 @@ if TYPE_CHECKING: from src.data_processing.dataset import WindowDataset - class Trainer: """ Class to train provided models with provided hyperparameters. @@ -31,6 +33,7 @@ def __init__( in_size: int, num_stocks: int, device: torch.device | str, + scheduler_hparams: dict[str, Any] | None = None, loss_hparams: dict[str, Any] | None = None ): """ @@ -68,6 +71,7 @@ def __init__( print('Model hyperparameters:\n', model_hparams) print('Optimizer hyperparameters:\n', optimizer_hparams) print('Training hyperparameters:\n', train_hparams) + print('Scheduler hyperparameters:\n', scheduler_hparams) print('Loss Function hyperparameters:\n', loss_hparams) # Initialize model with its specific hyperparameters @@ -82,6 +86,19 @@ def __init__( self.model.parameters(), **optimizer_hparams ) + + if scheduler_hparams: + # 2. 
Initialize Scheduler + self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + self.optimizer, + mode='min', # We want to minimize loss + **scheduler_hparams + ) + + self.lr_schedule = True + else: + self.lr_schedule = False + self.loss = loss self.train_hparams = train_hparams @@ -96,6 +113,11 @@ def __init__( self.avg_eval_loss = None self.eval_alloc_weights = [] + + # For Early Stopping + self.best_val_loss = float('inf') + self.best_model_state = None + self.patience_counter = 0 def train( self, train_ds: 'WindowDataset', val_ds: Optional['WindowDataset'] = None @@ -107,10 +129,16 @@ def train( Training data split converted to windowed dataset tensors """ start_time = time.time() + + # Pull hyperparameters with sensible defaults + patience = self.train_hparams.get('early_stop_patience', 20) + min_delta = self.train_hparams.get('early_stop_min_delta', 1e-3) + early_stopping = self.train_hparams.get('early_stopping', True) + train_loader = DataLoader( train_ds, batch_size=self.train_hparams['train_batch_size'], - shuffle=False + shuffle=True ) for epoch in range(self.train_hparams['epochs']): @@ -141,20 +169,47 @@ def train( epoch_avg_loss = total_loss_sum / total_samples self.train_losses.append(epoch_avg_loss) - epoch_losses_print = f'Epoch {epoch} | Train Loss: {epoch_avg_loss:.4f}' + status_msg = f'Epoch {epoch} | Train Loss: {epoch_avg_loss:.4f}' + self.avg_train_loss = epoch_avg_loss + # --- Validation & Early Stopping Logic --- if val_ds is not None: avg_val_loss = self.validate(val_ds) self.val_losses.append(avg_val_loss) - epoch_losses_print = epoch_losses_print + f' | Val Loss: {avg_val_loss:.4f}' + + # --- STEP THE SCHEDULER HERE --- + # It takes the current validation loss to decide if it should drop the LR + if self.lr_schedule: + self.scheduler.step(avg_val_loss) + + if early_stopping: + # Check for improvement + if avg_val_loss < (self.best_val_loss - min_delta): + self.best_val_loss = avg_val_loss + self.patience_counter = 0 + # Deep copy 
the weights so we can return to this point later + self.best_model_state = copy.deepcopy(self.model.state_dict()) + else: + self.patience_counter += 1 + + status_msg = status_msg + f' | Val Loss: {avg_val_loss:.4f}' + + if self.patience_counter >= patience: + print(f'\n--- Early Stopping Triggered at Epoch {epoch} ---') + # Load the "Best" weights back into the model + self.model.load_state_dict(self.best_model_state) + break + + else: + status_msg = status_msg + f' | Val Loss: {avg_val_loss:.4f}' - self.avg_train_loss = epoch_avg_loss + print(status_msg + f' | Time: {round(time.time() - epoch_start, 3)}s') + + if self.best_model_state is not None: + self.model.load_state_dict(self.best_model_state) - epoch_end = time.time() - epoch_time = round(epoch_end - epoch_start, 3) + print(f'Training Complete. Best Val Loss: {self.best_val_loss:.4f}') - print(epoch_losses_print + f' | Time Taken: {epoch_time}s') - end_time = time.time() time_taken = round(end_time - start_time, 3) print(f'Average Train Loss: {self.avg_train_loss:.4f}, Time Taken: {time_taken}s') @@ -319,6 +374,8 @@ def _train_eval_helper( ) -> np.ndarray: #### Hyperparamater searching can be done here #### + set_seed(self.hparams_config['seed']) # Per model seed for fair comparison + if self.enable_diagnostics: print(f'\n[Before training {model_name} with {loss_name}]') self._memory_diagnostics() @@ -338,8 +395,11 @@ def _train_eval_helper( ][model_name]['train'], in_size=X_train_shape[2], num_stocks=y_train_shape[2], + scheduler_hparams=self.hparams_config[ + self.models_hparams + ][model_name]['scheduler'], loss_hparams=self.hparams_config[self.losses_hparams].get(loss_name), - device=self.torch_device + device=self.torch_device if not model_name == 'DeformTime' else torch.device('cpu') ) trainer.train(train_ds, val_ds) trainer.evaluate(val_ds) @@ -726,7 +786,7 @@ def _train_one_model( # Get hyperparameters of the current model current_hparams = self.hparams_config[self.models_hparams].get(model_name) or 
{} - print('Model hyperparameters:\n', current_hparams) + # print('Model hyperparameters:\n', current_hparams) model_obj = model_class(**current_hparams) alloc_weights = model_obj.calculate_weights(**filtered_kwargs) @@ -746,7 +806,7 @@ def _process_train_1_ds(self, returns_is: pd.DataFrame): # Loop over every model for model_name, model_class in self.model_lib.items(): - print('\n', '-'*10, f' Training {model_name} ', '-'*10) + # print('\n', '-'*10, f' Training {model_name} ', '-'*10) self.all_alloc_weights.setdefault(model_name, []) @@ -789,12 +849,17 @@ def train_all( returns_val: pd.DataFrame, returns_test: pd.DataFrame | None = None ) -> dict[str, list[pd.Series | np.ndarray]]: - + + num_slices = len(in_sample_indexes) if returns_test is None: # To use Validation Set (Combines Train + in-sample Val) # Calculate indexes for in-sample and out-of-sample to match the neural networks # Loop over dataset slices - for i in range(len(in_sample_indexes)): # len(in-sample) = len(out-of-sample) + for i in tqdm( + range(num_slices), + desc=f'Training tradional models on {num_slices} slices', + unit='slice' + ): # len(in-sample) = len(out-of-sample) returns_is, _ = build_dataset( in_sample_indexes[i], out_sample_indexes[i], @@ -802,13 +867,15 @@ def train_all( returns_val ) - print(f'\nTraining all models on slice {i+1} of the data...') - self._process_train_1_ds(returns_is) else: # To use Test Set (Combines Train + Val + in-sample Test) - for i in range(len(in_sample_indexes)): + for i in tqdm( + range(num_slices), + desc=f'Training tradional models on {num_slices} slices', + unit='slice' + ): returns_is, _ = build_dataset( in_sample_indexes[i], out_sample_indexes[i], diff --git a/financial_loss_functions/src/utils/device.py b/financial_loss_functions/src/utils/device.py index 5cb3b6fc..aca11e33 100644 --- a/financial_loss_functions/src/utils/device.py +++ b/financial_loss_functions/src/utils/device.py @@ -1,4 +1,5 @@ import torch +import numpy as np def get_best_device() 
-> torch.device: if torch.backends.mps.is_available(): @@ -11,4 +12,30 @@ def get_best_device() -> torch.device: else: print('No GPU acceleration. Using CPU.') - return torch.device('cpu') \ No newline at end of file + return torch.device('cpu') + +import random + +def set_seed(seed=50): + # Basic Python and Numpy seeds + random.seed(seed) + np.random.seed(seed) + # os.environ['PYTHONHASHSEED'] = str(seed) + + # Basic PyTorch seed (covers CPU) + torch.manual_seed(seed) + + # NVIDIA CUDA Specifics + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # for multi-GPU + # These two ensure deterministic behavior but may slow down training slightly + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Apple Silicon (MPS) Specifics + if hasattr(torch, 'mps') and torch.backends.mps.is_available(): + torch.mps.manual_seed(seed) + # Note: MPS is still maturing; some operations might not be 100% deterministic yet + + print(f'Seeds set to {seed} across all available backends.') diff --git a/financial_loss_functions/src/visualization/plots.py b/financial_loss_functions/src/visualization/plots.py index ae4c466d..d044d64c 100644 --- a/financial_loss_functions/src/visualization/plots.py +++ b/financial_loss_functions/src/visualization/plots.py @@ -38,10 +38,10 @@ def train_val_losses_plot( bars = ax2.bar(window_labels, eval_losses) # Add value labels on top of the bars - for bar in bars: - height = bar.get_height() - ax2.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.4f}', ha='center', va='bottom' if height > 0 else 'top', fontsize=10) + # for bar in bars: + # height = bar.get_height() + # ax2.text(bar.get_x() + bar.get_width()/2., height, + # f'{height:.4f}', ha='center', va='bottom' if height > 0 else 'top', fontsize=10) ax2.set_ylabel('Loss') ax2.set_title(f'Evaluation Performance (N={num_windows} Windows)') @@ -148,7 +148,7 @@ def plot_models_comparison( ): 
eval_metrics.boxplot(figsize=(12, 6)) - plt.xticks(rotation=45) + plt.xticks(rotation=90) plt.title(title) plt.tight_layout() # Ensures labels aren't cut off