From ce0baf458ac0880f93d548133a8419fc1dc184d5 Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Wed, 12 Mar 2025 19:40:20 +0530 Subject: [PATCH 01/10] [ENH] Initial commit --- .../models/informer/__init__.py | 3 + .../models/informer/_informer.py | 46 +++++ .../models/informer/sub_modules.py | 162 ++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 pytorch_forecasting/models/informer/__init__.py create mode 100644 pytorch_forecasting/models/informer/_informer.py create mode 100644 pytorch_forecasting/models/informer/sub_modules.py diff --git a/pytorch_forecasting/models/informer/__init__.py b/pytorch_forecasting/models/informer/__init__.py new file mode 100644 index 000000000..7d85740f6 --- /dev/null +++ b/pytorch_forecasting/models/informer/__init__.py @@ -0,0 +1,3 @@ +""" +Informer Transformer for Long Sequence Time-Series Forecasting. +""" diff --git a/pytorch_forecasting/models/informer/_informer.py b/pytorch_forecasting/models/informer/_informer.py new file mode 100644 index 000000000..b38543ed9 --- /dev/null +++ b/pytorch_forecasting/models/informer/_informer.py @@ -0,0 +1,46 @@ +""" +Informer Transformer for Long Sequence Time-Series Forecasting. +""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import torch +from torch import nn + +from pytorch_forecasting.data import TimeSeriesDataSet +from pytorch_forecasting.data.encoders import NaNLabelEncoder +from pytorch_forecasting.metrics import MAE, MAPE, MASE, RMSE, SMAPE, MultiHorizonMetric +from pytorch_forecasting.models.base import BaseModel +from pytorch_forecasting.utils._dependencies import _check_matplotlib + + +class Informer(BaseModel): + def __init__( + self, + encoder_input: int, + decoder_input: int, + out_channels: int, + seq_len: int, + label_len: int, + out_len: int, + factor: int = 5, + d_model: int = 512, + n_heads: int = 8, + encoder_layers: Union[int, List[int]] = 3, + decoder_layers: int = 2, + d_ff: int = 512, + dropout: int = 0.0, + attn: str = "prob", + embed: str = "fixed", + freq: str = "h", + activation: str = "gelu", + output_attention: bool = False, + distil: bool = True, + mix: bool = True, + logging_metrics: Optional[nn.ModuleList] = None, + **kwargs, + ): + super().__init__() + if logging_metrics is None: + logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) diff --git a/pytorch_forecasting/models/informer/sub_modules.py b/pytorch_forecasting/models/informer/sub_modules.py new file mode 100644 index 000000000..8830f8997 --- /dev/null +++ b/pytorch_forecasting/models/informer/sub_modules.py @@ -0,0 +1,162 @@ +import math + +import torch +import torch.nn as nn + + +class ConvLayer(nn.Module): + def __init__(self, input_channel) -> None: + super(ConvLayer, self).__init__() + padding = 1 if torch.__version__ >= "1.5.0" else 2 + self.downConv = nn.Conv1d( + in_channels=input_channel, + out_channels=input_channel, + kernel_size=3, + padding=padding, + padding_mode="circular", + ) + self.norm = nn.BatchNorm1d(input_channel) + self.activation = nn.ELU() + self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + x = self.downConv(x.permute(0, 2, 1)) + x = self.norm(x) + x = self.activation(x) + x = self.maxPool(x) + x = x.transpose(1, 2) + return x + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
+ pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = ( + torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) + ).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer("pe", pe) + + def forward(self, x): + return self.pe[:, : x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= "1.5.0" else 2 + self.tokenConv = nn.Conv1d( + in_channels=c_in, + out_channels=d_model, + kernel_size=3, + padding=padding, + padding_mode="circular", + ) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode="fan_in", nonlinearity="leaky_relu" + ) + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = ( + torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) + ).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type="fixed", freq="h"): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == "fixed" else nn.Embedding + if freq == "t": + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + + minute_x = ( + self.minute_embed(x[:, :, 4]) if hasattr(self, "minute_embed") else 0.0 + ) + hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type="timeF", freq="h"): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {"h": 4, "t": 5, "s": 6, "m": 1, "a": 1, "w": 2, "d": 3, "b": 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type="fixed", freq="h", dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = ( + TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + if embed_type != "timeF" + else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + ) + + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = ( + self.value_embedding(x) + + self.position_embedding(x) + + self.temporal_embedding(x_mark) + ) + + return self.dropout(x) 
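The embedding stack added in this patch sums a value embedding (a circular-padded 1-D convolution over the input channels), a sinusoidal positional encoding, and a calendar embedding, then applies dropout. A minimal shape check using the modules from sub_modules.py follows; batch size, lengths, and feature values are illustrative only, not part of the patch.

import torch

from pytorch_forecasting.models.informer.sub_modules import DataEmbedding

B, L, c_in, d_model = 2, 24, 7, 512            # illustrative sizes
x = torch.randn(B, L, c_in)                    # values: [batch, time, channels]
x_mark = torch.stack(                          # calendar features: month, day, weekday, hour
    [
        torch.randint(1, 13, (B, L)),
        torch.randint(1, 32, (B, L)),
        torch.randint(0, 7, (B, L)),
        torch.randint(0, 24, (B, L)),
    ],
    dim=-1,
)
emb = DataEmbedding(c_in=c_in, d_model=d_model, embed_type="fixed", freq="h")
out = emb(x, x_mark)
assert out.shape == (B, L, d_model)

With freq="h" the temporal embedding reads the first four columns of x_mark as month, day of month, weekday and hour; a fifth minute column is only consumed when freq="t".
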
From 0ca09d45281110a91ef7bdaed0c677207f311129 Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Thu, 13 Mar 2025 22:53:13 +0530 Subject: [PATCH 02/10] [ENH] from_dataset implemented. incompleted --- .../models/informer/_informer.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pytorch_forecasting/models/informer/_informer.py b/pytorch_forecasting/models/informer/_informer.py index b38543ed9..36bf41324 100644 --- a/pytorch_forecasting/models/informer/_informer.py +++ b/pytorch_forecasting/models/informer/_informer.py @@ -31,11 +31,11 @@ def __init__( decoder_layers: int = 2, d_ff: int = 512, dropout: int = 0.0, - attn: str = "prob", embed: str = "fixed", freq: str = "h", activation: str = "gelu", output_attention: bool = False, + loss: MultiHorizonMetric = None, distil: bool = True, mix: bool = True, logging_metrics: Optional[nn.ModuleList] = None, @@ -44,3 +44,29 @@ def __init__( super().__init__() if logging_metrics is None: logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) + if loss is None: + loss = MAE() + + @classmethod + def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): + """ + Convenience function to create network from :py:class`~pytorch_forecasting.data.timeseries.TimeSeriesDataSet`. + + Args: + dataset (TimeSeriesDataSet): dataset where sole predictor is the target. + **kwargs: additional arguments to be passed to ``__init__`` method. + + Returns: + Informer + """ # noqa: E501 + new_kwargs = { + "prediction_length": dataset.max_prediction_length, + "context_length": dataset.max_encoder_length, + } + new_kwargs.update(kwargs) + + # create class and return + return super().from_dataset( + dataset, + **new_kwargs, + ) From 108a7debe258239d1c5840bceac2def4f3bd0e18 Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Thu, 13 Mar 2025 22:56:58 +0530 Subject: [PATCH 03/10] [ENH] functions from original implementation added --- .../models/informer/sub_modules.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/pytorch_forecasting/models/informer/sub_modules.py b/pytorch_forecasting/models/informer/sub_modules.py index 8830f8997..50405ee45 100644 --- a/pytorch_forecasting/models/informer/sub_modules.py +++ b/pytorch_forecasting/models/informer/sub_modules.py @@ -2,6 +2,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F class ConvLayer(nn.Module): @@ -28,6 +29,59 @@ def forward(self, x): return x +class DecoderLayer(nn.Module): + def __init__( + self, + self_attention, + cross_attention, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + ): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None): + x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0]) + x = self.norm1(x) + + x = x + self.dropout( + self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0] + ) + + y = x = self.norm2(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm3(x + y) + 
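The DecoderLayer above only assumes that its attention modules return an (output, attention) tuple, so it can be smoke-tested with torch's stock attention before the ProbSparse attention is wired in later in this series. A sketch with illustrative sizes, not part of the patch:

import torch
import torch.nn as nn

from pytorch_forecasting.models.informer.sub_modules import DecoderLayer

d_model, n_heads, B, L_dec, L_enc = 64, 4, 2, 12, 48
layer = DecoderLayer(
    self_attention=nn.MultiheadAttention(d_model, n_heads, batch_first=True),
    cross_attention=nn.MultiheadAttention(d_model, n_heads, batch_first=True),
    d_model=d_model,
)
x = torch.randn(B, L_dec, d_model)      # decoder tokens
cross = torch.randn(B, L_enc, d_model)  # encoder output
out = layer(x, cross)                   # self-attn -> cross-attn -> position-wise conv FFN
assert out.shape == (B, L_dec, d_model)
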
+ +class Decoder(nn.Module): + def __init__(self, layers, norm_layer=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + + def forward(self, x, cross, x_mask=None, cross_mask=None): + for layer in self.layers: + x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) + + if self.norm is not None: + x = self.norm(x) + + return x + + class PositionalEmbedding(nn.Module): def __init__(self, d_model, max_len=5000): super(PositionalEmbedding, self).__init__() From ee947343bbb8d70307ca8a65c3919cd4aab678cf Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Fri, 21 Mar 2025 20:26:40 +0530 Subject: [PATCH 04/10] [ENH] Adapting from thuml --- .../models/informer/_informer.py | 11 + .../models/informer/sub_modules.py | 388 ++++++++++++++---- 2 files changed, 312 insertions(+), 87 deletions(-) diff --git a/pytorch_forecasting/models/informer/_informer.py b/pytorch_forecasting/models/informer/_informer.py index 36bf41324..740ec467d 100644 --- a/pytorch_forecasting/models/informer/_informer.py +++ b/pytorch_forecasting/models/informer/_informer.py @@ -12,6 +12,16 @@ from pytorch_forecasting.data.encoders import NaNLabelEncoder from pytorch_forecasting.metrics import MAE, MAPE, MASE, RMSE, SMAPE, MultiHorizonMetric from pytorch_forecasting.models.base import BaseModel +from pytorch_forecasting.models.informer.sub_modules import ( + AttentionLayer, + ConvLayer, + DataEmbedding, + Decoder, + DecoderLayer, + Encoder, + EncoderLayer, + ProbAttention, +) from pytorch_forecasting.utils._dependencies import _check_matplotlib @@ -21,6 +31,7 @@ def __init__( encoder_input: int, decoder_input: int, out_channels: int, + task: str, seq_len: int, label_len: int, out_len: int, diff --git a/pytorch_forecasting/models/informer/sub_modules.py b/pytorch_forecasting/models/informer/sub_modules.py index 50405ee45..c162aedb0 100644 --- a/pytorch_forecasting/models/informer/sub_modules.py +++ b/pytorch_forecasting/models/informer/sub_modules.py @@ -1,31 +1,29 @@ import math +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -class ConvLayer(nn.Module): - def __init__(self, input_channel) -> None: - super(ConvLayer, self).__init__() - padding = 1 if torch.__version__ >= "1.5.0" else 2 - self.downConv = nn.Conv1d( - in_channels=input_channel, - out_channels=input_channel, - kernel_size=3, - padding=padding, - padding_mode="circular", - ) - self.norm = nn.BatchNorm1d(input_channel) - self.activation = nn.ELU() - self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) +class Decoder(nn.Module): + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection - def forward(self, x): - x = self.downConv(x.permute(0, 2, 1)) - x = self.norm(x) - x = self.activation(x) - x = self.maxPool(x) - x = x.transpose(1, 2) + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + for layer in self.layers: + x = layer( + x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta + ) + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) return x @@ -51,12 +49,16 @@ def __init__( self.dropout = nn.Dropout(dropout) self.activation = F.relu if activation == "relu" else F.gelu - def forward(self, x, cross, x_mask=None, cross_mask=None): - x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0]) + def forward(self, x, cross, 
x_mask=None, cross_mask=None, tau=None, delta=None): + x = x + self.dropout( + self.self_attention(x, x, x, attn_mask=x_mask, tau=tau, delta=None)[0] + ) x = self.norm1(x) x = x + self.dropout( - self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0] + self.cross_attention( + x, cross, cross, attn_mask=cross_mask, tau=tau, delta=delta + )[0] ) y = x = self.norm2(x) @@ -66,42 +68,268 @@ def forward(self, x, cross, x_mask=None, cross_mask=None): return self.norm3(x + y) -class Decoder(nn.Module): - def __init__(self, layers, norm_layer=None): - super(Decoder, self).__init__() - self.layers = nn.ModuleList(layers) +class EncoderLayer(nn.Module): + def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None, tau=None, delta=None): + new_x, attn = self.attention(x, x, x, attn_mask=attn_mask, tau=tau, delta=delta) + x = x + self.dropout(new_x) + + y = x = self.norm1(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm2(x + y), attn + + +class Encoder(nn.Module): + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = ( + nn.ModuleList(conv_layers) if conv_layers is not None else None + ) self.norm = norm_layer - def forward(self, x, cross, x_mask=None, cross_mask=None): - for layer in self.layers: - x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + if self.conv_layers is not None: + for i, (attn_layer, conv_layer) in enumerate( + zip(self.attn_layers, self.conv_layers) + ): + delta = delta if i == 0 else None + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x, tau=tau, delta=None) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) if self.norm is not None: x = self.norm(x) + return x, attns + + +class ConvLayer(nn.Module): + def __init__(self, c_in): + super(ConvLayer, self).__init__() + self.downConv = nn.Conv1d( + in_channels=c_in, + out_channels=c_in, + kernel_size=3, + padding=2, + padding_mode="circular", + ) + self.norm = nn.BatchNorm1d(c_in) + self.activation = nn.ELU() + self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + x = self.downConv(x.permute(0, 2, 1)) + x = self.norm(x) + x = self.activation(x) + x = self.maxPool(x) + x = x.transpose(1, 2) return x -class PositionalEmbedding(nn.Module): - def __init__(self, d_model, max_len=5000): - super(PositionalEmbedding, self).__init__() - # Compute the positional encodings once in log space. 
- pe = torch.zeros(max_len, d_model).float() - pe.require_grad = False +class ProbAttention(nn.Module): + def __init__( + self, + mask_flag=True, + factor=5, + scale=None, + attention_dropout=0.1, + output_attention=False, + ): + super(ProbAttention, self).__init__() + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q) + # Q [B, H, L, D] + B, H, L_K, E = K.shape + _, _, L_Q, _ = Q.shape + + # calculate the sampled Q_K + K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E) + # real U = U_part(factor*ln(L_k))*L_q + index_sample = torch.randint(L_K, (L_Q, sample_k)) + K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :] + Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze() + + # find the Top_k query with sparisty measurement + M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K) + M_top = M.topk(n_top, sorted=False)[1] + + # use the reduced Q to calculate Q_K + Q_reduce = Q[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, : + ] # factor*ln(L_q) + Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + B, H, L_V, D = V.shape + if not self.mask_flag: + # V_sum = V.sum(dim=-2) + V_sum = V.mean(dim=-2) + contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone() + else: # use mask + # requires that L_Q == L_V, i.e. for self-attention only + assert L_Q == L_V + contex = V.cumsum(dim=-2) + return contex + + def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): + B, H, L_V, D = V.shape + + if self.mask_flag: + attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device) + scores.masked_fill_(attn_mask.mask, -np.inf) + + attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores) + + context_in[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, : + ] = torch.matmul(attn, V).type_as(context_in) + if self.output_attention: + attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device) + attns[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, : + ] = attn + return context_in, attns + else: + return context_in, None + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L_Q, H, D = queries.shape + _, L_K, _, _ = keys.shape + + queries = queries.transpose(2, 1) + keys = keys.transpose(2, 1) + values = values.transpose(2, 1) + + U_part = self.factor * np.ceil(np.log(L_K)).astype("int").item() # c*ln(L_k) + u = self.factor * np.ceil(np.log(L_Q)).astype("int").item() # c*ln(L_q) + + U_part = U_part if U_part < L_K else L_K + u = u if u < L_Q else L_Q + + scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u) + + # add scale factor + scale = self.scale or 1.0 / math.sqrt(D) + if scale is not None: + scores_top = scores_top * scale + # get the context + context = self._get_initial_context(values, L_Q) + # update the context with selected top_k queries + context, attn = self._update_context( + context, values, scores_top, index, L_Q, attn_mask + ) - position = torch.arange(0, max_len).float().unsqueeze(1) - div_term = ( - torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) - ).exp() + return context.contiguous(), attn - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * 
div_term) - pe = pe.unsqueeze(0) - self.register_buffer("pe", pe) +class ProbMask: + def __init__(self, B, H, L, index, scores, device="cpu"): + _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1) + _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1]) + indicator = _mask_ex[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, : + ].to(device) + self._mask = indicator.view(scores.shape).to(device) + + @property + def mask(self): + return self._mask + + +class AttentionLayer(nn.Module): + def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None): + super(AttentionLayer, self).__init__() + + d_keys = d_keys or (d_model // n_heads) + d_values = d_values or (d_model // n_heads) + + self.inner_attention = attention + self.query_projection = nn.Linear(d_model, d_keys * n_heads) + self.key_projection = nn.Linear(d_model, d_keys * n_heads) + self.value_projection = nn.Linear(d_model, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, d_model) + self.n_heads = n_heads + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, _ = queries.shape + _, S, _ = keys.shape + H = self.n_heads + + queries = self.query_projection(queries).view(B, L, H, -1) + keys = self.key_projection(keys).view(B, S, H, -1) + values = self.value_projection(values).view(B, S, H, -1) + + out, attn = self.inner_attention( + queries, keys, values, attn_mask, tau=tau, delta=delta + ) + out = out.view(B, L, -1) + + return self.out_projection(out), attn + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type="fixed", freq="h", dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = ( + TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + if embed_type != "timeF" + else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + ) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + self.position_embedding(x) + else: + x = ( + self.value_embedding(x) + + self.temporal_embedding(x_mark) + + self.position_embedding(x) + ) + return self.dropout(x) + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type="timeF", freq="h"): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {"h": 4, "t": 5, "s": 6, "m": 1, "a": 1, "w": 2, "d": 3, "b": 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) def forward(self, x): - return self.pe[:, : x.size(1)] + return self.embed(x) class TokenEmbedding(nn.Module): @@ -114,6 +342,7 @@ def __init__(self, c_in, d_model): kernel_size=3, padding=padding, padding_mode="circular", + bias=False, ) for m in self.modules(): if isinstance(m, nn.Conv1d): @@ -126,26 +355,26 @@ def forward(self, x): return x -class FixedEmbedding(nn.Module): - def __init__(self, c_in, d_model): - super(FixedEmbedding, self).__init__() - - w = torch.zeros(c_in, d_model).float() - w.require_grad = False +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
+ pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False - position = torch.arange(0, c_in).float().unsqueeze(1) + position = torch.arange(0, max_len).float().unsqueeze(1) div_term = ( torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) ).exp() - w[:, 0::2] = torch.sin(position * div_term) - w[:, 1::2] = torch.cos(position * div_term) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) - self.emb = nn.Embedding(c_in, d_model) - self.emb.weight = nn.Parameter(w, requires_grad=False) + pe = pe.unsqueeze(0) + self.register_buffer("pe", pe) def forward(self, x): - return self.emb(x).detach() + return self.pe[:, : x.size(1)] class TemporalEmbedding(nn.Module): @@ -168,7 +397,6 @@ def __init__(self, d_model, embed_type="fixed", freq="h"): def forward(self, x): x = x.long() - minute_x = ( self.minute_embed(x[:, :, 4]) if hasattr(self, "minute_embed") else 0.0 ) @@ -180,37 +408,23 @@ def forward(self, x): return hour_x + weekday_x + day_x + month_x + minute_x -class TimeFeatureEmbedding(nn.Module): - def __init__(self, d_model, embed_type="timeF", freq="h"): - super(TimeFeatureEmbedding, self).__init__() - - freq_map = {"h": 4, "t": 5, "s": 6, "m": 1, "a": 1, "w": 2, "d": 3, "b": 3} - d_inp = freq_map[freq] - self.embed = nn.Linear(d_inp, d_model) - - def forward(self, x): - return self.embed(x) - +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() -class DataEmbedding(nn.Module): - def __init__(self, c_in, d_model, embed_type="fixed", freq="h", dropout=0.1): - super(DataEmbedding, self).__init__() + w = torch.zeros(c_in, d_model).float() + w.require_grad = False - self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) - self.position_embedding = PositionalEmbedding(d_model=d_model) - self.temporal_embedding = ( - TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) - if embed_type != "timeF" - else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) - ) + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = ( + torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) + ).exp() - self.dropout = nn.Dropout(p=dropout) + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) - def forward(self, x, x_mark): - x = ( - self.value_embedding(x) - + self.position_embedding(x) - + self.temporal_embedding(x_mark) - ) + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) - return self.dropout(x) + def forward(self, x): + return self.emb(x).detach() From 3665f9d43b5693f4570ebd00f354c8272253065d Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Sat, 22 Mar 2025 00:11:13 +0530 Subject: [PATCH 05/10] [ENH] Add forward function --- .../models/informer/_informer.py | 113 +++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/pytorch_forecasting/models/informer/_informer.py b/pytorch_forecasting/models/informer/_informer.py index 740ec467d..ba5a5813f 100644 --- a/pytorch_forecasting/models/informer/_informer.py +++ b/pytorch_forecasting/models/informer/_informer.py @@ -7,6 +7,7 @@ import numpy as np import torch from torch import nn +import torch.nn.functional as F from pytorch_forecasting.data import TimeSeriesDataSet from pytorch_forecasting.data.encoders import NaNLabelEncoder @@ -52,11 +53,85 @@ def __init__( logging_metrics: Optional[nn.ModuleList] = None, **kwargs, ): - super().__init__() if logging_metrics 
is None: logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) if loss is None: loss = MAE() + super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs) + self.enc_embedding = DataEmbedding( + self.encoder_input, self.d_model, self.embed, self.freq, self.dropout + ) + self.dec_embedding = DataEmbedding( + self.decoder_input, self.d_model, self.embed, self.freq, self.dropout + ) + self.encoder = Encoder( + [ + EncoderLayer( + AttentionLayer( + ProbAttention( + False, + self.factor, + attention_dropout=self.dropout, + output_attention=False, + ), + self.d_model, + self.n_heads, + ), + self.d_model, + self.d_ff, + dropout=self.dropout, + activation=self.activation, + ) + for l in range(self.encoder_layers) + ], + ( + [ConvLayer(self.d_model) for l in range(self.encoder_layers - 1)] + if self.distil and ("forecast" in self.task_name) + else None + ), + norm_layer=torch.nn.LayerNorm(self.d_model), + ) + self.decoder = Decoder( + [ + DecoderLayer( + AttentionLayer( + ProbAttention( + True, + self.factor, + attention_dropout=self.dropout, + output_attention=False, + ), + self.d_model, + self.n_heads, + ), + AttentionLayer( + ProbAttention( + False, + self.factor, + attention_dropout=self.dropout, + output_attention=False, + ), + self.d_model, + self.n_heads, + ), + self.d_model, + self.d_ff, + dropout=self.dropout, + activation=self.activation, + ) + for l in range(self.decoder_layers) + ], + norm_layer=torch.nn.LayerNorm(self.d_model), + projection=nn.Linear(self.d_model, self.out_channels, bias=True), + ) + if self.task_name == "imputation": + self.projection = nn.Linear(self.d_model, self.out_channels, bias=True) + if self.task_name == "anomaly_detection": + self.projection = nn.Linear(self.d_model, self.out_channels, bias=True) + if self.task_name == "classification": + self.act = F.gelu + self.dropout = nn.Dropout(self.dropout) + self.projection = nn.Linear(self.d_model * self.seq_len, self.num_class) @classmethod def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): @@ -81,3 +156,39 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): dataset, **new_kwargs, ) + + def long_forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + enc_out = self.enc_embedding(x_enc, x_mark_enc) + dec_out = self.dec_embedding(x_dec, x_mark_dec) + enc_out, attns = self.encoder(enc_out, attn_mask=None) + + dec_out = self.decoder(dec_out, enc_out, x_mask=None, cross_mask=None) + + return dec_out # [B, L, D] + + def short_forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + # Normalization + mean_enc = x_enc.mean(1, keepdim=True).detach() # B x 1 x E + x_enc = x_enc - mean_enc + std_enc = torch.sqrt( + torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5 + ).detach() # B x 1 x E + x_enc = x_enc / std_enc + + enc_out = self.enc_embedding(x_enc, x_mark_enc) + dec_out = self.dec_embedding(x_dec, x_mark_dec) + enc_out, attns = self.encoder(enc_out, attn_mask=None) + + dec_out = self.decoder(dec_out, enc_out, x_mask=None, cross_mask=None) + + dec_out = dec_out * std_enc + mean_enc + return dec_out # [B, L, D] + + def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None): + if self.task_name == "long_term_forecast": + dec_out = self.long_forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + return dec_out[:, -self.pred_len :, :] # [B, L, D] + if self.task_name == "short_term_forecast": + dec_out = self.short_forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + return dec_out[:, -self.pred_len :, :] # [B, L, D] + return None From 
b00d46e061058ba934d8c38aecb4d0f319857fad Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Sat, 5 Apr 2025 23:08:02 +0530 Subject: [PATCH 06/10] [ENH] Remove unwanted feature --- pytorch_forecasting/models/informer/_informer.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pytorch_forecasting/models/informer/_informer.py b/pytorch_forecasting/models/informer/_informer.py index ba5a5813f..298b6d099 100644 --- a/pytorch_forecasting/models/informer/_informer.py +++ b/pytorch_forecasting/models/informer/_informer.py @@ -57,6 +57,7 @@ def __init__( logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) if loss is None: loss = MAE() + self.save_hyperparameters() super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs) self.enc_embedding = DataEmbedding( self.encoder_input, self.d_model, self.embed, self.freq, self.dropout @@ -124,14 +125,6 @@ def __init__( norm_layer=torch.nn.LayerNorm(self.d_model), projection=nn.Linear(self.d_model, self.out_channels, bias=True), ) - if self.task_name == "imputation": - self.projection = nn.Linear(self.d_model, self.out_channels, bias=True) - if self.task_name == "anomaly_detection": - self.projection = nn.Linear(self.d_model, self.out_channels, bias=True) - if self.task_name == "classification": - self.act = F.gelu - self.dropout = nn.Dropout(self.dropout) - self.projection = nn.Linear(self.d_model * self.seq_len, self.num_class) @classmethod def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): From e55f7dda6e2a67df19089c20a6c20ef8c9f7d0bf Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Tue, 29 Apr 2025 17:01:27 +0530 Subject: [PATCH 07/10] [ENH] Add tests for informer --- pytorch_forecasting/models/__init__.py | 2 + .../models/informer/__init__.py | 4 + tests/test_models/test_informer.py | 103 ++++++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 tests/test_models/test_informer.py diff --git a/pytorch_forecasting/models/__init__.py b/pytorch_forecasting/models/__init__.py index 29aeb24f5..033a2e297 100644 --- a/pytorch_forecasting/models/__init__.py +++ b/pytorch_forecasting/models/__init__.py @@ -10,6 +10,7 @@ ) from pytorch_forecasting.models.baseline import Baseline from pytorch_forecasting.models.deepar import DeepAR +from pytorch_forecasting.models.informer import Informer from pytorch_forecasting.models.mlp import DecoderMLP from pytorch_forecasting.models.nbeats import NBeats from pytorch_forecasting.models.nhits import NHiTS @@ -37,4 +38,5 @@ "MultiEmbedding", "DecoderMLP", "TiDEModel", + "Informer", ] diff --git a/pytorch_forecasting/models/informer/__init__.py b/pytorch_forecasting/models/informer/__init__.py index 7d85740f6..655803f22 100644 --- a/pytorch_forecasting/models/informer/__init__.py +++ b/pytorch_forecasting/models/informer/__init__.py @@ -1,3 +1,7 @@ """ Informer Transformer for Long Sequence Time-Series Forecasting. 
""" + +from pytorch_forecasting.models.informer._informer import Informer + +__all__ = ["Informer"] diff --git a/tests/test_models/test_informer.py b/tests/test_models/test_informer.py new file mode 100644 index 000000000..771f1e909 --- /dev/null +++ b/tests/test_models/test_informer.py @@ -0,0 +1,103 @@ +import pickle +import shutil + +import lightning.pytorch as pl +from lightning.pytorch.callbacks import EarlyStopping +from lightning.pytorch.loggers import TensorBoardLogger +import pytest + +from pytorch_forecasting.models import Informer +from pytorch_forecasting.utils._dependencies import _get_installed_packages + + +def test_integration(dataloaders_fixed_window_without_covariates, tmp_path): + train_dataloader = dataloaders_fixed_window_without_covariates["train"] + val_dataloader = dataloaders_fixed_window_without_covariates["val"] + test_dataloader = dataloaders_fixed_window_without_covariates["test"] + + early_stop_callback = EarlyStopping( + monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min" + ) + + logger = TensorBoardLogger(tmp_path) + trainer = pl.Trainer( + max_epochs=2, + gradient_clip_val=0.1, + callbacks=[early_stop_callback], + enable_checkpointing=True, + default_root_dir=tmp_path, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + logger=logger, + ) + + net = Informer.from_dataset( + train_dataloader.dataset, + learning_rate=0.15, + log_gradient_flow=True, + widths=[4, 4, 4], + log_interval=1000, + backcast_loss_ratio=1.0, + ) + net.size() + try: + trainer.fit( + net, + train_dataloaders=train_dataloader, + val_dataloaders=val_dataloader, + ) + test_outputs = trainer.test(net, dataloaders=test_dataloader) + assert len(test_outputs) > 0 + # check loading + net = Informer.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + # check prediction + net.predict( + val_dataloader, + fast_dev_run=True, + return_index=True, + return_decoder_lengths=True, + ) + finally: + shutil.rmtree(tmp_path, ignore_errors=True) + + net.predict( + val_dataloader, + fast_dev_run=True, + return_index=True, + return_decoder_lengths=True, + ) + + +@pytest.fixture(scope="session") +def model(dataloaders_fixed_window_without_covariates): + dataset = dataloaders_fixed_window_without_covariates["train"].dataset + net = Informer.from_dataset( + dataset, + learning_rate=0.15, + log_gradient_flow=True, + widths=[4, 4, 4], + log_interval=1000, + backcast_loss_ratio=1.0, + ) + return net + + +def test_pickle(model): + pkl = pickle.dumps(model) + pickle.loads(pkl) # noqa: S301 + + +@pytest.mark.skipif( + "matplotlib" not in _get_installed_packages(), + reason="skip test if required package matplotlib not installed", +) +def test_interpretation(model, dataloaders_fixed_window_without_covariates): + raw_predictions = model.predict( + dataloaders_fixed_window_without_covariates["val"], + mode="raw", + return_x=True, + fast_dev_run=True, + ) + model.plot_interpretation(raw_predictions.x, raw_predictions.output, idx=0) From dc39133bf51754f84a41980bebbd47de53965699 Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Tue, 29 Apr 2025 17:10:06 +0530 Subject: [PATCH 08/10] [ENH] Correct parameters in tests --- tests/test_models/test_informer.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/test_models/test_informer.py b/tests/test_models/test_informer.py index 771f1e909..51b1d7e5f 100644 --- a/tests/test_models/test_informer.py +++ b/tests/test_models/test_informer.py @@ -35,10 +35,9 @@ def 
test_integration(dataloaders_fixed_window_without_covariates, tmp_path): net = Informer.from_dataset( train_dataloader.dataset, learning_rate=0.15, - log_gradient_flow=True, - widths=[4, 4, 4], - log_interval=1000, - backcast_loss_ratio=1.0, + seq_len=10, + factor=5, + n_heads=8, ) net.size() try: @@ -76,10 +75,9 @@ def model(dataloaders_fixed_window_without_covariates): net = Informer.from_dataset( dataset, learning_rate=0.15, - log_gradient_flow=True, - widths=[4, 4, 4], - log_interval=1000, - backcast_loss_ratio=1.0, + seq_len=10, + factor=5, + n_heads=8, ) return net From df10e4b92989b20154a931d0db37222c3fdbac9c Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Sat, 3 May 2025 12:04:10 +0530 Subject: [PATCH 09/10] [ENH] Add value for non-default parameters --- pytorch_forecasting/models/informer/_informer.py | 10 +++++----- tests/test_models/test_informer.py | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pytorch_forecasting/models/informer/_informer.py b/pytorch_forecasting/models/informer/_informer.py index 298b6d099..c62537c97 100644 --- a/pytorch_forecasting/models/informer/_informer.py +++ b/pytorch_forecasting/models/informer/_informer.py @@ -31,11 +31,11 @@ def __init__( self, encoder_input: int, decoder_input: int, - out_channels: int, - task: str, - seq_len: int, - label_len: int, - out_len: int, + out_channels: int = 3, + seq_len: int = 20, + label_len: int = 4, + out_len: int = 10, + task_name: str = "forecasting", factor: int = 5, d_model: int = 512, n_heads: int = 8, diff --git a/tests/test_models/test_informer.py b/tests/test_models/test_informer.py index 51b1d7e5f..9232d0564 100644 --- a/tests/test_models/test_informer.py +++ b/tests/test_models/test_informer.py @@ -34,6 +34,8 @@ def test_integration(dataloaders_fixed_window_without_covariates, tmp_path): net = Informer.from_dataset( train_dataloader.dataset, + encoder_input=15, + decoder_input=15, learning_rate=0.15, seq_len=10, factor=5, @@ -74,6 +76,8 @@ def model(dataloaders_fixed_window_without_covariates): dataset = dataloaders_fixed_window_without_covariates["train"].dataset net = Informer.from_dataset( dataset, + encoder_input=15, + decoder_input=15, learning_rate=0.15, seq_len=10, factor=5, From 76f13a33d2638406661de76d05a153ccfeb6ea12 Mon Sep 17 00:00:00 2001 From: Ankit-1204 Date: Sat, 3 May 2025 17:36:26 +0530 Subject: [PATCH 10/10] [ENH] change from_dataset updates --- pytorch_forecasting/models/informer/_informer.py | 8 ++++---- tests/test_models/test_informer.py | 6 ------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/pytorch_forecasting/models/informer/_informer.py b/pytorch_forecasting/models/informer/_informer.py index c62537c97..5bb31fc23 100644 --- a/pytorch_forecasting/models/informer/_informer.py +++ b/pytorch_forecasting/models/informer/_informer.py @@ -29,8 +29,8 @@ class Informer(BaseModel): def __init__( self, - encoder_input: int, - decoder_input: int, + encoder_input: int = 5, + decoder_input: int = 10, out_channels: int = 3, seq_len: int = 20, label_len: int = 4, @@ -139,8 +139,8 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): Informer """ # noqa: E501 new_kwargs = { - "prediction_length": dataset.max_prediction_length, - "context_length": dataset.max_encoder_length, + "seq_len": dataset.max_prediction_length, + "encoder_input": dataset.max_encoder_length, } new_kwargs.update(kwargs) diff --git a/tests/test_models/test_informer.py b/tests/test_models/test_informer.py index 9232d0564..e2e665575 100644 --- 
a/tests/test_models/test_informer.py +++ b/tests/test_models/test_informer.py @@ -34,10 +34,7 @@ def test_integration(dataloaders_fixed_window_without_covariates, tmp_path): net = Informer.from_dataset( train_dataloader.dataset, - encoder_input=15, - decoder_input=15, learning_rate=0.15, - seq_len=10, factor=5, n_heads=8, ) @@ -76,10 +73,7 @@ def model(dataloaders_fixed_window_without_covariates): dataset = dataloaders_fixed_window_without_covariates["train"].dataset net = Informer.from_dataset( dataset, - encoder_input=15, - decoder_input=15, learning_rate=0.15, - seq_len=10, factor=5, n_heads=8, )
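
End to end, the branch is exercised the way the new test module does it; a condensed sketch is below. The dataloaders are assumed to come from the existing dataloaders_fixed_window_without_covariates test fixture and are not constructed here.

import lightning.pytorch as pl

from pytorch_forecasting.models import Informer

# `dataloaders` is assumed to be the dict built by the
# dataloaders_fixed_window_without_covariates fixture in the test suite.
train_dataloader = dataloaders["train"]
val_dataloader = dataloaders["val"]

# from_dataset fills seq_len / encoder_input from the dataset's prediction and
# encoder lengths; the remaining hyperparameters mirror the test configuration.
net = Informer.from_dataset(
    train_dataloader.dataset,
    learning_rate=0.15,
    factor=5,
    n_heads=8,
)

trainer = pl.Trainer(
    max_epochs=2,
    gradient_clip_val=0.1,
    limit_train_batches=2,
    limit_val_batches=2,
)
trainer.fit(net, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)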