Commit 23fe9ff

Add SDPA and Flash Attention support for PatchTST model
- Add _supports_sdpa = True and _supports_flash_attn = True to PatchTSTPreTrainedModel
- The existing PatchTSTAttention class already uses ALL_ATTENTION_FUNCTIONS to select the attention implementation based on config._attn_implementation
- Fix test_modeling_patchtst.py _prepare_for_class for dynamic batch sizes
1 parent d08b98b commit 23fe9ff
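
For context on the second point: in recent transformers releases, attention modules pick their kernel from a registry keyed by config._attn_implementation. A minimal sketch of that dispatch pattern, assuming a current transformers version (the helper name is illustrative; this is not the exact PatchTST code):

```python
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

def pick_attention_interface(config, eager_attention_forward):
    # Default to the eager implementation; any other registered backend
    # ("sdpa", "flash_attention_2", ...) is looked up in the registry.
    attention_interface = eager_attention_forward
    if config._attn_implementation != "eager":
        attention_interface = ALL_ATTENTION_FUNCTIONS[config._attn_implementation]
    return attention_interface
```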

File tree

2 files changed (+11, -6)

src/transformers/models/patchtst/modeling_patchtst.py (3 additions, 1 deletion)

```diff
@@ -418,7 +418,7 @@ def __init__(self, config: PatchTSTConfig):
         super().__init__()

         self.channel_attention = config.channel_attention
-        # Multi-Head attention
+
         self.self_attn = PatchTSTAttention(
             embed_dim=config.d_model,
             num_heads=config.num_attention_heads,
@@ -555,6 +555,8 @@ class PatchTSTPreTrainedModel(PreTrainedModel):
     main_input_name = "past_values"
     input_modalities = ("time",)
     supports_gradient_checkpointing = False
+    _supports_flash_attn = True
+    _supports_sdpa = True

     @torch.no_grad()
     def _init_weights(self, module: nn.Module):
```
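
With the two flags in place, the PreTrainedModel loading path should stop rejecting SDPA and FlashAttention requests for PatchTST. A hedged usage sketch; the checkpoint id below is a placeholder, not a real repository:

```python
from transformers import PatchTSTModel

# Placeholder checkpoint id for illustration; substitute a real PatchTST checkpoint.
model = PatchTSTModel.from_pretrained(
    "your-org/your-patchtst-checkpoint",
    attn_implementation="sdpa",  # accepted now that _supports_sdpa = True
)
```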

tests/models/patchtst/test_modeling_patchtst.py (8 additions, 5 deletions)

```diff
@@ -184,20 +184,23 @@ def test_config(self):
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

+        # Get the actual batch size from the inputs (may differ from model_tester.batch_size in some tests)
+        batch_size = inputs_dict["past_values"].shape[0]
+
         # if PatchTSTForPretraining
         if model_class == PatchTSTForPretraining:
-            inputs_dict.pop("future_values")
+            inputs_dict.pop("future_values", None)
         # else if classification model:
         elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING):
             rng = random.Random(self.model_tester.seed)
-            labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_targets, rng=rng)
+            labels = ids_tensor([batch_size], self.model_tester.num_targets, rng=rng)
             inputs_dict["target_values"] = labels
-            inputs_dict.pop("future_values")
+            inputs_dict.pop("future_values", None)
         elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING):
             rng = random.Random(self.model_tester.seed)
-            target_values = floats_tensor([self.model_tester.batch_size, self.model_tester.num_targets], rng=rng)
+            target_values = floats_tensor([batch_size, self.model_tester.num_targets], rng=rng)
             inputs_dict["target_values"] = target_values
-            inputs_dict.pop("future_values")
+            inputs_dict.pop("future_values", None)
         return inputs_dict

     def test_save_load_strict(self):
```
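
Why the test fix reads the batch size off the tensor: some common-test mixins re-batch the prepared inputs (for example, slicing down to a single sample), so label tensors built from model_tester.batch_size can end up mismatched. The pop("future_values", None) change likewise tolerates input dicts that never contained that key. An illustrative sketch with made-up shapes:

```python
import torch

# Hypothetical shapes, for illustration only.
past_values = torch.randn(1, 512, 7)        # a test re-batched the inputs to batch size 1
batch_size = past_values.shape[0]           # read the real batch size off the tensor
target_values = torch.randn(batch_size, 2)  # labels now always line up with past_values
assert target_values.shape[0] == past_values.shape[0]
```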
