Changes from all commits
33 commits
3355698
Initial support for the Nanochat model and its evaluation benchmark (…
baochunli Oct 28, 2025
822a1cb
Added support for vendoring the external Nanochat repo as a git submo…
baochunli Oct 28, 2025
e039b2c
ruff check --fix & ruff format.
baochunli Oct 28, 2025
ac5beba
Added benchmark configuration ([evaluation]) support in config.py.
Jasmine-Yuting-Zhang Oct 29, 2025
4501781
Added test to verify that [evaluation] configuration is properly loaded.
Jasmine-Yuting-Zhang Oct 29, 2025
a8efbea
Fixed tensor contiguity issue in datasource.
Jasmine-Yuting-Zhang Oct 29, 2025
f0bc22d
Fixed KeyError: 'train_loss'.
Jasmine-Yuting-Zhang Oct 29, 2025
4810080
Fixed train_loss aggregation in FedAvg server to handle None values.
Jasmine-Yuting-Zhang Oct 30, 2025
eb736eb
Added evaluation configs for nanochat CORE metric.
Jasmine-Yuting-Zhang Oct 30, 2025
d9fe94a
Added automatic download of nanochat CORE evaluation bundle.
Jasmine-Yuting-Zhang Oct 30, 2025
6f34950
Using tokenizer's vocab_size to match between model and tokenizer.
Jasmine-Yuting-Zhang Oct 30, 2025
35f25eb
Added outputs for Nanochat CORE evaluation in FedAvg server.
Jasmine-Yuting-Zhang Oct 30, 2025
e4ae761
Added specific logging output for CORE benchmark metrics.
Jasmine-Yuting-Zhang Oct 31, 2025
432fe50
Typed the Nanochat datasource/optimizer plumbing and enforced valid C…
baochunli Nov 7, 2025
da04815
All nanochat tests now pass.
baochunli Nov 7, 2025
279d05e
Updated nanochat README with setup and troubleshooting notes.
Nov 8, 2025
af0bafa
Added configuration file for NanoChat Parquet mode.
Jasmine-Yuting-Zhang Nov 13, 2025
2b7cf3d
Formatted code with Ruff and applied autofixes.
Jasmine-Yuting-Zhang Nov 13, 2025
736b29e
Added two configuration files for PatchTSMixer model.
Jasmine-Yuting-Zhang Nov 13, 2025
1fe0e22
Added MSE metric output for time series models.
Jasmine-Yuting-Zhang Nov 13, 2025
205043d
Added GitHub dataset handling (ETT datasets) for PatchTSMixer model.
Jasmine-Yuting-Zhang Nov 14, 2025
12721f1
Added ETT datasource to the registry.
Jasmine-Yuting-Zhang Nov 14, 2025
73d19de
Added TimeSeriesDatasetWrapper support for time-series datasets in da…
Jasmine-Yuting-Zhang Nov 14, 2025
3bb745c
Added PatchTSMixer model support to HuggingFace model factory.
Jasmine-Yuting-Zhang Nov 14, 2025
03abe2f
Added timeseries_utils module with is_timeseries_model function.
Jasmine-Yuting-Zhang Nov 14, 2025
c30dafc
Added time-series support to the HuggingFace trainer.
Jasmine-Yuting-Zhang Nov 14, 2025
dc90468
Added documentation for time series model PatchTSMixer.
Jasmine-Yuting-Zhang Nov 14, 2025
ed13def
Added links to time series model in docs.
Jasmine-Yuting-Zhang Nov 14, 2025
7bc7f43
Revised dataset split to improve training performance.
Jasmine-Yuting-Zhang Nov 21, 2025
173095a
Added a larger PatchTSMixer config file with extended hyperparameters.
Jasmine-Yuting-Zhang Dec 1, 2025
5562465
Revised MSE evaluation logs for time series models.
Jasmine-Yuting-Zhang Dec 1, 2025
b9778ec
Used uv ruff format .
Jasmine-Yuting-Zhang Dec 1, 2025
20ab574
Refactored ETT data splitting and normalization for consistency with …
Jasmine-Yuting-Zhang Dec 2, 2025
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,3 +4,6 @@
[submodule "plato/models/t2tvit"]
path = plato/models/t2tvit
url = https://github.com/yitu-opensource/T2T-ViT
[submodule "external/nanochat"]
path = external/nanochat
url = https://github.com/karpathy/nanochat.git
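The entry above vendors Karpathy's nanochat repository under external/nanochat (fetched with `git submodule update --init external/nanochat`). The following is a hypothetical sketch of one way the vendored checkout could be made importable from Plato code; the helper and its location are assumptions, and the PR may wire this up differently.

```python
import sys
from pathlib import Path

# Hypothetical helper: place the vendored checkout on sys.path so that
# nanochat modules can be imported. The repository layout is assumed here.
REPO_ROOT = Path(__file__).resolve().parents[1]
NANOCHAT_DIR = REPO_ROOT / "external" / "nanochat"

if NANOCHAT_DIR.is_dir() and str(NANOCHAT_DIR) not in sys.path:
    sys.path.insert(0, str(NANOCHAT_DIR))
```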
8 changes: 2 additions & 6 deletions cleanup.py
@@ -146,18 +146,14 @@ def main() -> None:
             continue

         cleared = clean_directory(runtime_dir)
-        print(
-            f"Failed to delete {runtime_dir}; cleared {cleared} items instead."
-        )
+        print(f"Failed to delete {runtime_dir}; cleared {cleared} items instead.")
         fallback_dirs += 1
         fallback_items += cleared

     if runtime_total == 0:
         print("No runtime directories found.")
     else:
-        print(
-            f"Removed {runtime_removed} of {runtime_total} runtime directories."
-        )
+        print(f"Removed {runtime_removed} of {runtime_total} runtime directories.")
     if fallback_dirs:
         print(
             f"Cleared {fallback_items} items in "
53 changes: 53 additions & 0 deletions configs/Nanochat/parquet_micro.toml
@@ -0,0 +1,53 @@
[clients]
type = "simple"
total_clients = 10
per_round = 3
do_test = true

[server]
address = "127.0.0.1"
port = 8000
simulate_wall_time = false
checkpoint_path = "checkpoints/nanochat/parquet"
model_path = "models/nanochat/parquet"

[data]
datasource = "Nanochat"
sampler = "iid"
partition_size = 1
random_seed = 1
mode = "parquet"
max_train_batches = 16
max_val_batches = 1
tokenizer_threads = 2
tokenizer_batch_size = 32
device = "cuda"
vocab_size = 512
synthetic_seed = 123

[evaluation]
type = "nanochat_core"
# bundle_dir = "~/nanochat"
max_per_task = 16

[trainer]
type = "nanochat"
rounds = 10000
epochs = 5
batch_size = 1
model_name = "nanochat"
optimizer = "nanochat"

[algorithm]
type = "fedavg"

[parameters.model]
sequence_len = 256
vocab_size = 50304
n_layer = 4
n_head = 4
n_kv_head = 4
n_embd = 256

[results]
types = "round, elapsed_time, core_metric, train_loss"
54 changes: 54 additions & 0 deletions configs/Nanochat/synthetic_micro.toml
@@ -0,0 +1,54 @@
[clients]

type = "simple"
total_clients = 1
per_round = 1
do_test = false

[server]
address = "127.0.0.1"
port = 8000
simulate_wall_time = false
checkpoint_path = "checkpoints/nanochat/synthetic"
model_path = "models/nanochat/synthetic"

[data]
datasource = "Nanochat"
sampler = "iid"
partition_size = 1
random_seed = 1
mode = "synthetic"
max_train_batches = 4
max_val_batches = 1
tokenizer_threads = 2
tokenizer_batch_size = 64
device = "cpu"
vocab_size = 512
synthetic_seed = 123

[evaluation]
type = "nanochat_core"
# bundle_dir = "~/nanochat" # Optional, defaults to nanochat base dir or Plato's data directory
max_per_task = 16 # Optional, -1 means run all examples

[trainer]
type = "nanochat"
rounds = 1
epochs = 1
batch_size = 2
model_name = "nanochat"
optimizer = "nanochat"

[algorithm]
type = "fedavg"

[parameters.model]
sequence_len = 128
vocab_size = 512
n_layer = 2
n_head = 4
n_kv_head = 4
n_embd = 256

[results]
types = "round, elapsed_time, core_metric, train_loss"
67 changes: 67 additions & 0 deletions configs/TimeSeries/patchtsmixer_custom.toml
@@ -0,0 +1,67 @@
# Federated Learning with PatchTSMixer for Time Series Forecasting
# This configuration demonstrates using the IBM Granite PatchTSMixer model
# with time series data from HuggingFace datasets

[clients]
type = "simple"
total_clients = 1
per_round = 1
do_test = false

[server]
address = "127.0.0.1"
port = 8000
simulate_wall_time = false
checkpoint_path = "checkpoints/timeseries/patchtsmixer"
model_path = "models/timeseries/patchtsmixer"

[data]
# ETTh1: Electricity Transformer Temperature dataset (7 features)
datasource = "ETTh1"

partition_size = 100 # Number of training samples
sampler = "iid"
random_seed = 1

[trainer]
type = "HuggingFace"
rounds = 3
max_concurrency = 2
model_type = "huggingface"

# Train from scratch - simpler for testing
model_name = "custom_patchtsmixer"

# Task type: forecasting, classification, regression, or pretraining
task_type = "forecasting"

# PatchTSMixer specific parameters (smaller model for testing)
context_length = 64
prediction_length = 24
num_input_channels = 7 # ETTh1 has 7 features (HUFL, HULL, MUFL, MULL, LUFL, LULL, OT)
patch_length = 8
patch_stride = 8
d_model = 32 # Hidden dimension of the model; recommended to be a multiple of patch_length (e.g., 2-8x patch_length). Larger values mean a more complex model.
num_layers = 3 # Number of mixer layers. Recommended range is 3-15; larger values mean a more complex model.
expansion_factor = 2 # Expansion factor used inside the MLP. Recommended range is 2-5; larger values mean a more complex model.
dropout = 0.5
head_dropout = 0.7
mode = "common_channel"
gated_attn = true
scaling = "std"

# Training parameters
epochs = 2
batch_size = 8
optimizer = "Adam"

[algorithm]
type = "fedavg"

[parameters]
[parameters.optimizer]
lr = 0.001
weight_decay = 0.0

[results]
types = "round, elapsed_time, accuracy"
67 changes: 67 additions & 0 deletions configs/TimeSeries/patchtsmixer_large.toml
@@ -0,0 +1,67 @@
# Federated Learning with Large PatchTSMixer for Time Series Forecasting
# This configuration matches the PatchTSMixer paper parameters for ETTh1

[clients]
type = "simple"
total_clients = 1
per_round = 1
do_test = true # Enable testing to evaluate model on test set

[server]
address = "127.0.0.1"
port = 8000
simulate_wall_time = false
checkpoint_path = "checkpoints/timeseries/patchtsmixer"
model_path = "models/timeseries/patchtsmixer"

[data]
# ETTh1: Electricity Transformer Temperature dataset (7 features)
datasource = "ETTh1"

partition_size = 6960 # Full ETTh1 training set
sampler = "iid"
random_seed = 1

[trainer]
type = "HuggingFace"
rounds = 1000
max_concurrency = 10
model_type = "huggingface"
model_name = "custom_patchtsmixer"

# Task type: forecasting, classification, regression, or pretraining
task_type = "forecasting"

# PatchTSMixer specific parameters
context_length = 512 # Paper uses 512 context length
prediction_length = 96 # Standard benchmark (paper tests 96, 192, 336, 720)
num_input_channels = 7 # ETTh1 has 7 features (HUFL, HULL, MUFL, MULL, LUFL, LULL, OT)
patch_length = 16
patch_stride = 8

d_model = 128
num_layers = 8
expansion_factor = 2

dropout = 0.3 # Increase regularization to prevent overfitting
head_dropout = 0.3 # Increase regularization to prevent overfitting

# Model configuration
mode = "common_channel"
gated_attn = true
scaling = "std"

epochs = 100
batch_size = 64
optimizer = "Adam"

[algorithm]
type = "fedavg"

[parameters]
[parameters.optimizer]
lr = 0.0001
weight_decay = 0.001

[results]
types = "round, elapsed_time, mse"
68 changes: 68 additions & 0 deletions configs/TimeSeries/patchtsmixer_pretrained.toml
@@ -0,0 +1,68 @@
# Federated Learning with PatchTSMixer for Time Series Forecasting
# This configuration demonstrates using the IBM Granite PatchTSMixer model
# with time series data from HuggingFace datasets

[clients]
type = "simple"
total_clients = 1
per_round = 1
do_test = false

[server]
address = "127.0.0.1"
port = 8000
simulate_wall_time = false
checkpoint_path = "checkpoints/timeseries/patchtsmixer"
model_path = "models/timeseries/patchtsmixer"

[data]
# ETTh1: Electricity Transformer Temperature dataset (7 features)
datasource = "ETTh1"

partition_size = 100 # Number of training samples
sampler = "iid"
random_seed = 1

[trainer]
type = "HuggingFace"
rounds = 3
max_concurrency = 2
model_type = "huggingface"

# Use pre-trained IBM Granite model
# When loading a pre-trained model, some settings below must match the pretrained checkpoint
model_name = "ibm-granite/granite-timeseries-patchtsmixer"

# Task type: forecasting, classification, regression, or pretraining
task_type = "forecasting"

# PatchTSMixer specific parameters (matching pretrained model)
context_length = 512
prediction_length = 96
num_input_channels = 7
patch_length = 16
patch_stride = 8
d_model = 64
num_layers = 8
expansion_factor = 2 # Expansion factor to use inside MLP. Recommended range is 2-5. Larger value indicates more complex model.
dropout = 0.5
head_dropout = 0.7
mode = "common_channel"
gated_attn = true
scaling = "std"

# Training parameters
epochs = 2 # Reduced for testing
batch_size = 8 # Reduced for testing
optimizer = "Adam"

[algorithm]
type = "fedavg"

[parameters]
[parameters.optimizer]
lr = 0.001
weight_decay = 0.0

[results]
types = "round, elapsed_time, accuracy"
5 changes: 5 additions & 0 deletions docs/docs/examples/Getting Started.md
@@ -45,6 +45,11 @@ Plato supports both Linux with NVIDIA GPUs and macOS with M1/M2/M4/M4 GPUs. It w

- [Model Pruning Algorithms](algorithms/13.%20Model%20Pruning%20Algorithms.md)

- [Gradient Leakage Attacks and Defences](algorithms/14.%20Gradient%20Leakage%20Attacks%20and%20Defences.md)

- [Time Series Models](algorithms/15.%20Time%20Series%20Models.md)


## Case Studies

- [Federated LoRA Fine-Tuning](case-studies/1.%20LoRA.md)
15 changes: 15 additions & 0 deletions docs/docs/examples/algorithms/15. Time Series Models.md
@@ -0,0 +1,15 @@
### PatchTSMixer

PatchTSMixer is a lightweight time-series modeling approach based on the MLP-Mixer architecture. The model can be pretrained and subsequently used for various downstream tasks such as forecasting, classification and regression.

```bash
uv run python plato.py -c configs/TimeSeries/patchtsmixer_pretrained.toml
```

For custom model configurations without using pretrained weights:

```bash
uv run python plato.py -c configs/TimeSeries/patchtsmixer_custom.toml
```

**Reference:** V. Ekambaram, A. Jati, N. Nguyen, S. Sinthong, K. Kalagnanam. "[TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://dl.acm.org/doi/abs/10.1145/3580305.3599533)," in Proc. ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD), 2023. – [[Code available]](https://github.com/ibm-granite/granite-tsfm)
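As context for the configs above, an illustrative sketch (not the PR's TimeSeriesDatasetWrapper) of how an ETTh1 CSV could be windowed into the past_values/future_values pairs that PatchTSMixerForPrediction consumes; the column names follow the standard ETT files, and the function name is hypothetical.

```python
import pandas as pd
import torch

def ett_windows(csv_path: str, context_length: int = 512, prediction_length: int = 96):
    """Slide a window over the ETTh1 series to build forecasting samples."""
    frame = pd.read_csv(csv_path)
    features = ["HUFL", "HULL", "MUFL", "MULL", "LUFL", "LULL", "OT"]
    values = torch.tensor(frame[features].to_numpy(), dtype=torch.float32)

    samples = []
    last_start = len(values) - context_length - prediction_length
    for start in range(last_start + 1):
        past = values[start : start + context_length]
        future = values[start + context_length : start + context_length + prediction_length]
        samples.append({"past_values": past, "future_values": future})
    return samples
```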
1 change: 1 addition & 0 deletions docs/docs/index.md
@@ -32,6 +32,7 @@ Welcome to *Plato*, a software framework to facilitate scalable, reproducible, a
- **[Poisoning Detection](examples/algorithms/12.%20Poisoning%20Detection%20Algorithms.md)**
- **[Model Pruning](examples/algorithms/13.%20Model%20Pruning%20Algorithms.md)**
- **[Gradient Leakage Attacks and Defences](examples/algorithms/14.%20Gradient%20Leakage%20Attacks%20and%20Defences.md)**
- **[Time Series Models](examples/algorithms/15.%20Time%20Series%20Models.md)**

## Configuration Settings

1 change: 1 addition & 0 deletions docs/mkdocs.yml
@@ -66,6 +66,7 @@ nav:
- Poisoning Detection: examples/algorithms/12. Poisoning Detection Algorithms.md
- Model Pruning: examples/algorithms/13. Model Pruning Algorithms.md
- Gradient Leakage Attacks and Defences: examples/algorithms/14. Gradient Leakage Attacks and Defences.md
- Time Series Models: examples/algorithms/15. Time Series Models.md
- Case Studies:
- Federated LoRA Fine-Tuning: examples/case-studies/1. LoRA.md
- Composable Trainer API: examples/case-studies/2. Composable Trainer.md