From 2d376c5d9b71251097842d7777f9d14fee12d5bb Mon Sep 17 00:00:00 2001 From: Soheyl Massoudi Date: Thu, 5 Mar 2026 09:28:37 +0100 Subject: [PATCH 1/2] fix(reproducibility): add opt-in strict determinism across trainers --- README.md | 6 +++ engiopt/cgan_1d/cgan_1d.py | 17 +++++--- engiopt/cgan_2d/cgan_2d.py | 17 +++++--- engiopt/cgan_bezier/cgan_bezier.py | 18 +++++---- engiopt/cgan_cnn_2d/cgan_cnn_2d.py | 18 +++++---- engiopt/cgan_cnn_3d/cgan_cnn_3d.py | 17 +++++--- engiopt/cgan_vae/cgan_vae.py | 17 +++++--- engiopt/diffusion_1d/diffusion_1d.py | 17 +++++--- .../diffusion_2d_cond/diffusion_2d_cond.py | 17 +++++--- engiopt/gan_1d/gan_1d.py | 17 +++++--- engiopt/gan_2d/gan_2d.py | 17 +++++--- engiopt/gan_bezier/gan_bezier.py | 35 +++++++++++----- engiopt/gan_cnn_2d/gan_cnn_2d.py | 18 +++++---- engiopt/pixel_cnn_pp_2d/pixel_cnn_pp_2d.py | 16 +++++--- engiopt/reproducibility.py | 40 +++++++++++++++++++ engiopt/surrogate_model/mlp_tabular_only.py | 27 ++++++++----- engiopt/vqgan/vqgan.py | 17 +++++--- 17 files changed, 233 insertions(+), 98 deletions(-) create mode 100644 engiopt/reproducibility.py diff --git a/README.md b/README.md index 6b828b5..5fe96c3 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,12 @@ python engiopt/cgan_cnn_2d/cgan_cnn_2d.py --problem-id "beams2d" --track --wandb This will run a CGAN 2D using CNN model on the beams2d problem. `--track` will track the run on wandb, `--wandb-entity None` will use the default wandb entity, `--save-model` will save the model, `--n-epochs 200` will run for 200 epochs, and `--seed 1` will set the random seed. +For reproducible debugging runs, you can additionally enable strict deterministic mode: +``` +python engiopt/cgan_cnn_2d/cgan_cnn_2d.py --problem-id "beams2d" --seed 1 --strict-determinism +``` +This enables stricter PyTorch deterministic settings (deterministic algorithms, cuDNN autotuning disabled, TF32 disabled on CUDA); these settings stay off when the flag is omitted. Seeded DataLoader shuffling is applied to every run regardless of the flag.
+ You can always check the help for more options: ``` python engiopt/cgan_cnn_2d/cgan_cnn_2d.py -h diff --git a/engiopt/cgan_1d/cgan_1d.py b/engiopt/cgan_1d/cgan_1d.py index cc834da..211996e 100644 --- a/engiopt/cgan_1d/cgan_1d.py +++ b/engiopt/cgan_1d/cgan_1d.py @@ -8,7 +8,6 @@ from dataclasses import dataclass import os -import random import time from typing import TYPE_CHECKING @@ -21,9 +20,12 @@ from torchvision import transforms import tqdm import tyro +import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training from engiopt.transforms import flatten_dict_factory -import wandb if TYPE_CHECKING: from engibench.utils.problem import Problem @@ -64,6 +66,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -220,10 +225,9 @@ def prepare_data(problem: Problem, device: th.device) -> tuple[th.utils.data.Ten wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -240,6 +244,7 @@ def prepare_data(problem: Problem, device: th.device) -> tuple[th.utils.data.Ten training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Loss function diff --git a/engiopt/cgan_2d/cgan_2d.py b/engiopt/cgan_2d/cgan_2d.py index 7b14510..fb68207 100644 --- a/engiopt/cgan_2d/cgan_2d.py +++ b/engiopt/cgan_2d/cgan_2d.py @@ -8,7 +8,6 @@ from dataclasses import dataclass import os 
-import random import time from engibench.utils.all_problems import BUILTIN_PROBLEMS @@ -18,9 +17,12 @@ from torch import nn import tqdm import tyro - import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + @dataclass class Args: @@ -40,6 +42,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -147,10 +152,9 @@ def forward(self, design: th.Tensor, conds: th.Tensor) -> th.Tensor: wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -182,6 +186,7 @@ def forward(self, design: th.Tensor, conds: th.Tensor) -> th.Tensor: training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers diff --git a/engiopt/cgan_bezier/cgan_bezier.py b/engiopt/cgan_bezier/cgan_bezier.py index 0acba5e..053ad9f 100644 --- a/engiopt/cgan_bezier/cgan_bezier.py +++ b/engiopt/cgan_bezier/cgan_bezier.py @@ -8,7 +8,6 @@ from dataclasses import dataclass import os -import random import time from typing import TYPE_CHECKING @@ -19,9 +18,12 @@ from torch import nn import torch.nn.functional as f import tyro - import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + if TYPE_CHECKING: from collections.abc import Callable @@ -46,6 +48,9 @@ 
class Args: """Wandb entity name.""" seed: int = 6 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = True """Saves the model to disk.""" @@ -438,11 +443,9 @@ def denormalize(self, x: th.Tensor) -> th.Tensor: save_code=True, name=run_name, ) - - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -476,6 +479,7 @@ def denormalize(self, x: th.Tensor) -> th.Tensor: training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) discriminator = Discriminator( diff --git a/engiopt/cgan_cnn_2d/cgan_cnn_2d.py b/engiopt/cgan_cnn_2d/cgan_cnn_2d.py index 3c7a1f8..8da8c70 100644 --- a/engiopt/cgan_cnn_2d/cgan_cnn_2d.py +++ b/engiopt/cgan_cnn_2d/cgan_cnn_2d.py @@ -7,20 +7,21 @@ from dataclasses import dataclass import os -import random import time from engibench.utils.all_problems import BUILTIN_PROBLEMS import matplotlib.pyplot as plt -import numpy as np import torch as th from torch import nn from torchvision import transforms import tqdm import tyro - import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + @dataclass class Args: @@ -40,6 +41,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -243,10 +247,9 @@ def forward(self, x: th.Tensor, c: th.Tensor) -> th.Tensor: wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) 
# Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -277,6 +280,7 @@ def forward(self, x: th.Tensor, c: th.Tensor) -> th.Tensor: training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers diff --git a/engiopt/cgan_cnn_3d/cgan_cnn_3d.py b/engiopt/cgan_cnn_3d/cgan_cnn_3d.py index 451f464..9f94df9 100644 --- a/engiopt/cgan_cnn_3d/cgan_cnn_3d.py +++ b/engiopt/cgan_cnn_3d/cgan_cnn_3d.py @@ -9,7 +9,6 @@ from dataclasses import dataclass import os -import random import time from engibench.utils.all_problems import BUILTIN_PROBLEMS @@ -21,10 +20,13 @@ from torch.nn import functional import tqdm import tyro +import wandb from engiopt.metrics import dpp_diversity from engiopt.metrics import mmd -import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training @dataclass @@ -47,6 +49,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -368,10 +373,9 @@ def compute_gradient_penalty(discriminator, real_samples, fake_samples, conds, d wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images_3d", exist_ok=True) @@ -416,6 +420,7 @@ def 
compute_gradient_penalty(discriminator, real_samples, fake_samples, conds, d training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers diff --git a/engiopt/cgan_vae/cgan_vae.py b/engiopt/cgan_vae/cgan_vae.py index 8033366..70fabda 100644 --- a/engiopt/cgan_vae/cgan_vae.py +++ b/engiopt/cgan_vae/cgan_vae.py @@ -9,7 +9,6 @@ from dataclasses import dataclass import os -import random import time from engibench.utils.all_problems import BUILTIN_PROBLEMS @@ -21,10 +20,13 @@ from torch.nn import functional import tqdm import tyro +import wandb from engiopt.metrics import dpp_diversity from engiopt.metrics import mmd -import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training @dataclass @@ -47,6 +49,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -461,10 +466,9 @@ def compute_gradient_penalty(discriminator, real_samples, fake_samples, conds, d wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images_3d", exist_ok=True) @@ -517,6 +521,7 @@ def compute_gradient_penalty(discriminator, real_samples, fake_samples, conds, d training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers for all components diff --git a/engiopt/diffusion_1d/diffusion_1d.py b/engiopt/diffusion_1d/diffusion_1d.py index 
12cde36..7c6d1a4 100644 --- a/engiopt/diffusion_1d/diffusion_1d.py +++ b/engiopt/diffusion_1d/diffusion_1d.py @@ -7,7 +7,6 @@ from dataclasses import dataclass import os -import random import time from typing import TYPE_CHECKING @@ -21,9 +20,12 @@ from torchvision import transforms import tqdm import tyro +import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training from engiopt.transforms import flatten_dict_factory -import wandb if TYPE_CHECKING: from engibench.utils.problem import Problem @@ -100,6 +102,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -153,10 +158,9 @@ class Args: wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -188,6 +192,7 @@ class Args: training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers diff --git a/engiopt/diffusion_2d_cond/diffusion_2d_cond.py b/engiopt/diffusion_2d_cond/diffusion_2d_cond.py index 0b69619..5f071d7 100644 --- a/engiopt/diffusion_2d_cond/diffusion_2d_cond.py +++ b/engiopt/diffusion_2d_cond/diffusion_2d_cond.py @@ -5,7 +5,6 @@ from dataclasses import dataclass import math import os -import random import time from typing import Literal, TYPE_CHECKING @@ -17,9 +16,12 @@ from torch.nn import functional import tqdm import tyro - import wandb +from engiopt.reproducibility import 
enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + if TYPE_CHECKING: from collections.abc import Callable @@ -42,6 +44,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -248,10 +253,9 @@ def sample_timestep( wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -302,6 +306,7 @@ def sample_timestep( training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) num_timesteps = args.num_timesteps diff --git a/engiopt/gan_1d/gan_1d.py b/engiopt/gan_1d/gan_1d.py index ae60b68..c36067b 100644 --- a/engiopt/gan_1d/gan_1d.py +++ b/engiopt/gan_1d/gan_1d.py @@ -7,7 +7,6 @@ from dataclasses import dataclass import os -import random import time from typing import TYPE_CHECKING @@ -20,9 +19,12 @@ from torchvision import transforms import tqdm import tyro +import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training from engiopt.transforms import flatten_dict_factory -import wandb if TYPE_CHECKING: from engibench.utils.problem import Problem @@ -46,6 +48,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the 
model to disk.""" @@ -186,10 +191,9 @@ def prepare_data(problem: Problem, device: th.device) -> tuple[th.utils.data.Ten wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -217,6 +221,7 @@ def prepare_data(problem: Problem, device: th.device) -> tuple[th.utils.data.Ten training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers diff --git a/engiopt/gan_2d/gan_2d.py b/engiopt/gan_2d/gan_2d.py index 95b7746..8cfd31b 100644 --- a/engiopt/gan_2d/gan_2d.py +++ b/engiopt/gan_2d/gan_2d.py @@ -7,7 +7,6 @@ from dataclasses import dataclass import os -import random import time from engibench.utils.all_problems import BUILTIN_PROBLEMS @@ -17,9 +16,12 @@ from torch import nn import tqdm import tyro - import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + @dataclass class Args: @@ -39,6 +41,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -123,10 +128,9 @@ def forward(self, img: th.Tensor) -> th.Tensor: wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + 
enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -156,6 +160,7 @@ def forward(self, img: th.Tensor) -> th.Tensor: training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers diff --git a/engiopt/gan_bezier/gan_bezier.py b/engiopt/gan_bezier/gan_bezier.py index e72111b..941696b 100644 --- a/engiopt/gan_bezier/gan_bezier.py +++ b/engiopt/gan_bezier/gan_bezier.py @@ -7,7 +7,6 @@ from dataclasses import dataclass import os -import random import time from typing import TYPE_CHECKING @@ -19,9 +18,12 @@ from torch import nn import torch.nn.functional as f import tyro - import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + if TYPE_CHECKING: from collections.abc import Callable @@ -46,6 +48,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -414,7 +419,7 @@ def denormalize(self, x: th.Tensor) -> th.Tensor: return x * (self.max_val - self.min_val + self.eps) + self.min_val -def prepare_data(problem, batch_size, device): +def prepare_data(problem, batch_size, device, seed=None): """Prepares the dataset and normalizer for training.""" problem_dataset = problem.dataset.with_format("torch")["train"] design_scalar_keys = list(problem_dataset["optimal_design"][0].keys()) @@ -427,7 +432,19 @@ def prepare_data(problem, batch_size, device): *[problem_dataset[key][:] for key, _ in problem.conditions], ) - dataloader = th.utils.data.DataLoader(training_ds, batch_size=batch_size, shuffle=True) + if seed is None: + dataloader = th.utils.data.DataLoader( + training_ds, + batch_size=batch_size, + shuffle=True, + ) + else: + dataloader = th.utils.data.DataLoader( + 
training_ds, + batch_size=batch_size, + shuffle=True, + generator=make_dataloader_generator(seed), + ) design_scalars_min = training_ds.tensors[1].amin(dim=0).to(device) design_scalars_max = training_ds.tensors[1].amax(dim=0).to(device) @@ -451,11 +468,9 @@ def prepare_data(problem, batch_size, device): save_code=True, name=run_name, ) - - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) if not isinstance(problem.design_space, (spaces.Box, spaces.Dict)): raise ValueError("This algorithm only works with Box or Dict spaces.") @@ -468,7 +483,7 @@ def prepare_data(problem, batch_size, device): n_data_points = coords_space.shape[1] # Prepare data - dataloader, design_scalars_normalizer, design_scalar_keys = prepare_data(problem, args.batch_size, device) + dataloader, design_scalars_normalizer, design_scalar_keys = prepare_data(problem, args.batch_size, device, args.seed) generator = Generator( latent_dim=args.latent_dim, diff --git a/engiopt/gan_cnn_2d/gan_cnn_2d.py b/engiopt/gan_cnn_2d/gan_cnn_2d.py index 7a5cdbd..8f46f1b 100644 --- a/engiopt/gan_cnn_2d/gan_cnn_2d.py +++ b/engiopt/gan_cnn_2d/gan_cnn_2d.py @@ -7,20 +7,21 @@ from dataclasses import dataclass import os -import random import time from engibench.utils.all_problems import BUILTIN_PROBLEMS import matplotlib.pyplot as plt -import numpy as np import torch as th from torch import nn from torchvision import transforms import tqdm import tyro - import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + @dataclass class Args: @@ -40,6 +41,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for 
reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -178,10 +182,9 @@ def forward(self, x: th.Tensor) -> th.Tensor: wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -210,6 +213,7 @@ def forward(self, x: th.Tensor) -> th.Tensor: training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizers diff --git a/engiopt/pixel_cnn_pp_2d/pixel_cnn_pp_2d.py b/engiopt/pixel_cnn_pp_2d/pixel_cnn_pp_2d.py index 4fe6a46..c2a257f 100644 --- a/engiopt/pixel_cnn_pp_2d/pixel_cnn_pp_2d.py +++ b/engiopt/pixel_cnn_pp_2d/pixel_cnn_pp_2d.py @@ -11,7 +11,6 @@ from dataclasses import dataclass import os -import random import time import typing @@ -26,6 +25,10 @@ import tyro import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training + @dataclass class Args: @@ -45,6 +48,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = False """Saves the model to disk.""" @@ -708,10 +714,9 @@ def sample_from_discretized_mix_logistic(l: th.Tensor, nr_mix: int) -> th.Tensor wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args), save_code=True, name=run_name) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = seed_training(args.seed) + if 
args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images", exist_ok=True) @@ -748,6 +753,7 @@ def sample_from_discretized_mix_logistic(l: th.Tensor, nr_mix: int) -> th.Tensor training_ds, batch_size=args.batch_size, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # Optimizer diff --git a/engiopt/reproducibility.py b/engiopt/reproducibility.py new file mode 100644 index 0000000..aee5d4c --- /dev/null +++ b/engiopt/reproducibility.py @@ -0,0 +1,40 @@ +"""Utilities for reproducible training runs.""" + +from __future__ import annotations + +import os +import random + +import numpy as np +import torch + + +def seed_training(seed: int) -> np.random.Generator: + """Seed Python, NumPy, and PyTorch RNGs for training scripts.""" + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + rng = np.random.default_rng(seed) + np.random.seed(seed) # noqa: NPY002 + random.seed(seed) + torch.backends.cudnn.deterministic = True + return rng + + +def enable_strict_determinism(*, warn_only: bool = True) -> None: + """Enable stricter deterministic settings without changing model logic.""" + os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8") + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + if torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + torch.use_deterministic_algorithms(mode=True, warn_only=warn_only) + + +def make_dataloader_generator(seed: int) -> torch.Generator: + """Return a seeded generator for deterministic DataLoader shuffling.""" + generator = torch.Generator() + generator.manual_seed(seed) + return generator diff --git a/engiopt/surrogate_model/mlp_tabular_only.py b/engiopt/surrogate_model/mlp_tabular_only.py index affbc84..135da9a 100644 --- a/engiopt/surrogate_model/mlp_tabular_only.py +++ b/engiopt/surrogate_model/mlp_tabular_only.py @@ 
-9,7 +9,6 @@ from dataclasses import dataclass from dataclasses import field import os -import random import time from typing import Any, Literal @@ -21,15 +20,18 @@ import torch from torch.utils.data import DataLoader import tyro +import wandb from engiopt.args_utils import parse_list_from_single_item_list from engiopt.args_utils import parse_list_from_string +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training from engiopt.surrogate_model.model_pipeline import DataPreprocessor from engiopt.surrogate_model.model_pipeline import ModelPipeline from engiopt.surrogate_model.training_utils import get_device from engiopt.surrogate_model.training_utils import PlainTabularDataset from engiopt.surrogate_model.training_utils import train_one_model -import wandb @dataclass @@ -93,6 +95,7 @@ class Args: wandb_project: str = "engiopt" wandb_entity: str | None = None seed: int = 42 + strict_determinism: bool = False n_ensembles: int = 1 algo: str = os.path.basename(__file__)[: -len(".py")] save_model: bool = False @@ -182,12 +185,9 @@ def train_ensemble( seeds = [args.seed + i for i in range(args.n_ensembles)] for seed_i in seeds: print(f"=== Training model for seed={seed_i} ===") - torch.manual_seed(seed_i) - random.seed(seed_i) - torch.cuda.manual_seed(seed_i) - torch.cuda.manual_seed_all(seed_i) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False + seed_training(seed_i) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) model_i, best_val_loss_i = train_one_model(args, train_loader, val_loader, device=device) ensemble_models.append(model_i) @@ -264,6 +264,10 @@ def main(args: Args) -> float: # noqa: PLR0915 Returns: float: The best validation loss achieved by any model in the ensemble. 
""" + _ = seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) + problem = BUILTIN_PROBLEMS[args.problem_id]() problem.reset(seed=args.seed) @@ -315,7 +319,12 @@ def main(args: Args) -> float: # noqa: PLR0915 val_dataset = PlainTabularDataset(x_val_s, y_val_s) test_dataset = PlainTabularDataset(x_test_s, y_test_s) - train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) + train_loader = DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=True, + generator=make_dataloader_generator(args.seed), + ) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) diff --git a/engiopt/vqgan/vqgan.py b/engiopt/vqgan/vqgan.py index c34e27f..fade92d 100644 --- a/engiopt/vqgan/vqgan.py +++ b/engiopt/vqgan/vqgan.py @@ -25,7 +25,6 @@ from engibench.utils.all_problems import BUILTIN_PROBLEMS import matplotlib.pyplot as plt -import numpy as np import torch as th from torch import nn from torch.nn import functional as f @@ -33,6 +32,9 @@ import tyro import wandb +from engiopt.reproducibility import enable_strict_determinism +from engiopt.reproducibility import make_dataloader_generator +from engiopt.reproducibility import seed_training from engiopt.transforms import drop_constant from engiopt.transforms import normalize from engiopt.transforms import resize_to @@ -68,6 +70,9 @@ class Args: """Wandb entity name.""" seed: int = 1 """Random seed.""" + + strict_determinism: bool = False + """Enable strict deterministic operations for reproducibility debugging.""" save_model: bool = True """Saves the model to disk.""" @@ -643,10 +648,9 @@ def log_images(self, x: th.Tensor, c: th.Tensor, top_k: int | None = None) -> tu args = tyro.cli(Args) # Seeding - th.manual_seed(args.seed) - rng = np.random.default_rng(args.seed) - random.seed(args.seed) - th.backends.cudnn.deterministic = True + rng = 
seed_training(args.seed) + if args.strict_determinism: + enable_strict_determinism(warn_only=True) os.makedirs("images/vqgan", exist_ok=True) os.makedirs("images/transformer", exist_ok=True) @@ -703,16 +707,19 @@ def log_images(self, x: th.Tensor, c: th.Tensor, top_k: int | None = None) -> tu th_training_ds, batch_size=args.batch_size_cvqgan, shuffle=True, + generator=make_dataloader_generator(args.seed), ) dataloader_vqgan = th.utils.data.DataLoader( th_training_ds, batch_size=args.batch_size_vqgan, shuffle=True, + generator=make_dataloader_generator(args.seed), ) dataloader_transformer = th.utils.data.DataLoader( th_training_ds, batch_size=args.batch_size_transformer, shuffle=True, + generator=make_dataloader_generator(args.seed), ) # If early stopping enabled, create a validation dataloader From 21e760fc289726e37d072302eb1a5efd5b24a610 Mon Sep 17 00:00:00 2001 From: Soheyl Massoudi Date: Thu, 5 Mar 2026 16:20:06 +0100 Subject: [PATCH 2/2] refactor(gan_bezier): simplify deterministic dataloader generator wiring --- engiopt/gan_bezier/gan_bezier.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/engiopt/gan_bezier/gan_bezier.py b/engiopt/gan_bezier/gan_bezier.py index 941696b..85a45f3 100644 --- a/engiopt/gan_bezier/gan_bezier.py +++ b/engiopt/gan_bezier/gan_bezier.py @@ -432,19 +432,12 @@ def prepare_data(problem, batch_size, device, seed=None): *[problem_dataset[key][:] for key, _ in problem.conditions], ) - if seed is None: - dataloader = th.utils.data.DataLoader( - training_ds, - batch_size=batch_size, - shuffle=True, - ) - else: - dataloader = th.utils.data.DataLoader( - training_ds, - batch_size=batch_size, - shuffle=True, - generator=make_dataloader_generator(seed), - ) + dataloader = th.utils.data.DataLoader( + training_ds, + batch_size=batch_size, + shuffle=True, + generator=make_dataloader_generator(seed) if seed is not None else None, + ) design_scalars_min = training_ds.tensors[1].amin(dim=0).to(device) 
design_scalars_max = training_ds.tensors[1].amax(dim=0).to(device)