From 0895b9d934befa2972fd2bddf9de133cb0fc609d Mon Sep 17 00:00:00 2001
From: Alexander Zhong <alexanderzousky@gmail.com>
Date: Sun, 30 Mar 2025 21:17:21 -0700
Subject: [PATCH 1/3] feat(test_install_script): added logging and CPU fallback
 when GPU unavailable

---
 test_install.py | 199 ++++++++++++++++++++++++------------------------
 1 file changed, 99 insertions(+), 100 deletions(-)

diff --git a/test_install.py b/test_install.py
index 84a7f2c..0bb82e3 100644
--- a/test_install.py
+++ b/test_install.py
@@ -32,107 +32,106 @@ def main():
 
     # Check if CUDA is available
     cuda_available = torch.cuda.is_available()
-    logger.info(f"CUDA available: {cuda_available}")
+    
+    logger.info(f"CUDA available: {cuda_available}" if cuda_available else "No CUDA-enabled GPU is available -- running with CPU")
     #assert cuda_available, "Error with submitting the test script"
-
-    if cuda_available:
-        # Get CUDA device properties
-        device = torch.device("cuda:0")
-        logger.info(f"Using device: {torch.cuda.get_device_name(device)}")
-        
-        logger.info("Begin Test Run:")
-        
-        # Hyper parameters
-        num_epochs = 15
-        batch_size = 128
-        learning_rate = 0.001
-
-        # Change the working directory to explainn in scratch space.
-        #explainn_path = os.path.join(os.environ.get("SCRATCH_PATH"), "ExplaiNN")
-        #os.chdir(explainn_path)
-        
-        h5_path = "./data/test/tf_peaks_TEST_sparse_Remap.h5"
-        compressed_file = f"{h5_path}.gz"
-        if not os.path.exists(h5_path):
-            if os.path.exists(compressed_file):
-                logger.info(f"Compressed file {compressed_file} found. Decompressing...")
-                with gzip.open(compressed_file, 'rb') as f_in, open(h5_path, 'wb') as f_out:
-                    f_out.write(f_in.read())
-                logger.info(f"Decompression complete: {h5_path}")
-            else:
-                raise FileNotFoundError(f"Neither {h5_path} nor {compressed_file} was found.")
-
-        # Load data
-        dataloaders, target_labels, train_out = utils.tools.load_datas(h5_path,
-                                                            batch_size,
-                                                            0,
-                                                            True)
-        target_labels = [i.decode("utf-8") for i in target_labels]
-
-        # Model parameters
-        num_cnns = 100
-        input_length = 200
-        num_classes = len(target_labels)
-        filter_size = 19
-        
-        # Create model
-        model = networks.ExplaiNN(num_cnns, input_length, num_classes, filter_size).to(device)
-        criterion = nn.BCEWithLogitsLoss()
-        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-        
-        weights_folder = "./data/test/weights"
-        if not os.path.exists(weights_folder):
-            os.makedirs(weights_folder)
-
-        # Train model
-        model, train_error, test_error = train.train_explainn(dataloaders["train"],
-                                                            dataloaders["valid"],
-                                                            model,
-                                                            device,
-                                                            criterion,
-                                                            optimizer,
-                                                            num_epochs,
-                                                            weights_folder,
-                                                            name_ind="",
-                                                            verbose=True,
-                                                            trim_weights=False,
-                                                            checkpoint=0,
-                                                            patience=0)
-
-        # Plot loss
-        utils.tools.showPlot(train_error, test_error, "Loss trend", "Loss")
-
-        # Test model
-        model.load_state_dict(torch.load(f"{weights_folder}/{os.listdir(weights_folder)[0]}"))
-        labels_E, outputs_E = test.run_test(model, dataloaders["test"], device)
-
-        # Get metrics
-        pr_rec = average_precision_score(labels_E, outputs_E)
-        no_skill_probs = [0 for _ in range(len(labels_E[:, 0]))]
-        ns_fpr, ns_tpr, _ = metrics.roc_curve(labels_E[:, 0], no_skill_probs)
-
-        roc_aucs, raw_aucs, roc_prcs, raw_prcs = {}, {}, {}, {}
-        for i in range(len(target_labels)):
-            nn_fpr, nn_tpr, threshold = metrics.roc_curve(labels_E[:, i], outputs_E[:, i])
-            roc_auc_nn = metrics.auc(nn_fpr, nn_tpr)
-
-            precision_nn, recall_nn, thresholds = metrics.precision_recall_curve(labels_E[:, i], outputs_E[:, i])
-            pr_auc_nn = metrics.auc(recall_nn, precision_nn)
-
-            raw_aucs[target_labels[i]] = nn_fpr, nn_tpr
-            roc_aucs[target_labels[i]] = roc_auc_nn
-
-            raw_prcs[target_labels[i]] = recall_nn, precision_nn
-            roc_prcs[target_labels[i]] = pr_auc_nn
-
-        logger.info(roc_prcs)
-        logger.info(roc_aucs)
-        
-        logger.info("Testing Complete")
-        
-    else:
-        logger.info("No CUDA-enabled GPU is available -- running with CPU")
-        # TODO: Test with CPU
+    if not cuda_available:
+        logger.warning("Training on CPU may cause longer waiting time than expected. Estimate: ~30 minutes")
+
+    
+    # Get CUDA device properties
+    device = torch.device("cuda:0" if cuda_available else "cpu")
+    logger.info(f"Using device: {torch.cuda.get_device_name(device)}" if cuda_available else "Using device: CPU")
+    
+    logger.info("Begin Test Run:")
+    
+    # Hyper parameters
+    num_epochs = 15
+    batch_size = 128
+    learning_rate = 0.001
+
+    # Change the working directory to explainn in scratch space.
+    #explainn_path = os.path.join(os.environ.get("SCRATCH_PATH"), "ExplaiNN")
+    #os.chdir(explainn_path)
+    
+    h5_path = "./data/test/tf_peaks_TEST_sparse_Remap.h5"
+    compressed_file = f"{h5_path}.gz"
+    if not os.path.exists(h5_path):
+        if os.path.exists(compressed_file):
+            logger.info(f"Compressed file {compressed_file} found. Decompressing...")
+            with gzip.open(compressed_file, 'rb') as f_in, open(h5_path, 'wb') as f_out:
+                f_out.write(f_in.read())
+            logger.info(f"Decompression complete: {h5_path}")
+        else:
+            raise FileNotFoundError(f"Neither {h5_path} nor {compressed_file} was found.")
+
+    # Load data
+    dataloaders, target_labels, train_out = utils.tools.load_datas(h5_path,
+                                                        batch_size,
+                                                        0,
+                                                        True)
+    target_labels = [i.decode("utf-8") for i in target_labels]
+
+    # Model parameters
+    num_cnns = 100
+    input_length = 200
+    num_classes = len(target_labels)
+    filter_size = 19
+    
+    # Create model
+    model = networks.ExplaiNN(num_cnns, input_length, num_classes, filter_size).to(device)
+    criterion = nn.BCEWithLogitsLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+    
+    weights_folder = "./data/test/weights"
+    if not os.path.exists(weights_folder):
+        os.makedirs(weights_folder)
+
+    # Train model
+    model, train_error, test_error = train.train_explainn(dataloaders["train"],
+                                                        dataloaders["valid"],
+                                                        model,
+                                                        device,
+                                                        criterion,
+                                                        optimizer,
+                                                        num_epochs,
+                                                        weights_folder,
+                                                        name_ind="",
+                                                        verbose=True,
+                                                        trim_weights=False,
+                                                        checkpoint=0,
+                                                        patience=0)
+
+    # Plot loss
+    utils.tools.showPlot(train_error, test_error, "Loss trend", "Loss")
+
+    # Test model
+    model.load_state_dict(torch.load(f"{weights_folder}/{os.listdir(weights_folder)[0]}"))
+    labels_E, outputs_E = test.run_test(model, dataloaders["test"], device)
+
+    # Get metrics
+    pr_rec = average_precision_score(labels_E, outputs_E)
+    no_skill_probs = [0 for _ in range(len(labels_E[:, 0]))]
+    ns_fpr, ns_tpr, _ = metrics.roc_curve(labels_E[:, 0], no_skill_probs)
+
+    roc_aucs, raw_aucs, roc_prcs, raw_prcs = {}, {}, {}, {}
+    for i in range(len(target_labels)):
+        nn_fpr, nn_tpr, threshold = metrics.roc_curve(labels_E[:, i], outputs_E[:, i])
+        roc_auc_nn = metrics.auc(nn_fpr, nn_tpr)
+
+        precision_nn, recall_nn, thresholds = metrics.precision_recall_curve(labels_E[:, i], outputs_E[:, i])
+        pr_auc_nn = metrics.auc(recall_nn, precision_nn)
+
+        raw_aucs[target_labels[i]] = nn_fpr, nn_tpr
+        roc_aucs[target_labels[i]] = roc_auc_nn
+
+        raw_prcs[target_labels[i]] = recall_nn, precision_nn
+        roc_prcs[target_labels[i]] = pr_auc_nn
+
+    logger.info(roc_prcs)
+    logger.info(roc_aucs)
+    
+    logger.info("Testing Complete")
 
 
 if __name__=='__main__':

From e26208f0ccba293cca883c10797d3435c2fc7b39 Mon Sep 17 00:00:00 2001
From: Alexander Zhong <alexanderzousky@gmail.com>
Date: Sun, 30 Mar 2025 21:24:53 -0700
Subject: [PATCH 2/3] feat: config validation + output dir check

---
 run/run.py   | 72 +++++++++++++++++++++++++++-----------
 run/utils.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 148 insertions(+), 23 deletions(-)

diff --git a/run/run.py b/run/run.py
index f92b655..da47c44 100644
--- a/run/run.py
+++ b/run/run.py
@@ -2,6 +2,7 @@
 import os
 import click
 import json
+import logging
 
 from explainn.train.train import train_explainn
 from explainn.utils.tools import pearson_loss
@@ -12,11 +13,24 @@
 from train import run_train
 from test import test_model
 from interpret import interpret_results
-from utils import save_data_splits
+from utils import save_data_splits, validate_config
+
+
+# Setup logging
+logging.basicConfig(
+    format="{asctime} - {name} - {levelname} - {message}",
+    style="{",
+    datefmt="%Y-%m-%d %H:%M",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
 
 CONTEXT_SETTINGS = {
     "help_option_names": ["-h", "--help"],
 }
+
+
 @click.command(no_args_is_help=True, context_settings=CONTEXT_SETTINGS)
 @click.argument(
     "config_file",
@@ -24,22 +38,42 @@
 )
 def main(**args):
     # Read config file
-    # TODO: Validate the fields of the config file
     with open(args["config_file"]) as f:
         config = json.load(f)
 
-    # TODO: Check that output dir exists
-
+    # Validate the fields of the config file
+    try:
+        validate_config(config)
+    except (KeyError, TypeError, ValueError) as e:
+        raise SystemExit(f"Invalid config file: {e}") from e
+    logger.info("Config file validated.")
+
+    # Check that output dir exists
+    output_dir = config["data"]["output_dir"]
+    if not os.path.isdir(output_dir):
+        raise OSError(
+            f"The output directory: {output_dir} does not exist.\n"
+            f"Check the path relative to the current working directory: {os.getcwd()}"
+        )
 
     # TODO: Add preprocessing steps as arguments/config, eg. match-seqs-by-gc,
-    # subsample-seqs-by-gc, resize, etc. 
-
-
+    # subsample-seqs-by-gc, resize, etc.
+    # The "preprocessing" section is optional and not covered by
+    # validate_config, so read it defensively instead of indexing.
+    preprocessing = config.get("preprocessing", {})
+    if preprocessing.get("match_seqs_by_gc"):
+        pass  # TODO: perform match seqs by gc
+    if preprocessing.get("subsample_seqs_by_gc"):
+        pass  # TODO: perform subsample_seqs_by_gc
+    if preprocessing.get("resize"):
+        pass  # TODO: perform resize
+    
     # Preprocess the data
     # TODO: Add this as an argument/in config
     classes = combine_seq_files(config["data"]["input_files"])
     splits = json2explainn(classes)
-    save_data_splits(config["data"]["output_dir"],
+    save_data_splits(
+        config["data"]["output_dir"],
         splits[0],
         splits[1],
         splits[2],
@@ -48,7 +82,9 @@ def main(**args):
     # TODO: Update config file with output location? Where to store path to intermediates
 
     if config["options"]["store_intermediates"]:
-        handle = open(os.path.join(config["data"]["output_dir"], "combined_data.json"), "wt")
+        handle = open(
+            os.path.join(config["data"]["output_dir"], "combined_data.json"), "wt"
+        )
         json.dump(classes, handle, indent=4, sort_keys=True)
         handle.close()
 
@@ -64,30 +100,26 @@ def main(**args):
     # Finetune the model
     # TODO: Specify this with config/arguments
 
-
     # Further interpretation
     # TODO: Specify these with config/arguments
     # MEME to logos
     meme2logo(config)
 
     # MEME to scores
-    #meme2scores(config)
+    # meme2scores(config)
 
     # MEME to clusters
-    #meme2clusters(config)
+    # meme2clusters(config)
 
     # Tomtom
-    #tomtom(config)
+    # tomtom(config)
 
     # JASPAR to logos
-    #jaspar2logo(config)
+    # jaspar2logo(config)
 
     # PWM to scores
-    #pwm2scores(config)
-
-
-
+    # pwm2scores(config)
 
 
-if __name__=='__main__':
-    main()
\ No newline at end of file
+if __name__ == "__main__":
+    main()
diff --git a/run/utils.py b/run/utils.py
index 6ee6abd..b45dfc3 100644
--- a/run/utils.py
+++ b/run/utils.py
@@ -73,7 +73,7 @@ def name_path(suffix, output_dir="./", prefix=None):
         str: formatted filepath 
     """
     return os.path.join(output_dir, ".".join(filter(None, (prefix, suffix))))
-                    
+
 
 def get_file_handle(file_name, mode):
     """
@@ -190,7 +190,6 @@ def _dna_one_hot_many(seqs):
     return(np.array([dna_one_hot(str(seq)) for seq in seqs]))
 
 
-
 def get_data_loader(seqs, labels, batch_size=100, shuffle=False):
 
     # TensorDatasets
@@ -236,4 +235,98 @@ def shuffle_string(s, k=2, random_seed=1714):
     l = [s[i-k:i] for i in range(k, len(s)+k, k)]
     random.Random(random_seed).shuffle(l)
 
-    return "".join(l)
\ No newline at end of file
+    return "".join(l)
+
+
+def validate_config(config):
+    """Validating the fields of the config file against the expected structure
+    
+        Ensures that the config dictionary has all the required keys and right types. 
+        
+        Error: 
+        - Missing a field, will throw a ValueError 
+        - Wrong types, will throw a TypeError
+    """
+    
+    # required fields being validated 
+    required_fields = {
+        "data": {
+            "input_files": list,
+            "output_dir": str,
+            "prefix": str,
+            "rev_complement": bool,
+            "input_length": int,
+            "intermediates": {
+                "training_file": str,
+                "validation_file": str,
+                "test_file": str,
+            },
+        },
+        "cnn": {
+            "filter_size": int,
+            "num_fc": int,
+            "num_units": int,
+            "pool_size": int,
+            "pool_stride": int,
+        },
+        "training": {
+            "cpu_threads": int,
+            "batch_size": int,
+            "num_epochs": int,
+            "checkpoint": int,
+            "patience": int,
+            "trim_weights": bool,
+        },
+        "optimizer": {"criterion": str, "lr": float, "optimizer": str},
+        "interpretation": {
+            "model_file": str,
+            "cpu_threads": int,
+            "batch_size": int,
+            "num_well_pred_seqs": int,
+            "correlation": int,
+            "exact_match": bool,
+            "percentile_bottom": int,
+            "percentile_top": int,
+        },
+        "options": {"debugging": bool, "use_time": bool, "store_intermediates": bool},
+        "postprocess": {
+            "cpu_threads": int,
+            "target_file": str,
+            "tomtom": {
+                "dist": str,
+                "evalue": bool,
+                "min_overlap": int,
+                "motif_pseudo": float,
+                "threshold": float,
+            },
+        },
+    }
+    
+    for section, fields in required_fields.items():
+        if section not in config:
+            raise ValueError(f"Missing section in config -- {section}")
+        
+        
+        for key, expected in fields.items():
+            # Check presence before indexing so that a missing nested
+            # section (e.g. data.intermediates) raises the documented
+            # ValueError instead of a bare KeyError.
+            if key not in config[section]:
+                raise ValueError(f"Missing section value in config -- {section}.{key}")
+            value = config[section][key]
+
+            if isinstance(expected, dict):
+                if not isinstance(value, dict):
+                    raise TypeError(f"Incorrect type for {section}.{key} -- Intended type: dict")
+
+                for subsection, subtype in expected.items():
+                    if subsection not in value:
+                        raise ValueError(f"Missing subsection value in config -- {section}.{key}.{subsection}")
+
+                    if not isinstance(value[subsection], subtype):
+                        raise TypeError(f"Incorrect type for {section}.{key}.{subsection} -- Intended type: {subtype}")
+
+            elif not isinstance(value, expected):
+                raise TypeError(f"Incorrect type for {section}.{key} -- Intended type: {expected}")
+
+    return True

From 72da7ecf4da1269a6ebb95c2fa4a15f18e32d5c3 Mon Sep 17 00:00:00 2001
From: Alexander Zhong <alexanderzousky@gmail.com>
Date: Mon, 31 Mar 2025 23:29:46 -0700
Subject: [PATCH 3/3] feat: added and refactored constants.py + error handling
 in train.py

---
 run/constants.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++
 run/test.py      | 18 ++++++++++---
 run/train.py     | 43 +++++++++++++++++-------------
 run/utils.py     | 63 +++----------------------------------------
 4 files changed, 111 insertions(+), 82 deletions(-)
 create mode 100644 run/constants.py

diff --git a/run/constants.py b/run/constants.py
new file mode 100644
index 0000000..a589520
--- /dev/null
+++ b/run/constants.py
@@ -0,0 +1,69 @@
+import torch
+from torch import nn
+from explainn.utils.tools import pearson_loss
+
+CRITERIONS = {
+    "bcewithlogits": nn.BCEWithLogitsLoss(),
+    "crossentropy": nn.CrossEntropyLoss(),
+    "mse": nn.MSELoss(),
+    "pearson": pearson_loss,
+    "poissonnll": nn.PoissonNLLLoss(),
+}
+
+OPTIMIZERS = {
+    "adam": torch.optim.Adam,
+    "sgd": torch.optim.SGD
+}
+
+CONFIG_REQUIRED_FIELDS = {
+    "data": {
+        "input_files": list,
+        "output_dir": str,
+        "prefix": str,
+        "rev_complement": bool,
+        "input_length": int,
+        "intermediates": {
+            "training_file": str,
+            "validation_file": str,
+            "test_file": str,
+        },
+    },
+    "cnn": {
+        "filter_size": int,
+        "num_fc": int,
+        "num_units": int,
+        "pool_size": int,
+        "pool_stride": int,
+    },
+    "training": {
+        "cpu_threads": int,
+        "batch_size": int,
+        "num_epochs": int,
+        "checkpoint": int,
+        "patience": int,
+        "trim_weights": bool,
+    },
+    "optimizer": {"criterion": str, "lr": float, "optimizer": str},
+    "interpretation": {
+        "model_file": str,
+        "cpu_threads": int,
+        "batch_size": int,
+        "num_well_pred_seqs": int,
+        "correlation": int,
+        "exact_match": bool,
+        "percentile_bottom": int,
+        "percentile_top": int,
+    },
+    "options": {"debugging": bool, "use_time": bool, "store_intermediates": bool},
+    "postprocess": {
+        "cpu_threads": int,
+        "target_file": str,
+        "tomtom": {
+            "dist": str,
+            "evalue": bool,
+            "min_overlap": int,
+            "motif_pseudo": float,
+            "threshold": float,
+        },
+    },
+}
diff --git a/run/test.py b/run/test.py
index 0fde4a6..56d4b15 100644
--- a/run/test.py
+++ b/run/test.py
@@ -26,7 +26,7 @@
 from explainn.models.networks import ExplaiNN
 from explainn.interpretation.interpretation import get_explainn_predictions
 from run.utils import (get_file_handle, get_seqs_labels_ids, get_data_loader,
-                   get_device, data_split_names, get_criterion)
+                   get_device, data_split_names, get_criterion, validate_config)
 
 CONTEXT_SETTINGS = {
     "help_option_names": ["-h", "--help"],
@@ -40,11 +40,23 @@ def main(**args):
     """
     """
     # Read config file
-    # TODO: Validate the fields of the config file
     with open(args["config_file"]) as f:
         config = json.load(f)
         
-    # TODO: Check that output dir exists
+    # Validate the fields of the config file
+    try:
+        validate_config(config)
+    except (KeyError, TypeError, ValueError) as e:
+        raise SystemExit(f"Invalid config file: {e}") from e
+    logging.info("Config file validated.")
+
+    # Check that output dir exists
+    output_dir = config["data"]["output_dir"]
+    if not os.path.isdir(output_dir):
+        raise OSError(
+            f"The output directory: {output_dir} does not exist.\n"
+            f"Check the path relative to the current working directory: {os.getcwd()}"
+        )
     
     test_model(config)
 
diff --git a/run/train.py b/run/train.py
index 0fa7762..e57bf4e 100644
--- a/run/train.py
+++ b/run/train.py
@@ -1,11 +1,13 @@
 #!/usr/bin/env python
 
+import logging
 import os
 import sys
 import time
 import torch
 import click
 import json
+import constants
 
 import pandas as pd
 
@@ -15,7 +17,7 @@
 from explainn.train.train import train_explainn
 from explainn.models.networks import ExplaiNN
 from utils import (get_file_handle, get_seqs_labels_ids, get_data_loader,
-                   get_device, data_split_names, get_criterion)
+                   get_device, data_split_names, get_criterion, validate_config)
 
 CONTEXT_SETTINGS = {
     "help_option_names": ["-h", "--help"],
@@ -29,11 +31,23 @@ def main(**args):
     """
     """
     # Read config file
-    # TODO: Validate the fields of the config file
     with open(args["config_file"]) as f:
         config = json.load(f)
 
-    # TODO: Check that output dir exists
+    # Validate the fields of the config file
+    try:
+        validate_config(config)
+    except (KeyError, TypeError, ValueError) as e:
+        raise SystemExit(f"Invalid config file: {e}") from e
+    logging.info("Config file validated.")
+
+    # Check that output dir exists
+    output_dir = config["data"]["output_dir"]
+    if not os.path.isdir(output_dir):
+        raise OSError(
+            f"The output directory: {output_dir} does not exist.\n"
+            f"Check the path relative to the current working directory: {os.getcwd()}"
+        )
     
     run_train(config)
 
@@ -76,15 +90,11 @@ def run_train(config):
     try:
         criterion = get_criterion()[config["optimizer"]["criterion"].lower()]
     except KeyError:
-        # TODO: Create error for this instead of print statement
-        print("""Criterion not found, please select from the following list:
-        BCEWithLogits
-        CrossEntropy
-        MSE
-        Pearson
-        PoissonNLL
-        """)
-        return
+        raise ValueError(
+            f"Invalid criterion '{config['optimizer']['criterion']}'. "
+            f"Please choose one of: {', '.join(get_criterion())}"
+        ) from None
+    
 
     # Get model
     m = ExplaiNN(config["cnn"]["num_units"], config["data"]["input_length"], 
@@ -116,13 +126,8 @@ def run_train(config):
 def _get_optimizer(optimizer, parameters, lr=0.0005):
     """
     """
-    # TODO: Change this to a map
-    if optimizer.lower() == "adam":
-        return torch.optim.Adam(parameters, lr=lr)
-    elif optimizer.lower() == "sgd":
-        return torch.optim.SGD(parameters, lr=lr)
-
-
+    return constants.OPTIMIZERS[optimizer.lower()](parameters, lr=lr)
+    
 def _train(train_loader, test_loader, model, device, criterion, optimizer,
     num_epochs=100, output_dir="./", name_ind=None, verbose=False,
     trim_weights=False, checkpoint=0, patience=0):
diff --git a/run/utils.py b/run/utils.py
index b45dfc3..4121a53 100644
--- a/run/utils.py
+++ b/run/utils.py
@@ -3,6 +3,7 @@
 
 import click
 import gzip
+import constants
 from functools import partial
 import numpy as np
 import pandas as pd
@@ -93,15 +94,8 @@ def get_file_handle(file_name, mode):
 
 def get_criterion():
     """
-    TODO: Move to constants.py?
     """
-    return {
-        "bcewithlogits": nn.BCEWithLogitsLoss(),
-        "crossentropy": nn.CrossEntropyLoss(),
-        "mse": nn.MSELoss(),
-        "pearson": pearson_loss,
-        "poissonnll": nn.PoissonNLLLoss()
-    }
+    return constants.CRITERIONS
 
 def get_or_create_dirs(output_path, output_dir):
     """
@@ -249,58 +243,7 @@ def validate_config(config):
     """
     
     # required fields being validated 
-    required_fields = {
-        "data": {
-            "input_files": list,
-            "output_dir": str,
-            "prefix": str,
-            "rev_complement": bool,
-            "input_length": int,
-            "intermediates": {
-                "training_file": str,
-                "validation_file": str,
-                "test_file": str,
-            },
-        },
-        "cnn": {
-            "filter_size": int,
-            "num_fc": int,
-            "num_units": int,
-            "pool_size": int,
-            "pool_stride": int,
-        },
-        "training": {
-            "cpu_threads": int,
-            "batch_size": int,
-            "num_epochs": int,
-            "checkpoint": int,
-            "patience": int,
-            "trim_weights": bool,
-        },
-        "optimizer": {"criterion": str, "lr": float, "optimizer": str},
-        "interpretation": {
-            "model_file": str,
-            "cpu_threads": int,
-            "batch_size": int,
-            "num_well_pred_seqs": int,
-            "correlation": int,
-            "exact_match": bool,
-            "percentile_bottom": int,
-            "percentile_top": int,
-        },
-        "options": {"debugging": bool, "use_time": bool, "store_intermediates": bool},
-        "postprocess": {
-            "cpu_threads": int,
-            "target_file": str,
-            "tomtom": {
-                "dist": str,
-                "evalue": bool,
-                "min_overlap": int,
-                "motif_pseudo": float,
-                "threshold": float,
-            },
-        },
-    }
+    required_fields = constants.CONFIG_REQUIRED_FIELDS
     
     for section, fields in required_fields.items():
         if section not in config: