From 0895b9d934befa2972fd2bddf9de133cb0fc609d Mon Sep 17 00:00:00 2001 From: Alexander Zhong Date: Sun, 30 Mar 2025 21:17:21 -0700 Subject: [PATCH 1/3] feat(test_install_script): added logging and CPU fallback when GPU unavailable --- test_install.py | 199 ++++++++++++++++++++++++------------------------ 1 file changed, 99 insertions(+), 100 deletions(-) diff --git a/test_install.py b/test_install.py index 84a7f2c..0bb82e3 100644 --- a/test_install.py +++ b/test_install.py @@ -32,107 +32,106 @@ def main(): # Check if CUDA is available cuda_available = torch.cuda.is_available() - logger.info(f"CUDA available: {cuda_available}") + + logger.info(f"CUDA available: {cuda_available}" if cuda_available else "No CUDA-enabled GPU is available -- running with CPU") #assert cuda_available, "Error with submitting the test script" - - if cuda_available: - # Get CUDA device properties - device = torch.device("cuda:0") - logger.info(f"Using device: {torch.cuda.get_device_name(device)}") - - logger.info("Begin Test Run:") - - # Hyper parameters - num_epochs = 15 - batch_size = 128 - learning_rate = 0.001 - - # Change the working directory to explainn in scratch space. - #explainn_path = os.path.join(os.environ.get("SCRATCH_PATH"), "ExplaiNN") - #os.chdir(explainn_path) - - h5_path = "./data/test/tf_peaks_TEST_sparse_Remap.h5" - compressed_file = f"{h5_path}.gz" - if not os.path.exists(h5_path): - if os.path.exists(compressed_file): - logger.info(f"Compressed file {compressed_file} found. Decompressing...") - with gzip.open(compressed_file, 'rb') as f_in, open(h5_path, 'wb') as f_out: - f_out.write(f_in.read()) - logger.info(f"Decompression complete: {h5_path}") - else: - raise FileNotFoundError(f"Neither {h5_path} nor {compressed_file} was found.") - - # Load data - dataloaders, target_labels, train_out = utils.tools.load_datas(h5_path, - batch_size, - 0, - True) - target_labels = [i.decode("utf-8") for i in target_labels] - - # Model parameters - num_cnns = 100 - input_length = 200 - num_classes = len(target_labels) - filter_size = 19 - - # Create model - model = networks.ExplaiNN(num_cnns, input_length, num_classes, filter_size).to(device) - criterion = nn.BCEWithLogitsLoss() - optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - - weights_folder = "./data/test/weights" - if not os.path.exists(weights_folder): - os.makedirs(weights_folder) - - # Train model - model, train_error, test_error = train.train_explainn(dataloaders["train"], - dataloaders["valid"], - model, - device, - criterion, - optimizer, - num_epochs, - weights_folder, - name_ind="", - verbose=True, - trim_weights=False, - checkpoint=0, - patience=0) - - # Plot loss - utils.tools.showPlot(train_error, test_error, "Loss trend", "Loss") - - # Test model - model.load_state_dict(torch.load(f"{weights_folder}/{os.listdir(weights_folder)[0]}")) - labels_E, outputs_E = test.run_test(model, dataloaders["test"], device) - - # Get metrics - pr_rec = average_precision_score(labels_E, outputs_E) - no_skill_probs = [0 for _ in range(len(labels_E[:, 0]))] - ns_fpr, ns_tpr, _ = metrics.roc_curve(labels_E[:, 0], no_skill_probs) - - roc_aucs, raw_aucs, roc_prcs, raw_prcs = {}, {}, {}, {} - for i in range(len(target_labels)): - nn_fpr, nn_tpr, threshold = metrics.roc_curve(labels_E[:, i], outputs_E[:, i]) - roc_auc_nn = metrics.auc(nn_fpr, nn_tpr) - - precision_nn, recall_nn, thresholds = metrics.precision_recall_curve(labels_E[:, i], outputs_E[:, i]) - pr_auc_nn = metrics.auc(recall_nn, precision_nn) - - raw_aucs[target_labels[i]] = nn_fpr, nn_tpr - roc_aucs[target_labels[i]] = roc_auc_nn - - raw_prcs[target_labels[i]] = recall_nn, precision_nn - roc_prcs[target_labels[i]] = pr_auc_nn - - logger.info(roc_prcs) - logger.info(roc_aucs) - - logger.info("Testing Complete") - - else: - logger.info("No CUDA-enabled GPU is available -- running with CPU") - # TODO: Test with CPU + if not cuda_available: + logger.warning("Training on CPU may cause longer waiting time than expected. Estimate: ~30 minutes") + + + # Get CUDA device properties + device = torch.device("cuda:0" if cuda_available else "cpu") + logger.info(f"Using device: {torch.cuda.get_device_name(device)}" if cuda_available else "Using device: CPU") + + logger.info("Begin Test Run:") + + # Hyper parameters + num_epochs = 15 + batch_size = 128 + learning_rate = 0.001 + + # Change the working directory to explainn in scratch space. + #explainn_path = os.path.join(os.environ.get("SCRATCH_PATH"), "ExplaiNN") + #os.chdir(explainn_path) + + h5_path = "./data/test/tf_peaks_TEST_sparse_Remap.h5" + compressed_file = f"{h5_path}.gz" + if not os.path.exists(h5_path): + if os.path.exists(compressed_file): + logger.info(f"Compressed file {compressed_file} found. Decompressing...") + with gzip.open(compressed_file, 'rb') as f_in, open(h5_path, 'wb') as f_out: + f_out.write(f_in.read()) + logger.info(f"Decompression complete: {h5_path}") + else: + raise FileNotFoundError(f"Neither {h5_path} nor {compressed_file} was found.") + + # Load data + dataloaders, target_labels, train_out = utils.tools.load_datas(h5_path, + batch_size, + 0, + True) + target_labels = [i.decode("utf-8") for i in target_labels] + + # Model parameters + num_cnns = 100 + input_length = 200 + num_classes = len(target_labels) + filter_size = 19 + + # Create model + model = networks.ExplaiNN(num_cnns, input_length, num_classes, filter_size).to(device) + criterion = nn.BCEWithLogitsLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + weights_folder = "./data/test/weights" + if not os.path.exists(weights_folder): + os.makedirs(weights_folder) + + # Train model + model, train_error, test_error = train.train_explainn(dataloaders["train"], + dataloaders["valid"], + model, + device, + criterion, + optimizer, + num_epochs, + weights_folder, + name_ind="", + verbose=True, + trim_weights=False, + checkpoint=0, + patience=0) + + # Plot loss + utils.tools.showPlot(train_error, test_error, "Loss trend", "Loss") + + # Test model + model.load_state_dict(torch.load(f"{weights_folder}/{os.listdir(weights_folder)[0]}")) + labels_E, outputs_E = test.run_test(model, dataloaders["test"], device) + + # Get metrics + pr_rec = average_precision_score(labels_E, outputs_E) + no_skill_probs = [0 for _ in range(len(labels_E[:, 0]))] + ns_fpr, ns_tpr, _ = metrics.roc_curve(labels_E[:, 0], no_skill_probs) + + roc_aucs, raw_aucs, roc_prcs, raw_prcs = {}, {}, {}, {} + for i in range(len(target_labels)): + nn_fpr, nn_tpr, threshold = metrics.roc_curve(labels_E[:, i], outputs_E[:, i]) + roc_auc_nn = metrics.auc(nn_fpr, nn_tpr) + + precision_nn, recall_nn, thresholds = metrics.precision_recall_curve(labels_E[:, i], outputs_E[:, i]) + pr_auc_nn = metrics.auc(recall_nn, precision_nn) + + raw_aucs[target_labels[i]] = nn_fpr, nn_tpr + roc_aucs[target_labels[i]] = roc_auc_nn + + raw_prcs[target_labels[i]] = recall_nn, precision_nn + roc_prcs[target_labels[i]] = pr_auc_nn + + logger.info(roc_prcs) + logger.info(roc_aucs) + + logger.info("Testing Complete") if __name__=='__main__': From e26208f0ccba293cca883c10797d3435c2fc7b39 Mon Sep 17 00:00:00 2001 From: Alexander Zhong Date: Sun, 30 Mar 2025 21:24:53 -0700 Subject: [PATCH 2/3] feat: config validation + output dir check --- run/run.py | 72 +++++++++++++++++++++++++++----------- run/utils.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 148 insertions(+), 23 deletions(-) diff --git a/run/run.py b/run/run.py index f92b655..da47c44 100644 --- a/run/run.py +++ b/run/run.py @@ -2,6 +2,7 @@ import os import click import json +import logging from explainn.train.train import train_explainn from explainn.utils.tools import pearson_loss @@ -12,11 +13,24 @@ from train import run_train from test import test_model from interpret import interpret_results -from utils import save_data_splits +from utils import save_data_splits, validate_config + + +# Setup logging +logging.basicConfig( + format="{asctime} - {name} - {levelname} - {message}", + style="{", + datefmt="%Y-%m-%d %H:%M", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + CONTEXT_SETTINGS = { "help_option_names": ["-h", "--help"], } + + @click.command(no_args_is_help=True, context_settings=CONTEXT_SETTINGS) @click.argument( "config_file", @@ -24,22 +38,42 @@ ) def main(**args): # Read config file - # TODO: Validate the fields of the config file with open(args["config_file"]) as f: config = json.load(f) - # TODO: Check that output dir exists - + # Validate the fields of the config file + try: + validate_config(config) + logging.info("Config file validated.") + except Exception as e: + logging.error(str(e)) + + # Check that output dir exists + output_dir = config["data"]["output_dir"] + if not os.path.isdir(output_dir): + raise OSError( + f"The output directory: {output_dir} does not exist.\n" + f"Check the path relative to the current working directory: {os.getcwd()}" + ) # TODO: Add preprocessing steps as arguments/config, eg. match-seqs-by-gc, - # subsample-seqs-by-gc, resize, etc. - - + # subsample-seqs-by-gc, resize, etc. + if config["preprocessing"]["match_seqs_by_gc"]: + # TODO: perform match seqs by gc + pass + if config["preprocessing"]["subsample_seqs_by_gc"]: + # TODO: perform subsample_seqs_by_gc + pass + if config["preprocessing"]["resize"]: + # TODO Perform resize? + pass + # Preprocess the data # TODO: Add this as an argument/in config classes = combine_seq_files(config["data"]["input_files"]) splits = json2explainn(classes) - save_data_splits(config["data"]["output_dir"], + save_data_splits( + config["data"]["output_dir"], splits[0], splits[1], splits[2], @@ -48,7 +82,9 @@ def main(**args): # TODO: Update config file with output location? Where to store path to intermediates if config["options"]["store_intermediates"]: - handle = open(os.path.join(config["data"]["output_dir"], "combined_data.json"), "wt") + handle = open( + os.path.join(config["data"]["output_dir"], "combined_data.json"), "wt" + ) json.dump(classes, handle, indent=4, sort_keys=True) handle.close() @@ -64,30 +100,26 @@ def main(**args): # Finetune the model # TODO: Specify this with config/arguments - # Further interpretation # TODO: Specify these with config/arguments # MEME to logos meme2logo(config) # MEME to scores - #meme2scores(config) + # meme2scores(config) # MEME to clusters - #meme2clusters(config) + # meme2clusters(config) # Tomtom - #tomtom(config) + # tomtom(config) # JASPAR to logos - #jaspar2logo(config) + # jaspar2logo(config) # PWM to scores - #pwm2scores(config) - - - + # pwm2scores(config) -if __name__=='__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/run/utils.py b/run/utils.py index 6ee6abd..b45dfc3 100644 --- a/run/utils.py +++ b/run/utils.py @@ -73,7 +73,7 @@ def name_path(suffix, output_dir="./", prefix=None): str: formatted filepath """ return os.path.join(output_dir, ".".join(filter(None, (prefix, suffix)))) - + def get_file_handle(file_name, mode): """ @@ -190,7 +190,6 @@ def _dna_one_hot_many(seqs): return(np.array([dna_one_hot(str(seq)) for seq in seqs])) - def get_data_loader(seqs, labels, batch_size=100, shuffle=False): # TensorDatasets @@ -236,4 +235,98 @@ def shuffle_string(s, k=2, random_seed=1714): l = [s[i-k:i] for i in range(k, len(s)+k, k)] random.Random(random_seed).shuffle(l) - return "".join(l) \ No newline at end of file + return "".join(l) + + +def validate_config(config): + """Validating the fields of the config file against the expected structure + + Ensures that the config dictionary has all the required keys and right types. + + Error: + - Missing a field, will throw a ValueError + - Wrong types, will throw a TypeError + """ + + # required fields being validated + required_fields = { + "data": { + "input_files": list, + "output_dir": str, + "prefix": str, + "rev_complement": bool, + "input_length": int, + "intermediates": { + "training_file": str, + "validation_file": str, + "test_file": str, + }, + }, + "cnn": { + "filter_size": int, + "num_fc": int, + "num_units": int, + "pool_size": int, + "pool_stride": int, + }, + "training": { + "cpu_threads": int, + "batch_size": int, + "num_epochs": int, + "checkpoint": int, + "patience": int, + "trim_weights": bool, + }, + "optimizer": {"criterion": str, "lr": float, "optimizer": str}, + "interpretation": { + "model_file": str, + "cpu_threads": int, + "batch_size": int, + "num_well_pred_seqs": int, + "correlation": int, + "exact_match": bool, + "percentile_bottom": int, + "percentile_top": int, + }, + "options": {"debugging": bool, "use_time": bool, "store_intermediates": bool}, + "postprocess": { + "cpu_threads": int, + "target_file": str, + "tomtom": { + "dist": str, + "evalue": bool, + "min_overlap": int, + "motif_pseudo": float, + "threshold": float, + }, + }, + } + + for section, fields in required_fields.items(): + if section not in config: + raise ValueError(f"Missing section in config -- {section}") + + + for key, type in fields.items(): + + + if isinstance(type, dict): + if not isinstance(config[section][key], dict): + raise TypeError(f"Incorrect type for {section}.{key} -- Intended type: dict") + + for subsection, subtype in type.items(): + if subsection not in config[section][key]: + raise ValueError(f"Missing subsection value in config -- {section}.{key}.{subsection}") + + if not isinstance(config[section][key][subsection], subtype): + raise TypeError(f"Incorrect type for {section}.{key}.{subsection} -- Intended type: {subtype}") + + else: + if key not in config[section]: + raise ValueError(f"Missing section value in config -- {section}.{key}") + + if not isinstance(config[section][key], type): + raise TypeError(f"Incorrect type for {section}.{key} -- Intended type: {type}") + + + return True From 72da7ecf4da1269a6ebb95c2fa4a15f18e32d5c3 Mon Sep 17 00:00:00 2001 From: Alexander Zhong Date: Mon, 31 Mar 2025 23:29:46 -0700 Subject: [PATCH 3/3] feat: added and refactored constants.py + error handling in train.py --- run/constants.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ run/test.py | 18 ++++++++++--- run/train.py | 43 +++++++++++++++++------------- run/utils.py | 63 +++---------------------------------------- 4 files changed, 111 insertions(+), 82 deletions(-) create mode 100644 run/constants.py diff --git a/run/constants.py b/run/constants.py new file mode 100644 index 0000000..a589520 --- /dev/null +++ b/run/constants.py @@ -0,0 +1,69 @@ +import torch +from torch import nn +from explainn.utils.tools import pearson_loss + +CRITERIONS = { + "bcewithlogits": nn.BCEWithLogitsLoss(), + "crossentropy": nn.CrossEntropyLoss(), + "mse": nn.MSELoss(), + "pearson": pearson_loss, + "poissonnll": nn.PoissonNLLLoss(), +} + +OPTIMIZERS = { + "adam": torch.optim.Adam, + "sgd": torch.optim.SGD +} + +CONFIG_REQUIRED_FIELDS = { + "data": { + "input_files": list, + "output_dir": str, + "prefix": str, + "rev_complement": bool, + "input_length": int, + "intermediates": { + "training_file": str, + "validation_file": str, + "test_file": str, + }, + }, + "cnn": { + "filter_size": int, + "num_fc": int, + "num_units": int, + "pool_size": int, + "pool_stride": int, + }, + "training": { + "cpu_threads": int, + "batch_size": int, + "num_epochs": int, + "checkpoint": int, + "patience": int, + "trim_weights": bool, + }, + "optimizer": {"criterion": str, "lr": float, "optimizer": str}, + "interpretation": { + "model_file": str, + "cpu_threads": int, + "batch_size": int, + "num_well_pred_seqs": int, + "correlation": int, + "exact_match": bool, + "percentile_bottom": int, + "percentile_top": int, + }, + "options": {"debugging": bool, "use_time": bool, "store_intermediates": bool}, + "postprocess": { + "cpu_threads": int, + "target_file": str, + "tomtom": { + "dist": str, + "evalue": bool, + "min_overlap": int, + "motif_pseudo": float, + "threshold": float, + }, + }, +} diff --git a/run/test.py b/run/test.py index 0fde4a6..56d4b15 100644 --- a/run/test.py +++ b/run/test.py @@ -26,7 +26,7 @@ from explainn.models.networks import ExplaiNN from explainn.interpretation.interpretation import get_explainn_predictions from run.utils import (get_file_handle, get_seqs_labels_ids, get_data_loader, - get_device, data_split_names, get_criterion) + get_device, data_split_names, get_criterion, validate_config) CONTEXT_SETTINGS = { "help_option_names": ["-h", "--help"], @@ -40,11 +40,23 @@ def main(**args): """ """ # Read config file - # TODO: Validate the fields of the config file with open(args["config_file"]) as f: config = json.load(f) - # TODO: Check that output dir exists + # Validate the fields of the config file + try: + validate_config(config) + logging.info("Config file validated.") + except Exception as e: + logging.error(str(e)) + + # Check that output dir exists + output_dir = config["data"]["output_dir"] + if not os.path.isdir(output_dir): + raise OSError( + f"The output directory: {output_dir} does not exist.\n" + f"Check the path relative to the current working directory: {os.getcwd()}" + ) test_model(config) diff --git a/run/train.py b/run/train.py index 0fa7762..e57bf4e 100644 --- a/run/train.py +++ b/run/train.py @@ -1,11 +1,13 @@ #!/usr/bin/env python +import logging import os import sys import time import torch import click import json +import constants import pandas as pd @@ -15,7 +17,7 @@ from explainn.train.train import train_explainn from explainn.models.networks import ExplaiNN from utils import (get_file_handle, get_seqs_labels_ids, get_data_loader, - get_device, data_split_names, get_criterion) + get_device, data_split_names, get_criterion, validate_config) CONTEXT_SETTINGS = { "help_option_names": ["-h", "--help"], @@ -29,11 +31,23 @@ def main(**args): """ """ # Read config file - # TODO: Validate the fields of the config file with open(args["config_file"]) as f: config = json.load(f) - # TODO: Check that output dir exists + # Validate the fields of the config file + try: + validate_config(config) + logging.info("Config file validated.") + except Exception as e: + logging.error(str(e)) + + # Check that output dir exists + output_dir = config["data"]["output_dir"] + if not os.path.isdir(output_dir): + raise OSError( + f"The output directory: {output_dir} does not exist.\n" + f"Check the path relative to the current working directory: {os.getcwd()}" + ) run_train(config) @@ -76,15 +90,11 @@ def run_train(config): try: criterion = get_criterion()[config["optimizer"]["criterion"].lower()] except KeyError: - # TODO: Create error for this instead of print statement - print("""Criterion not found, please select from the following list: - BCEWithLogits - CrossEntropy - MSE - Pearson - PoissonNLL - """) - return + raise KeyError( + f"Invalid criterion '{config['optimizer']['criterion']}'. " + f"Please choose one of: {', '.join(get_criterion().keys())}" + ) + # Get model m = ExplaiNN(config["cnn"]["num_units"], config["data"]["input_length"], @@ -116,13 +126,8 @@ def run_train(config): def _get_optimizer(optimizer, parameters, lr=0.0005): """ """ - # TODO: Change this to a map - if optimizer.lower() == "adam": - return torch.optim.Adam(parameters, lr=lr) - elif optimizer.lower() == "sgd": - return torch.optim.SGD(parameters, lr=lr) - - + return constants.OPTIMIZERS[optimizer.lower()](parameters, lr=lr) + def _train(train_loader, test_loader, model, device, criterion, optimizer, num_epochs=100, output_dir="./", name_ind=None, verbose=False, trim_weights=False, checkpoint=0, patience=0): diff --git a/run/utils.py b/run/utils.py index b45dfc3..4121a53 100644 --- a/run/utils.py +++ b/run/utils.py @@ -3,6 +3,7 @@ import click import gzip +import constants from functools import partial import numpy as np import pandas as pd @@ -93,15 +94,8 @@ def get_file_handle(file_name, mode): def get_criterion(): """ - TODO: Move to constants.py? """ - return { - "bcewithlogits": nn.BCEWithLogitsLoss(), - "crossentropy": nn.CrossEntropyLoss(), - "mse": nn.MSELoss(), - "pearson": pearson_loss, - "poissonnll": nn.PoissonNLLLoss() - } + return constants.CRITERIONS def get_or_create_dirs(output_path, output_dir): """ @@ -249,58 +243,7 @@ def validate_config(config): """ # required fields being validated - required_fields = { - "data": { - "input_files": list, - "output_dir": str, - "prefix": str, - "rev_complement": bool, - "input_length": int, - "intermediates": { - "training_file": str, - "validation_file": str, - "test_file": str, - }, - }, - "cnn": { - "filter_size": int, - "num_fc": int, - "num_units": int, - "pool_size": int, - "pool_stride": int, - }, - "training": { - "cpu_threads": int, - "batch_size": int, - "num_epochs": int, - "checkpoint": int, - "patience": int, - "trim_weights": bool, - }, - "optimizer": {"criterion": str, "lr": float, "optimizer": str}, - "interpretation": { - "model_file": str, - "cpu_threads": int, - "batch_size": int, - "num_well_pred_seqs": int, - "correlation": int, - "exact_match": bool, - "percentile_bottom": int, - "percentile_top": int, - }, - "options": {"debugging": bool, "use_time": bool, "store_intermediates": bool}, - "postprocess": { - "cpu_threads": int, - "target_file": str, - "tomtom": { - "dist": str, - "evalue": bool, - "min_overlap": int, - "motif_pseudo": float, - "threshold": float, - }, - }, - } + required_fields = constants.CONFIG_REQUIRED_FIELDS for section, fields in required_fields.items(): if section not in config: