From 4e5b188db687cf3b4caf4d60a7846e61c5153f36 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 27 Jan 2023 16:41:39 -0600 Subject: [PATCH 001/254] work in progress. candlizing test.sh to work with get_file. --- get_test_data.py | 14 ++++++++++++++ preprocess.py | 36 ++++++++++++++++++++++++++++++++++++ test.sh | 6 ++++++ 3 files changed, 56 insertions(+) create mode 100644 get_test_data.py create mode 100644 preprocess.py create mode 100755 test.sh diff --git a/get_test_data.py b/get_test_data.py new file mode 100644 index 0000000..4eead14 --- /dev/null +++ b/get_test_data.py @@ -0,0 +1,14 @@ +import candle +import os + +# Assumes CANDLE_DATA_DIR is an environment variable +os.environ['CANDLE_DATA_DIR'] = '/tmp/data_dir' + +fname='input_txt_Nick.txt' +origin='http://chia.team/IMPROVE_data/input_txt_Nick.txt' + +# Download and unpack the data in CANDLE_DATA_DIR +candle.file_utils.get_file(fname, origin) + +# Do it again to confirm it's not re-downloading +candle.file_utils.get_file(fname, origin) diff --git a/preprocess.py b/preprocess.py new file mode 100644 index 0000000..378898f --- /dev/null +++ b/preprocess.py @@ -0,0 +1,36 @@ +def preprocess(params): + fname='input_for_Nick.txt' + origin=params['data_url'] + # Download and unpack the data in CANDLE_DATA_DIR + candle.file_utils.get_file(fname, origin) + params['train_data'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['train_data'] + #params['val_data'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['val_data'] + #params['gep_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['gep_filepath'] + #params['smi_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['smi_filepath'] + #params['gene_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['gene_filepath'] + #params['smiles_language_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['smiles_language_filepath'] + """ + params["train_data"] = candle.get_file(params['train_data'], origin, datadir=params['data_dir'], cache_subdir=None) + params["val_data"] = candle.get_file(params['val_data'], origin, datadir=params['data_dir'], cache_subdir=None) + params["gep_filepath"] = candle.get_file(params['gep_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["smi_filepath"] = candle.get_file(params['smi_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["gene_filepath"] = candle.get_file(params['gene_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["smiles_language_filepath"] = candle.get_file(params['smiles_language_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) """ + return params + +def run(params): + params['data_type'] = str(params['data_type']) + with open ((params['output_dir']+'/params.json'), 'w') as outfile: + json.dump(params, outfile) + scores = main(params) + with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: + json.dump(scores, f, ensure_ascii=False, indent=4) + print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) + +def candle_main(): + params = initialize_parameters() + params = preprocess(params) + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..9df1130 --- /dev/null +++ b/test.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +#The point of this is to test if the thing works at all + +python ./PathDSP/PathDSP/FNN.py -i inputs.txt -o ./output_prefix + From 2d3cc3e150a114267b709f1c7c15dad0b3831437 Mon Sep 17 
00:00:00 2001 From: root Date: Fri, 27 Jan 2023 20:44:09 -0600 Subject: [PATCH 002/254] minor fixes to candle for getting test data. still having issue with import torch. --- get_test_data.py | 4 ++-- test.sh | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/get_test_data.py b/get_test_data.py index 4eead14..9de863d 100644 --- a/get_test_data.py +++ b/get_test_data.py @@ -2,7 +2,7 @@ import os # Assumes CANDLE_DATA_DIR is an environment variable -os.environ['CANDLE_DATA_DIR'] = '/tmp/data_dir' +os.environ['CANDLE_DATA_DIR'] = 'tmp/' fname='input_txt_Nick.txt' origin='http://chia.team/IMPROVE_data/input_txt_Nick.txt' @@ -11,4 +11,4 @@ candle.file_utils.get_file(fname, origin) # Do it again to confirm it's not re-downloading -candle.file_utils.get_file(fname, origin) +#candle.file_utils.get_file(fname, origin) diff --git a/test.sh b/test.sh index 9df1130..f854b2b 100755 --- a/test.sh +++ b/test.sh @@ -2,5 +2,6 @@ #The point of this is to test if the thing works at all -python ./PathDSP/PathDSP/FNN.py -i inputs.txt -o ./output_prefix +python get_test_data.py +python ./PathDSP/FNN.py -i tmp/common/input_txt_Nick.txt -o ./output_prefix From d821eecc88d8544ab5c32385f55b6eb812a4216d Mon Sep 17 00:00:00 2001 From: root Date: Sat, 28 Jan 2023 03:14:01 +0000 Subject: [PATCH 003/254] removed import myPlotter - unused custom package --- PathDSP/FNN.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP/FNN.py b/PathDSP/FNN.py index a05247b..01c9acc 100644 --- a/PathDSP/FNN.py +++ b/PathDSP/FNN.py @@ -30,7 +30,7 @@ import myDataloader as mydl import myDatasplit as mysplit import myUtility as myutil -import myPlotter as myplot +#import myPlotter as myplot import myMetrics as mymts import shap as sp From b6fd78dbcb5a5ae8e8ddc386e07dcf060111e5c0 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 28 Jan 2023 03:45:59 +0000 Subject: [PATCH 004/254] adding data parsing R script --- parse_DSP_data_Chia_Jan12_2023.R | 72 ++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 parse_DSP_data_Chia_Jan12_2023.R diff --git a/parse_DSP_data_Chia_Jan12_2023.R b/parse_DSP_data_Chia_Jan12_2023.R new file mode 100644 index 0000000..28976f0 --- /dev/null +++ b/parse_DSP_data_Chia_Jan12_2023.R @@ -0,0 +1,72 @@ +rm(list = ls()) + +setwd("c:/Users/m092469/OneDrive - Mayo Clinic/temp_code/DOE_IMPROVE") + +options(stringsAsFactors = FALSE) + +rDrug <- read.delim( + "data/GDSCv2.Gao2015.Powell2020.Lee2021.GeoSearch.Ding2016.CHEM.256.MBits.txt") + +rGSEA <- read.delim( + "data/GDSCv2.Powell2020.EXP.ssGSEA.txt") + +rPNET <- read.delim( + "data/GDSCv2.Gao2015.Powell2020.Lee2021.GeoSearch.Ding2016.DGNet.NetPEA.txt") + +rReponse0 <- read.delim("data/GDSCv2.resp_PowellAUC.Alias.txt") + +Drug.ID.vec0 <- intersect(rDrug$drug, rReponse0$Therapy) +Cell.ID.vec <- intersect(rGSEA$X, rReponse0$Sample) + +Drug.ID.vec <- intersect(Drug.ID.vec0,rPNET$X) + +sel.Reponse.idx <- which(is.element(rReponse0$Therapy, Drug.ID.vec) & + is.element(rReponse0$Sample, Cell.ID.vec)) + +rReponse <- rReponse0[sel.Reponse.idx, ] + +N.cell <- length(Cell.ID.vec) +N.drug <- length(Drug.ID.vec) +N.comb <- nrow(rReponse) + +head(rDrug[,1:5]) +Drug.fmtx <- data.matrix(rDrug[match(Drug.ID.vec, rDrug$drug), 2: ncol(rDrug)]) +rownames(Drug.fmtx) <- Drug.ID.vec +head(Drug.fmtx[,1:5]) + +Drug.PNEA.fmtx <- data.matrix(rPNET[match(Drug.ID.vec, rPNET$X), 2: ncol(rPNET)]) +rownames(Drug.PNEA.fmtx) <- Drug.ID.vec +head(Drug.PNEA.fmtx[,1:5]) + +all(rownames(Drug.fmtx)==rownames(Drug.PNEA.fmtx)) + 
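+# Sanity check above: the fingerprint matrix (Drug.fmtx) and the DGNet matrix
+# (Drug.PNEA.fmtx) must share the same drug row order before their features are
+# concatenated per response row below; a stricter (hypothetical) guard would be
+# stopifnot(all(rownames(Drug.fmtx) == rownames(Drug.PNEA.fmtx))).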
+Cell.GSEA.mtx <- data.matrix(rGSEA[match(Cell.ID.vec, rGSEA$X), 2: ncol(rGSEA)]) +rownames(Cell.GSEA.mtx) <- Cell.ID.vec +head(Cell.GSEA.mtx[,1:5]) + +N.col <- ncol(Drug.fmtx) + ncol(Drug.PNEA.fmtx) + + ncol(Cell.GSEA.mtx) + 3 # drug, cell & resp +comb.data.mtx <- mat.or.vec(N.comb, N.col) +colnames(comb.data.mtx) <- c("drug", "cell", + paste0("feature",1:(N.col-3)),"resp") + + +for(i in 1:N.comb){ + # i <- 1 + tmp.cell.ID <- rReponse$Sample[i] + tmp.drug.ID <- rReponse$Therapy[i] + comb.data.mtx[i, "drug"] <- tmp.drug.ID + comb.data.mtx[i, "cell"] <- tmp.cell.ID + comb.data.mtx[i, paste0("feature",1:(N.col-3))] <- + c(Drug.fmtx[tmp.drug.ID,], + Drug.PNEA.fmtx[tmp.drug.ID,], + Cell.GSEA.mtx[tmp.cell.ID,]) + response.idx <-which( + rReponse$Therapy==tmp.drug.ID & rReponse$Sample==tmp.cell.ID) + tmp.resp <- rReponse$Response[response.idx] + comb.data.mtx[i, "resp"] <- tmp.resp + +} + +write.table(x = comb.data.mtx, file = "input_txt_Nick.txt", + quote = FALSE, sep = "\t", row.names = FALSE) \ No newline at end of file From 0776dc62a485582554f831d9fcc018eaa585d9da Mon Sep 17 00:00:00 2001 From: root Date: Sat, 28 Jan 2023 04:22:24 +0000 Subject: [PATCH 005/254] added params file --- PathDSP_params.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 PathDSP_params.txt diff --git a/PathDSP_params.txt b/PathDSP_params.txt new file mode 100644 index 0000000..07d0614 --- /dev/null +++ b/PathDSP_params.txt @@ -0,0 +1,23 @@ +[Global_Params] +data_url="http://drugcell.ucsd.edu/downloads/" +original_data="data.tgz" +CUDA_ID = 0 +load = "drugcell_v1.pt" +train_data = "../data/drugcell_train.txt" +test_data = "../data/drugcell_test.txt" +val_data = "../data/drugcell_val.txt" +onto = "drugcell_ont.txt" +learning_rate = 0.001 +batch_size = 1000 +genotype_hiddens = 6 +drug_hiddens='100,50,6' +final_hiddens=6 +genotype="cell2mutation.txt" +fingerprint='drug2fingerprint.txt' +cell2id='../data/cell2ind.txt' +drug2id='../data/drug2ind.txt' +output_dir = "MODEL" +epochs=200 +optimizer = "adam" +loss = "mse" +predict="drugcell_all.txt" From 860f51bad4a3bd36e74ddb88a01164a52705f8cc Mon Sep 17 00:00:00 2001 From: root Date: Sat, 28 Jan 2023 04:25:00 +0000 Subject: [PATCH 006/254] adding TODO.txt so I remember where I am --- TODO.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 TODO.txt diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 0000000..7f87e3c --- /dev/null +++ b/TODO.txt @@ -0,0 +1,6 @@ +1. Expose parameters - give up on private crap +2. Fix params file +3. Document on docs +4. Use R script to make preprocess.sh +5. make train script +6. 
make infer script From b0260ba0e2ac2b00aeec3b14f20d62ce589778b2 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 19 Jul 2023 17:08:31 -0500 Subject: [PATCH 007/254] update preprocess script --- preprocess_new.py | 319 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 319 insertions(+) create mode 100644 preprocess_new.py diff --git a/preprocess_new.py b/preprocess_new.py new file mode 100644 index 0000000..ae219ae --- /dev/null +++ b/preprocess_new.py @@ -0,0 +1,319 @@ +#!/homes/ac.rgnanaolivu/miniconda3/envs/rohan_python/bin/python + +import sys +import os +import numpy as np +import torch +import torch.utils.data as du +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +#from code.drugcell_NN import * +import argparse +import numpy as np +import pandas as pd +import candle +import time +import logging +import networkx as nx +import networkx.algorithms.components.connected as nxacc +import networkx.algorithms.dag as nxadag +#from pathlib import Path +from functools import reduce +import improve_utils +# import RDKit +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +# import NetPEA modules +import RWR as rwr +import NetPEA as pea + + + +file_path = os.path.dirname(os.path.realpath(__file__)) +#fdir = Path('__file__').resolve().parent +#source = 'csa_data/raw_data/splits/' +required = None +additional_definitions = None + +# This should be set outside as a user environment variable +os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
+ ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def mkdir(directory): + directories = directory.split('/') + + folder = '' + for d in directories: + folder += d + '/' + if not os.path.exists(folder): + print('creating folder: %s'%folder) + os.mkdir(folder) + + +def preprocess(params, data_dir): + print(os.environ['CANDLE_DATA_DIR']) + #requirements go here + #keys_parsing = ['output_dir', 'hidden', 'result', 'metric', 'data_type'] + if not os.path.exists(data_dir): + mkdir(data_dir) + params['data_dir'] = data_dir + #args = candle.ArgumentStruct(**params) + for i in ['train_data', 'test_data', 'val_data', 'drug_bits_file', 'dgnet_file', + 'mutnet_file', 'cnvnet_file', 'exp_file', 'final_input_file', 'output_dir']: + params[i] = params['data_dir'] + '/' + params[i] + return(params) + +def download_anl_data(params): + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'] + params['model_name'], 'csa_data', 'raw_data') + splits_dir = os.path.join(csa_data_folder, 'splits') + x_data_dir = os.path.join(csa_data_folder, 'x_data') + y_data_dir = os.path.join(csa_data_folder, 'y_data') + + if not os.path.exists(csa_data_folder): + print('creating folder: %s'%csa_data_folder) + os.makedirs(csa_data_folder) + mkdir(splits_dir) + mkdir(x_data_dir) + mkdir(y_data_dir) + mkdir(supplementary_folder) + + for improve_file in ['CCLE_all.txt', 'CCLE_split_0_test.txt', + 'CCLE_split_0_train.txt', 'CCLE_split_0_val.txt']: + url_dir = params['improve_data_url'] + '/splits/' + candle.file_utils.get_file(improve_file, url_dir + improve_file, + datadir=splits_dir, + cache_subdir=None) + + for improve_file in ['cancer_mutation_count.tsv', 'drug_SMILES.tsv', 'drug_info.tsv', 'cancer_discretized_copy_number.tsv', 'cancer_gene_expression.tsv']: + url_dir = params['improve_data_url'] + '/x_data/' + candle.file_utils.get_file(fname=improve_file, origin=url_dir + improve_file, + datadir=x_data_dir, + cache_subdir=None) + + url_dir = params['improve_data_url'] + '/y_data/' + response_file = 'response.tsv' + candle.file_utils.get_file(fname=response_file, origin=url_dir + response_file, + datadir=y_data_dir, + cache_subdir=None) + + ## get gene-set data and string data + for db_file in [params['gene_set'], params['ppi_data'], params['drug_target']]: + candle.file_utils.get_file(db_file, params['data_url'] + '/' +db_file, + datadir=params['data_dir'], + cache_subdir=None) + + +# set timer +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def download_author_data(params): + data_download_filepath = candle.get_file(params['original_data'], params['original_data_url'], + datadir = params['data_dir'], + cache_subdir = None) + print('download_path: {}'.format(data_download_filepath)) + + +def smile2bits(params): + start = datetime.now() + smile_df = improve_utils.load_smiles_data() + smile_df.columns = ['drug', 'smile'] + smile_df = smile_df.drop_duplicates(subset=['drug'], keep='first').set_index('drug') + bit_int 
= params['bit_int'] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row['smile'] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list( AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int) ) + #drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append( tuple([drug]+mbit) ) + if len(mbit) == bit_int: + n_drug+=1 + print('total {:} drugs with bits'.format(n_drug)) + # convert dict to dataframe + colname_list = ['drug'] + ['mBit_'+str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + #drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + #drug_mbit_df.index.name = 'drug' + print('unique drugs={:}'.format(len(drug_mbit_df['drug'].unique()))) + # save to file + drug_mbit_df.to_csv(params['drug_bits_file'], header=True, index=False, sep='\t') + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) + +def times_expression(rwr, exp): + ''' + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + ''' + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns)&set(exp.columns))) + + if len(cell_list) == 0: + print('ERROR! no overlapping cell lines') + sys.exit(1) + if len(gene_list) == 0: + print('ERROR! no overlapping genes') + sys.exit(1) + + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list]*exp.loc[cell_list, gene_list] + + # concat with other gene + out_gene_list = list(set(rwr.columns)-set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params['data_dir'] + '/STRING/9606.protein_name.links.v11.0.pkl' + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + log_transform = False + permutation_int = params['permutation_int'] + seed_int = params['seed_int'] + cpu_int = params['cpu_int'] + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'] + params['model_name'], 'csa_data', 'raw_data') + if dtype == 'DGnet': + drug_info = pd.read_csv(csa_data_folder + '/x_data/drug_info.tsv', sep='\t') + drug_info['NAME'] = drug_info['NAME'].str.upper() + target_info = pd.read_csv(params['data_dir'] + '/raw_data/DB.Drug.Target.txt', sep = '\t') + target_info = target_info.rename(columns={'drug': 'NAME'}) + combined_df = pd.merge(drug_info, target_info, how = 'left', on = 'NAME').dropna(subset=['gene']) + restart_path = params['data_dir'] + '/drug_target.txt' + combined_df.iloc[:,-2:].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['dgnet_file'] + elif dtype == 'MUTnet': + mutation_data = improve_utils.load_mutation_count_data(gene_system_identifier='Gene_Symbol') + mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars='improve_sample_id').loc[lambda x: x['value'] > 0] + restart_path = params['data_dir'] + '/mutation_data.txt' + mutation_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['mutnet_file'] + 
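+    # The else branch below covers CNVnet: non-zero discretized copy-number calls are
+    # melted to (sample, gene) pairs and written out as the restart gene set for the
+    # random walk, mirroring how the mutation data is handled above.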
else: + cnv_data = improve_utils.load_discretized_copy_number_data(gene_system_identifier='Gene_Symbol') + cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars='improve_sample_id').loc[lambda x: x['value'] != 0] + restart_path = params['data_dir'] + '/cnv_data.txt' + cnv_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['mutnet_file'] + # perform Random Walk + print(datetime.now(), 'performing random walk with restart') + rwr_df = rwr.RWR(ppi_path, restart_path, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True).get_prob() + # multiply with gene expression + if multiply_expression: + print(datetime.now(), 'multiplying gene expression with random walk probability for genes were expressed') + exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + rwr_df = times_expression(rwr_df, exp_df) + #rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), 'performing network-based pathway enrichment') + cell_pathway_df = pea.NetPEA(rwr_df, pathway_path, log_transform=log_transform, permutation=permutation_int, seed=seed_int, n_cpu=cpu_int, out_path=outpath) + print( '[Finished in {:}]'.format(cal_time(datetime.now(), start_time)) ) + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params['drug_bits_file'], sep = '\t', index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={'drug': 'drug_id'}) + DGnet = pd.read_csv(params['dgnet_file'], sep='\t', index_col=0) + DGnet = DGnet.add_suffix('_dgnet').reset_index().rename(columns={'index': 'drug_id'}) + CNVnet = pd.read_csv(params['cnvnet_file'], sep= '\t',index_col=0) + CNVnet = CNVnet.add_suffix('_cnvnet').reset_index().rename(columns={'index': 'sample_id'}) + MUTnet = pd.read_csv(params['mutnet_file'], sep='\t',index_col=0) + MUTnet = MUTnet.add_suffix('_mutnet').reset_index().rename(columns={'index': 'sample_id'}) + EXP = pd.read_csv(params['exp_file'], sep = '\t', index_col=0) + EXP = EXP.add_suffix('_exp').reset_index().rename(columns={'index': 'sample_id'}) + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=0, + split_type=['train', 'test', 'val'], + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + # Extract relevant IDs + + common_drug_ids = reduce(np.intersect1d, (drug_mbit_df['drug_id'], DGnet['drug_id'], response_df['drug_id'])) + common_sample_ids = reduce(np.intersect1d, (CNVnet['sample_id'], MUTnet['sample_id'], EXP['sample_id'] , response_df['sample_id'])) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + (response_df['sample_id'].isin(common_sample_ids)), :] + + drug_mbit_df = drug_mbit_df.loc[drug_mbit_df['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + DGnet = DGnet.loc[DGnet['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + CNVnet = CNVnet.loc[CNVnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + MUTnet = MUTnet.loc[MUTnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + EXP = EXP.loc[EXP['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + comb_data_mtx = pd.DataFrame({'drug_id': response_df['drug_id'].values, + 
'sample_id': response_df['sample_id'].values}) + comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id').reset_index() + comb_data_mtx['response'] = response_df[params['metric']] + comb_data_mtx.to_csv(params['final_input_file'], sep = '\t', header= True, index=False) + +def candle_main(anl): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + params['model_name'] + '/Data/' + params = preprocess(params, data_dir) + if params['improve_analysis'] == 'yes' or anl: + download_anl_data(params) + print('convert drug to bits.') + smile2bits(params) + print('compute DGnet.') + run_netpea(params, dtype = 'DGnet', multiply_expression=False) + print('compute MUTnet.') + run_netpea(params, dtype = 'MUTnet', multiply_expression=True) + print('compute CNVnet.') + run_netpea(params, dtype = 'CNVnet', multiply_expression=True) + print('compute EXP.') + run_ssgsea(params) + print('prepare final input file.') + prep_input(params) + else: + download_author_data(params) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-a', dest='anl', default=False) + args = parser.parse_args() + candle_main(args.anl) From ff4643cd9dea14dc2d69a089b31fb8329a43fa82 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 19 Jul 2023 17:08:40 -0500 Subject: [PATCH 008/254] update preprocess script --- preprocess.sh | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 preprocess.sh diff --git a/preprocess.sh b/preprocess.sh new file mode 100644 index 0000000..973d006 --- /dev/null +++ b/preprocess.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +######################################################################### +### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. +######################################################################### + + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### + +CANDLE_MODEL=preprocessing_new.py + +if [ $# -lt 2 ] ; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +. 
/homes/ac.rgnanaolivu/miniconda3/etc/profile.d/conda.sh +conda activate rohan_python +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD From f1c9159437cb6f9e8238cf7c5a5c5c64e068ac4f Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 19 Jul 2023 17:09:09 -0500 Subject: [PATCH 009/254] add improve_utils script --- improve_utils.py | 735 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 735 insertions(+) create mode 100644 improve_utils.py diff --git a/improve_utils.py b/improve_utils.py new file mode 100644 index 0000000..f6d2b33 --- /dev/null +++ b/improve_utils.py @@ -0,0 +1,735 @@ +import os +import numpy as np +import pandas as pd +from pathlib import Path, PosixPath +from math import sqrt +from scipy import stats +from typing import List, Union, Optional, Tuple + + +fdir = Path(__file__).resolve().parent + + +# ----------------------------------------------------------------------------- +# TODO +# Note! +# We need to decide how this utils file will be provided for each model. +# Meanwhile, place this .py file in the level as your data preprocessing script. +# For example: +# GraphDRP/ +# |_______ preprocess.py +# |_______ improve_utils.py +# | +# | +# ----------------------------------------------------------------------------- + + + +# ----------------------------------------------------------------------------- +# Global variables +# ---------------- +# These are globals for all models +import types +improve_globals = types.SimpleNamespace() + +# TODO: +# This is CANDLE_DATA_DIR (or something...). +# How this is going to be passed to the code? +improve_globals.main_data_dir = fdir/"csa_data" +# improve_globals.main_data_dir = fdir/"improve_data_dir" +# imp_globals.main_data_dir = fdir/"candle_data_dir" + +# Dir names corresponding to the primary input/output blocks in the pipeline +# {}: input/output +# []: process +# train path: {raw_data} --> [preprocess] --> {ml_data} --> [train] --> {models} +# inference path: {ml_data, models} --> [inference] --> {infer} +improve_globals.raw_data_dir_name = "raw_data" # benchmark data +improve_globals.ml_data_dir_name = "ml_data" # preprocessed data for a specific ML model +improve_globals.models_dir_name = "models" # output from model training +improve_globals.infer_dir_name = "infer" # output from model inference (testing) + +# Secondary dirs in raw_data +improve_globals.x_data_dir_name = "x_data" # feature data +improve_globals.y_data_dir_name = "y_data" # target data +improve_globals.splits_dir_name = "splits" # splits files + +# Column names in the raw data files +# imp_globals.canc_col_name = "CancID" +# imp_globals.drug_col_name = "DrugID" +improve_globals.canc_col_name = "improve_sample_id" # column name that contains the cancer sample ids TODO: rename to sample_col_name +improve_globals.drug_col_name = "improve_chem_id" # column name that contains the drug ids +improve_globals.source_col_name = "source" # column name that contains source/study names (CCLE, GDSCv1, etc.) 
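+# Illustrative resolution of the path globals defined below (assuming the default
+# main_data_dir above): response data lands at csa_data/raw_data/y_data/response.tsv,
+# omics features at e.g. csa_data/raw_data/x_data/cancer_gene_expression.tsv, and
+# split files at e.g. csa_data/raw_data/splits/CCLE_split_0_train.txt.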
+improve_globals.pred_col_name_suffix = "_pred" # suffix to predictions col name (example of final col name: auc_pred) + +# Response data file name +improve_globals.y_file_name = "response.tsv" # response data + +# Cancer sample features file names +improve_globals.copy_number_fname = "cancer_copy_number.tsv" # cancer feature +improve_globals.discretized_copy_number_fname = "cancer_discretized_copy_number.tsv" # cancer feature +improve_globals.dna_methylation_fname = "cancer_DNA_methylation.tsv" # cancer feature +improve_globals.gene_expression_fname = "cancer_gene_expression.tsv" # cancer feature +improve_globals.miRNA_expression_fname = "cancer_miRNA_expression.tsv" # cancer feature +improve_globals.mutation_count_fname = "cancer_mutation_count.tsv" # cancer feature +improve_globals.mutation_fname = "cancer_mutation.tsv" # cancer feature +improve_globals.rppa_fname = "cancer_RPPA.tsv" # cancer feature + +# Drug features file names +improve_globals.smiles_file_name = "drug_SMILES.tsv" # drug feature +improve_globals.mordred_file_name = "drug_mordred.tsv" # drug feature +improve_globals.ecfp4_512bit_file_name = "drug_ecfp4_512bit.tsv" # drug feature + +# Globals derived from the ones defined above +improve_globals.raw_data_dir = improve_globals.main_data_dir/improve_globals.raw_data_dir_name # raw_data +improve_globals.ml_data_dir = improve_globals.main_data_dir/improve_globals.ml_data_dir_name # ml_data +improve_globals.models_dir = improve_globals.main_data_dir/improve_globals.models_dir_name # models +improve_globals.infer_dir = improve_globals.main_data_dir/improve_globals.infer_dir_name # infer +# ----- +improve_globals.x_data_dir = improve_globals.raw_data_dir/improve_globals.x_data_dir_name # x_data +improve_globals.y_data_dir = improve_globals.raw_data_dir/improve_globals.y_data_dir_name # y_data +improve_globals.splits_dir = improve_globals.raw_data_dir/improve_globals.splits_dir_name # splits + +# Response +improve_globals.y_file_path = improve_globals.y_data_dir/improve_globals.y_file_name # response.txt + +# Cancers +improve_globals.copy_number_file_path = improve_globals.x_data_dir/improve_globals.copy_number_fname # cancer_copy_number.txt +improve_globals.discretized_copy_number_file_path = improve_globals.x_data_dir/improve_globals.discretized_copy_number_fname # cancer_discretized_copy_number.txt +improve_globals.dna_methylation_file_path = improve_globals.x_data_dir/improve_globals.dna_methylation_fname # cancer_DNA_methylation.txt +improve_globals.gene_expression_file_path = improve_globals.x_data_dir/improve_globals.gene_expression_fname # cancer_gene_expression.txt +improve_globals.mirna_expression_file_path = improve_globals.x_data_dir/improve_globals.miRNA_expression_fname # cancer_miRNA_expression.txt +improve_globals.mutation_count_file_path = improve_globals.x_data_dir/improve_globals.mutation_count_fname # cancer_mutation_count.txt +improve_globals.mutation_file_path = improve_globals.x_data_dir/improve_globals.mutation_fname # cancer_mutation.txt +improve_globals.rppa_file_path = improve_globals.x_data_dir/improve_globals.rppa_fname # cancer_RPPA.txt + +# Drugs +improve_globals.smiles_file_path = improve_globals.x_data_dir/improve_globals.smiles_file_name # +improve_globals.mordred_file_path = improve_globals.x_data_dir/improve_globals.mordred_file_name # +improve_globals.ecfp4_512bit_file_path = improve_globals.x_data_dir/improve_globals.ecfp4_512bit_file_name # +# ----------------------------------------------------------------------------- + + +# 
------------------------------------- +# Drug response loaders +# ------------------------------------- + +def load_single_drug_response_data( + # source: Union[str, List[str]], + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: at this point, this func implements the loading a single source + df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # import pdb; pdb.set_trace() + if isinstance(split, int): + # Get a subset of samples + ids = load_split_file(source, split, split_type) + df = df.loc[ids] + else: + # Get the full dataset for a given source + df = df[df[improve_globals.source_col_name].isin([source])] + + cols = [improve_globals.source_col_name, + improve_globals.drug_col_name, + improve_globals.canc_col_name, + y_col_name] + df = df[cols] # [source, drug id, cancer id, response] + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(df[[improve_globals.canc_col_name, improve_globals.drug_col_name]].nunique()) + return df + + +def load_single_drug_response_data_v2( + # source: Union[str, List[str]], + source: str, + # split: Union[int, None]=None, + # split_type: Union[str, List[str], None]=None, + split_file_name: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: currently, this func implements loading a single data source (CCLE or CTRPv2 or ...) 
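+    # Hedged usage sketch (split file name is illustrative):
+    #   rsp = load_single_drug_response_data_v2("CCLE", split_file_name=["CCLE_split_0_train.txt"])
+    # keeps only the response rows whose indices are listed in that split file.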
+ df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # Get a subset of samples + if isinstance(split_file_name, list) and len(split_file_name) == 0: + raise ValueError("Empty list is passed via split_file_name.") + if isinstance(split_file_name, str): + split_file_name = [split_file_name] + ids = load_split_ids(split_file_name) + df = df.loc[ids] + # else: + # # Get the full dataset for a given source + # df = df[df[improve_globals.source_col_name].isin([source])] + + # # Get a subset of cols + # cols = [improve_globals.source_col_name, + # improve_globals.drug_col_name, + # improve_globals.canc_col_name, + # y_col_name] + # df = df[cols] # [source, drug id, cancer id, response] + + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(f"Unique cells: {df[improve_globals.canc_col_name].nunique()}") + print(f"Unique drugs: {df[improve_globals.drug_col_name].nunique()}") + return df + + +def load_split_ids(split_file_name: Union[str, List[str]]) -> List[int]: + """ Returns list of integers, representing the rows in the response dataset. + Args: + split_file_name (str or list of str): splits file name or list of file names + + Returns: + list: list of integers representing the ids + """ + ids = [] + for fname in split_file_name: + fpath = improve_globals.splits_dir/fname + assert fpath.exists(), f"split_file_name {fname} not found." + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +def load_split_file( + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None) -> List[int]: + """ + Args: + source (str): DRP source name (str) + + Returns: + ids (list): list of id integers + """ + # TODO: used in the old version of the rsp loader + if isinstance(split_type, str): + split_type = [split_type] + + # Check if the split file exists and load + ids = [] + for st in split_type: + fpath = improve_globals.splits_dir/f"{source}_split_{split}_{st}.txt" + assert fpath.exists(), f"Splits file not found: {fpath}" + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +# ------------------------------------- +# Omic feature loaders +# ------------------------------------- + +""" +Notes about omics data. + +Omics data files are multi-level tables with several column types (generally 3 +or 4), each contains gene names using a different gene identifier system: +Entrez ID, Gene Symbol, Ensembl ID, TSS + +The column levels are not organized in the same order across the different +omic files. + +The level_map dict, in each loader function, encodes the column level and the +corresponding identifier systems. + +For example, in the copy number file the level_map is: +level_map = {"Entrez":0, "Gene_Symbol": 1, "Ensembl": 2} +""" + +def set_col_names_in_multilevel_dataframe( + df: pd.DataFrame, + level_map: dict, + gene_system_identifier: Union[str, List[str]]="Gene_Symbol") -> pd.DataFrame: + """ Util function that supports loading of the omic data files. + Returns the input dataframe with the multi-level column names renamed as + specified by the gene_system_identifier arg. 
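+    For example (illustrative), with level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1}
+    and gene_system_identifier="Gene_Symbol", the returned dataframe keeps only the
+    Gene_Symbol level of the original multi-level column index.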
+ + Args: + df (pd.DataFrame): omics dataframe + level_map (dict): encodes the column level and the corresponding identifier systems + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: the input dataframe with the specified multi-level column names + """ + df = df.copy() + + level_names = list(level_map.keys()) + level_values = list(level_map.values()) + n_levels = len(level_names) + + if isinstance(gene_system_identifier, list) and len(gene_system_identifier) == 1: + gene_system_identifier = gene_system_identifier[0] + + # print(gene_system_identifier) + # import pdb; pdb.set_trace() + if isinstance(gene_system_identifier, str): + if gene_system_identifier == "all": + df.columns = df.columns.rename(level_names, level=level_values) # assign multi-level col names + else: + df.columns = df.columns.get_level_values(level_map[gene_system_identifier]) # retian specific column level + else: + assert len(gene_system_identifier) <= n_levels, f"'gene_system_identifier' can't contain more than {n_levels} items." + set_diff = list(set(gene_system_identifier).difference(set(level_names))) + assert len(set_diff) == 0, f"Passed unknown gene identifiers: {set_diff}" + kk = {i: level_map[i] for i in level_map if i in gene_system_identifier} + # print(list(kk.keys())) + # print(list(kk.values())) + df.columns = df.columns.rename(list(kk.keys()), level=kk.values()) # assign multi-level col names + drop_levels = list(set(level_map.values()).difference(set(kk.values()))) + df = df.droplevel(level=drop_levels, axis=1) + return df + + +def load_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns copy number data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.copy_number_file_path, sep=sep, index_col=0, header=header) + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + # Test the func + # d0 = set_col_names_in_multilevel_dataframe(df, "all") + # d1 = set_col_names_in_multilevel_dataframe(df, "Ensembl") + # d2 = set_col_names_in_multilevel_dataframe(df, ["Ensembl"]) + # d3 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Gene_Symbol", "Ensembl"]) + # d4 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Ensembl"]) + # d5 = set_col_names_in_multilevel_dataframe(df, ["Blah", "Ensembl"]) + if verbose: + print(f"Copy number data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_discretized_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns discretized copy number data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.discretized_copy_number_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Discretized copy number data: {df.shape}") + + return df + + +def load_dna_methylation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns methylation data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + level_map = {"Ensembl": 2, "Entrez": 1, "Gene_Symbol": 3, "TSS": 0} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.dna_methylation_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"DNA methylation data: {df.shape}") + # print(df.dtypes) # TODO: many column are of type 'object' + # print(df.dtypes.value_counts()) + return df + + +def load_gene_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns gene expression data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 0, "Entrez": 1, "Gene_Symbol": 2} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.gene_expression_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Gene expression data: {df.shape}") + return df + + +def load_mirna_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_mutation_count_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns mutation count data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.mutation_count_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Mutation count data: {df.shape}") + + return df + + +def load_mutation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_rppa_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + + + +# ------------------------------------- +# Drug feature loaders +# ------------------------------------- + +def load_smiles_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + df = pd.read_csv(improve_globals.smiles_file_path, sep=sep) + + # TODO: updated this after we update the data + df.columns = ["improve_chem_id", "smiles"] + + if verbose: + print(f"SMILES data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_mordred_descriptor_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Mordred descriptors data. + """ + df = pd.read_csv(improve_globals.mordred_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + if verbose: + print(f"Mordred descriptors data: {df.shape}") + return df + + +def load_morgan_fingerprint_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Morgan fingerprints data. + """ + df = pd.read_csv(improve_globals.ecfp4_512bit_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + return df + + +# ------------------------------------- +# Save data functions +# ------------------------------------- + +def save_preds(df: pd.DataFrame, y_col_name: str, + outpath: Union[str, PosixPath], round_decimals: int=4) -> None: + """ Save model predictions. + This function throws errors if the dataframe does not include the expected + columns: canc_col_name, drug_col_name, y_col_name, y_col_name + "_pred" + + Args: + df (pd.DataFrame): df with model predictions + y_col_name (str): drug response col name (e.g., IC50, AUC) + outpath (str or PosixPath): outdir to save the model predictions df + round (int): round response values + + Returns: + None + """ + # Check that the 4 columns exist + assert improve_globals.canc_col_name in df.columns, f"{improve_globals.canc_col_name} was not found in columns." + assert improve_globals.drug_col_name in df.columns, f"{improve_globals.drug_col_name} was not found in columns." + assert y_col_name in df.columns, f"{y_col_name} was not found in columns." 
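+    # e.g., for y_col_name="auc" the paired prediction column checked below is "auc_pred"
+    # (y_col_name + improve_globals.pred_col_name_suffix)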
+ pred_col_name = y_col_name + f"{improve_globals.pred_col_name_suffix}" + assert pred_col_name in df.columns, f"{pred_col_name} was not found in columns." + + # Round + df = df.round({y_col_name: round_decimals, pred_col_name: round_decimals}) + + # Save preds df + df.to_csv(outpath, index=False) + return None + + + + + + +# ================================================================== +# Leftovers +# ================================================================== +def get_data_splits( + src_raw_data_dir: str, + splitdir_name: str, + split_file_name: str, + rsp_df: pd.DataFrame): + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + splitdir = src_raw_data_dir/splitdir_name + if len(split_file_name) == 1 and split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + + """ + # Method 1 + splitdir = Path(os.path.join(src_raw_data_dir))/"splits" + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + outdir_name = "full" + else: + # Check if the split file exists and load + ids = [] + split_id_str = [] # e.g. split_5 + split_type_str = [] # e.g. tr, vl, te + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + # Get the ids + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + # Get the name + fname_sep = fname.split("_") + split_id_str.append("_".join([s for s in fname_sep[:2]])) + split_type_str.append(fname_sep[2]) + assert len(set(split_id_str)) == 1, "Data splits must be from the same dataset source." + split_id_str = list(set(split_id_str))[0] + split_type_str = "_".join([x for x in split_type_str]) + outdir_name = f"{split_id_str}_{split_type_str}" + ML_DATADIR = main_data_dir/"ml_data" + root = ML_DATADIR/f"data.{args.source_data_name}"/outdir_name # ML data + os.makedirs(root, exist_ok=True) + """ + + """ + # Method 2 + splitdir = src_raw_data_dir/args.splitdir_name + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." 
+ with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + """ + return ids + + +def get_common_samples( + df1: pd.DataFrame, + df2: pd.DataFrame, + ref_col: str) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Args: + df1, df2 (pd.DataFrame): dataframes + ref_col (str): the ref column to find the common values + + Returns: + df1, df2 + + Example: + TODO + """ + # Retain (canc, drug) response samples for which we have omic data + common_ids = list(set(df1[ref_col]).intersection(df2[ref_col])) + # print(df1.shape) + df1 = df1[ df1[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df1.shape) + # print(df2.shape) + df2 = df2[ df2[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df2.shape) + return df1, df2 + + +def read_df(fpath: str, sep: str=","): + """ + IMPROVE-specific func. + Load a dataframe. Supports csv and parquet files. + sep : the sepator in the csv file + """ + # TODO: this func might be available in candle + assert Path(fpath).exists(), f"File {fpath} was not found." + if "parquet" in str(fpath): + df = pd.read_parquet(fpath) + else: + df = pd.read_csv(fpath, sep=sep) + return df + + +def get_subset_df(df: pd.DataFrame, ids: list) -> pd.DataFrame: + """ Get a subset of the input dataframe based on row ids.""" + df = df.loc[ids] + return df + + +def rmse(y, f): + rmse = sqrt(((y - f)**2).mean(axis=0)) + return rmse + + +def mse(y, f): + mse = ((y - f)**2).mean(axis=0) + return mse + + +def pearson(y, f): + rp = np.corrcoef(y, f)[0, 1] + return rp + + +def spearman(y, f): + rs = stats.spearmanr(y, f)[0] + return rs + + +def r_square(y_true, y_pred): + from sklearn.metrics import r2_score + return r2_score(y_true, y_pred) From 6914bb8d005dae72409a9630cea79f217d676847 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 19 Jul 2023 17:09:48 -0500 Subject: [PATCH 010/254] add nea scripts --- NetPEA.py | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ RWR.py | 162 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 NetPEA.py create mode 100644 RWR.py diff --git a/NetPEA.py b/NetPEA.py new file mode 100644 index 0000000..d30599f --- /dev/null +++ b/NetPEA.py @@ -0,0 +1,212 @@ +""" +Implementation of NetPEA: pathway enrichment with networks (Liu, 2017) + +Ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5664096/ +zscore >1.65, equivalent to p-value=0.05 +""" + +import os +import sys +import argparse +import numpy as np +import pandas as pd +import multiprocessing as mp +import scipy.stats as scistat +from datetime import datetime + +class NetPEA: + """ + :param rwrDf: dataframe with cell by PPI genes + :param pathwayGMT: pathway database in gmt format + :param permutation: + :param seed: + :param threshold: + """ + def __init__(self, rwrPath, pathwayGMT, log_transform=False, permutation=1000, seed=42, n_cpu=5, out_path='./'): + # load data + self.rwr_path = rwrPath #pd.read_csv(rwrDf, header=0, index_col=0, sep="\t") + self.pathway_gmt = pathwayGMT + self.permutation = int(permutation) + self.seed = int(seed) + self.out_path = out_path + + # settings + np.random.seed(self.seed) + self.n_cpu = int(n_cpu) + if len(self.rwr_path) < self.n_cpu: + self.n_cpu = len(self.rwr_path) + + # prepare pathway genes to save time + print('{:}: collect pathway genes'.format(datetime.now())) + pathway_geneList_dict = self._get_pathway_genes(pathwayGMT) # {pathway: geneList} + # obtain shared genes for calculating score of pathway genes + self.rwrDf = 
self.rwr_path#pd.read_csv(rwrPath, header=0, index_col=0, sep="\t") + if log_transform == True: + print('log transform input data') + self.rwrDf = np.log(self.rwrDf) + pathway_shareGeneList_dict = self._find_overlaps(self.rwrDf, pathway_geneList_dict) # {pathway: shareGeneList} + # generate random gene list for calculating score of random pathway genes + pathway_randomGeneListList_dict = {} + bg_gene_list = self.rwrDf.columns.tolist() # ppi genes + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + pathway_randomGeneListList_dict.update({pathway:[]}) + for p in range(self.permutation): + gene_list = np.random.choice(bg_gene_list, len(shareGeneList)).tolist() + pathway_randomGeneListList_dict[pathway].append(gene_list) + self.pathwayDictList = [pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict] + + # call function + self.netpea_parallel(self.rwrDf, self.pathwayDictList, self.n_cpu, self.out_path) + + def netpea_parallel(self, rwrDf, pathwayDictList, n_cpu, out_path): + # split dataframe + n_partitions = int(n_cpu) + split_list = np.array_split(rwrDf, n_partitions) + # parallel computing + pool = mp.Pool(int(n_cpu)) + df_list = pool.starmap(self.netpea, [(df, pathwayDictList) for df in split_list]) + pool.close() + pool.join() + print('{:}: comple {:} dfs'.format(datetime.now(), len(df_list))) + print(df_list[0]) + + # merge result of all cells and save to file + print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(df_list, axis=0) + zscore_fname = self.out_path + all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + print(all_cell_zscore_df) + + + def netpea(self, rwrDf, pathwayDictList): + """return dataframe with cell by pathway""" + pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict = pathwayDictList + # convert to dataframe with headers=[pathway, #pathway genes, overlap genes] + pathway_df = self._merge_pathway_dict(pathway_geneList_dict, pathway_shareGeneList_dict) + # collect score of random gene list + print('{:}: collect score of random gene list'.format(datetime.now())) + cell_pathway_bgScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_bgScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, randomGeneListList in pathway_randomGeneListList_dict.items(): + bgScoreList = [rwr_df.loc[randomGeneList].mean() for randomGeneList in randomGeneListList] + cell_pathway_bgScoreList_dict[cell].update({pathway:bgScoreList}) + + # collect score of share gene list + print('{:}: collect score of share gene list'.format(datetime.now())) + cell_pathway_ScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_ScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + score = rwr_df.loc[shareGeneList].mean() + cell_pathway_ScoreList_dict[cell].update({pathway:score}) + # ztest to determin significance + print('{:}: ztest to determin significance'.format(datetime.now())) + zscore_dfs = [] + cell_pathway_zscore_dict = {} # collect zscore for each pathway + cell_pathway_ztest_dict = {} # collect zscore and pvalue for each pathway + for cell in rwrDf.index: + 
cell_pathway_zscore_dict.update({cell:{}}) + cell_pathway_ztest_dict.update({cell:{}}) + pathway_score_dict = cell_pathway_ScoreList_dict[cell] + pathway_bgList_dict = cell_pathway_bgScoreList_dict[cell] + for pathway in pathway_geneList_dict.keys(): + score = pathway_score_dict[pathway] + bgList = pathway_bgList_dict[pathway] + [zscore, pvalue] = self._cal_zscore(score, bgList) + cell_pathway_ztest_dict[cell].update({pathway: [zscore, pvalue]}) + cell_pathway_zscore_dict[cell].update({pathway:zscore}) + # save per-cell zscore + cell_zscore_df = pd.DataFrame(cell_pathway_zscore_dict[cell], index=[cell]) + zscore_dfs.append(cell_zscore_df) + # save per-cell ztest results + cell_bgtest_df = pd.DataFrame(cell_pathway_ztest_dict[cell], index=['zscore', 'pvalue']).T + cell_bgtest_df.index.name = 'pathway' + cell_bgtest_df = cell_bgtest_df.join(pathway_df) + #percell_fname = self.out_path + '.' + cell + '.NetPEA.background_result.txt' + #cell_bgtest_df.to_csv(percell_fname, header=True, index=True, sep="\t") + # merge result of all cells and save to file + #print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(zscore_dfs, axis=0) + #zscore_fname = self.out_path + '.NetPEA.zscore.txt' + #all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + + # clear space + pathwayDictList = [] + return all_cell_zscore_df + + def _merge_pathway_dict(self, pathway_geneList_dict, pathway_shareGeneList_dict): + """return dataframe with headers = [pathway, #pathway genes, overlap genes]""" + pathway_lenG_dict = {pathway: len(geneList) for pathway, geneList in pathway_geneList_dict.items()} + pathway_strG_dict = {pathway: ",".join(geneList) for pathway, geneList in pathway_shareGeneList_dict.items()} + df1 = pd.DataFrame(pathway_lenG_dict.items(), columns=['pathway', '#pathway genes']) + df2 = pd.DataFrame(pathway_strG_dict.items(), columns=['pathway', 'overlap genes']) + return df1.set_index('pathway').join(df2.set_index('pathway')) + + def _find_overlaps(self, rwrDf, pathway_dict): + """return diction with pathway:geneList""" + # create result dictionary + result_dict = {} #pathway:sharedGeneList + # get ppiGenes + ppi_gene_list = rwrDf.columns.tolist() + # find overlaps + for pathway, geneList in pathway_dict.items(): + shareGene_list = sorted(list(set(geneList) & set(ppi_gene_list))) + result_dict.update({pathway:shareGene_list}) + return result_dict + + def _cal_zscore(self, score, scoreList): + """return zscore and pvalue by lookup table""" + if np.std(scoreList) != 0: + zscore = (score - np.mean(scoreList) ) / np.std(scoreList) + pvalue = scistat.norm.sf(abs(zscore)) # not pdf + print('score={:}, scoreList={:}, zscore={:}, pvalue={:}'.format( + score, scoreList[:10], zscore, pvalue)) + else: + zscore, pvalue = np.nan, np.nan + return [zscore, pvalue] + + def _cal_similarity_score(self, rwrDf, geneList): + """return similarity score by taking average of rwr for given geneList""" + return rwrDf.loc[geneList].mean() + + def _get_pathway_genes(self, gmt): + """ + Return pathwayStr_geneList_dict + + :param fin: file name to pathway in gmt format + :return pathway_dict: dictionary of pathway as key, genelist as values + """ + pathwayStr_geneList_dict = {} + with open(gmt, 'r') as f: + for line in f: + # extract fields + line = line.strip('\n').split('\t') + pathway_str = line[0] + gene_list = line[2:] + # update to dict + pathwayStr_geneList_dict.update({pathway_str:gene_list}) + return pathwayStr_geneList_dict + + def _df2dict(self, 
df): + """return 1 by N dataframe to dictionary of N keys""" + return df.to_dict('records')[0] # keys are column names = gene nams + + +if __name__ == "__main__": + # timer + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + start_time = datetime.now() + rwr_df = 'test.txt' #'/repo4/ytang4/PHD/db/GDSC/processed/GDSC.MUTCNV.STRING.RWR.txt' + pathway_gmt = '/repo4/ytang4/PHD/db/MSigdb/c2.cp.pid.v7.1.symbols.gmt' + # initiate + cell_pathway_df = NetPEA(rwr_df, pathway_gmt, permutation=3, seed=42, n_cpu=5, out_path='./test_netpea/GDSC') + spend = datetime.strptime(str(datetime.now()), datetimeFormat) - datetime.strptime(str(start_time),datetimeFormat) + print( '[Finished in {:}]'.format(spend) ) + diff --git a/RWR.py b/RWR.py new file mode 100644 index 0000000..5bc82b9 --- /dev/null +++ b/RWR.py @@ -0,0 +1,162 @@ +""" +Return cell by gene probability dataframe + +""" + + +import argparse +import numpy as np +import pandas as pd +import scipy.sparse as scisp +import sklearn.preprocessing as skprc +from datetime import datetime + + +class RWR: + """ + Return probability matrix where columns are PPI genes + + :param ppiPathStr: string representing path to ppi file (with three columns) + :param restartPathStr: string representing path to restart file (i.e., input gene sets) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + :param normalize: string representing normalization method (choices=['l1', 'l2']) + :param weighted: boolean indicating weither to use weighted graph or not (if False, will set weight of all edges to 1) + :param outPathStr: string representing output path + """ + def __init__(self, ppiPathStr, restartPathStr, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True, outPathStr='./'): + # initiating + self.ppiPathStr = ppiPathStr + self.restartPathStr = restartPathStr + self.restartProbFloat = float(restartProbFloat) + self.convergenceFloat = float(convergenceFloat) + self.normalize = normalize + self.weighted = weighted + self.outPathStr = outPathStr + + + + def get_prob(self): + # load PPI graph + print('loading protein-protein interaction network.....') + self.adj_mat, self.name_idx_dict = self.load_graph(self.ppiPathStr, normalize=True, weighted=True) + # mapping dictionary of node index number: node name string + self.idx_name_dict = { idx:name for name, idx in self.name_idx_dict.items() } + + # load restart list (i.e., input gene sets) + print('collecting restart list') + df = pd.read_csv(self.restartPathStr, header=0, sep="\t") + df.columns = ['group', 'gene'] + # collect gene sets by group + grps = df.groupby('group') + grps_dict = {} + for grp in df['group'].unique(): + seed_list = grps.get_group(grp)['gene'].values.tolist() #input gene set + # check if input gene set in ppi and convert name to index number + seed_idx_list = [self.name_idx_dict[i] for i in seed_list if i in self.name_idx_dict.keys()] + # update to dictionary + grps_dict.update({ grp: {'gList':seed_list, 'ppiList':seed_idx_list} }) + + # perform random walk + print('performing random walk.....') + n_grps = len(grps_dict) + grp_list = list(grps_dict.keys()) + grp_prob_dict = {} + n_grp_has_no_ppiList = 0 # number of group has restart list not found on PPI network + for i in range(n_grps): + grp = grp_list[i] + if len(grps_dict[grp]['ppiList']) > 0: # has restart list on PPI network + prob_list = self.run_single_rwr(self.adj_mat, grps_dict[grp]['ppiList'], 
restartProbFloat=self.restartProbFloat, convergenceFloat=self.convergenceFloat) + + else: + n_grp_has_no_ppiList += 1 + prob_list = [0.0] * len(self.name_idx_dict) + + # update to result + grp_prob_dict.update( {grp:prob_list} ) + + # reformat result: dict2fataframe + print('finalizing result of probability matrix.....') + result_df = pd.DataFrame(grp_prob_dict) + result_df = result_df.T + result_df.columns = list(self.name_idx_dict.keys()) + return result_df # probability matrix grp by ppi genes + + + def load_graph(self, ppiPathStr, normalize=True, weighted=True): + """ + Return a graph in adjacency matrix format and its name string and correspoing index number mapping dictionary + + :param ppiPathStr: string representing file name of a graph in edge list format + :param name2index: boolean indicating whether to convert name string to index number or not + :param normalize: boolean indicating whether to perform column-wised normalization + """ + # load data + df = pd.read_pickle(ppiPathStr) + df.columns = ['source', 'target', 'weight'] + + # convert name to index + all_nodes = sorted(list(set( df['source'] ) | set( df['target'] ))) # retrieve name strings of all nodes + + # create name:index mapping dictionary + gnm_gid_dict = { all_nodes[i]:i for i in range(len(all_nodes)) } + + # replace name string with index number + df['source'].update(df['source'].map(gnm_gid_dict)) + df['target'].update(df['target'].map(gnm_gid_dict)) + + # use weighted graph or unweighted graph + if weighted == False: + df['weight'] = 1 # unweighted graph + + # create adjancency matrix + network_matrix = scisp.csr_matrix((df['weight'].values, (df['source'].values, df['target'].values)), + shape=(len(all_nodes), len(all_nodes)), dtype=float) # Create sparse matrix + network_matrix = (network_matrix + network_matrix.T) # Make the ajdacency matrix symmetric + network_matrix.setdiag(0) # Set diagnoals to zero + + # normalization: Normalize the rows of network_matrix because we are multiplying vector by matrix (from left) + if normalize == True: + network_matrix = skprc.normalize(network_matrix, norm='l1', axis=1) + + # return + return network_matrix, gnm_gid_dict + + def run_single_rwr(self, ppiAdjMat, restartList, restartProbFloat=0.5, convergenceFloat=0.00001): + """ + Return + + :param ppiAdjMat: adjacency matrix of protein-protein interaction network + :param restartList: list of restart nodes (i.e., gene list) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + """ + # settings + convergence_criterion_float = float(convergenceFloat) # stops when vector L1 norm drops below 10^(-5) + restartProbFloat = float(restartProbFloat) + residual_float = 1.0 # difference between p^(t + 1) and p^(t) + max_iter = 1000 + + # initialze probability vector for restart nodes + prob_seed_list = [0] * ppiAdjMat.shape[0] + for idx in restartList: + prob_seed_list[idx] = 1.0 #1/float(len(restartList)) + prob_seed_arr = np.array(prob_seed_list) + steady_prob_old = prob_seed_arr + + # RWR + iter_int = 0 + #print('updating probability array.....') + while (residual_float > convergence_criterion_float): + # update vector + steady_prob_new = scisp.csr_matrix.dot(steady_prob_old, ppiAdjMat) + steady_prob_new *= (1 - restartProbFloat) + steady_prob_new += (prob_seed_arr * restartProbFloat) + + # Calculate the residual -- the sum of the absolute + # differences of the new node probability vector minus the old + # diff_norm = 
np.linalg.norm(np.subtract(p_t_1, p_t), 1) + residual_float = abs(steady_prob_new - steady_prob_old).sum() + steady_prob_old = steady_prob_new.copy() + return steady_prob_old + From 1c0b48c58d1b5a9ca8948fd5b9e237fadbb89fd9 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 19 Jul 2023 17:10:12 -0500 Subject: [PATCH 011/254] update params --- PathDSP_params.txt | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 07d0614..ec939cb 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,23 +1,39 @@ [Global_Params] -data_url="http://drugcell.ucsd.edu/downloads/" -original_data="data.tgz" + +model_name='PathDSP' +data_url='https://zenodo.org/record/6093818/files/' +improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/' +original_data_url='https://zenodo.org/record/7532963/' +original_data='input.zip' +gene_set = 'MSigdb.zip' +ppi_data = 'STRING.zip' +drug_target = 'raw_data.zip' +train_data = 'PathDSP_train.txt' +test_data = 'PathDSP_test.txt' +val_data = 'PathDSP_val.txt' +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +final_input_file='input.txt' +output='Result/' +bit_int=128 +permutation_int=100 +seed_int=42 +cpu_int=20 + +#Model parameter +metric='auc1' +data_type='CCLE' CUDA_ID = 0 -load = "drugcell_v1.pt" -train_data = "../data/drugcell_train.txt" -test_data = "../data/drugcell_test.txt" -val_data = "../data/drugcell_val.txt" -onto = "drugcell_ont.txt" learning_rate = 0.001 batch_size = 1000 +eps=0.00001 genotype_hiddens = 6 drug_hiddens='100,50,6' final_hiddens=6 -genotype="cell2mutation.txt" -fingerprint='drug2fingerprint.txt' -cell2id='../data/cell2ind.txt' -drug2id='../data/drug2ind.txt' -output_dir = "MODEL" epochs=200 -optimizer = "adam" -loss = "mse" -predict="drugcell_all.txt" +optimizer = 'adam' +loss = 'mse' +improve_analysis='no' From 5b683192142ecdc5f98b44369a35fa0ac789a658 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 19 Jul 2023 17:10:30 -0500 Subject: [PATCH 012/254] add gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0c33311 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.ipynb_checkpoints/ +PathDSP/__pycache__/ +__pycache__/ + From 82541c03d252e2bae2af7ef6e2c7b5075e1c1685 Mon Sep 17 00:00:00 2001 From: willherbert27 Date: Mon, 24 Jul 2023 11:55:26 -0500 Subject: [PATCH 013/254] EXP processing --- preprocess_new.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/preprocess_new.py b/preprocess_new.py index ae219ae..66725ee 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -28,6 +28,9 @@ # import NetPEA modules import RWR as rwr import NetPEA as pea +#import gsea module +import gseapy as gp + @@ -290,6 +293,31 @@ def prep_input(params): comb_data_mtx['response'] = response_df[params['metric']] comb_data_mtx.to_csv(params['final_input_file'], sep = '\t', header= True, index=False) +def run_ssgsea(params): + expMat = improve_utils.load_gene_expression_data(sep='\t') + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + gmt = pathway_path + tmp_str = '/ssgsea/' + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = 
gp.ssgsea(data=gct, #gct: a matrix of gene by sample + gene_sets=gmt, #gmt format + outdir=tmp_str, + scale=True, + permutation_num=2, #1000 + no_plot=True, + processes=10, + #min_size=0, + format='png') + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str+'ssGSEA.txt', header=True, index=True, sep="\t") + + def candle_main(anl): params = initialize_parameters() data_dir = os.environ['CANDLE_DATA_DIR'] + params['model_name'] + '/Data/' From a1d15b27da908184460f5fcb4882f7cb52e7f285 Mon Sep 17 00:00:00 2001 From: willherbert27 Date: Mon, 24 Jul 2023 12:22:40 -0500 Subject: [PATCH 014/254] updated to integrate with prep_input --- preprocess_new.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/preprocess_new.py b/preprocess_new.py index 66725ee..82ecbe4 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -317,6 +317,21 @@ def run_ssgsea(params): result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) result_mat.to_csv(tmp_str+'ssGSEA.txt', header=True, index=True, sep="\t") + f = open(tmp_str+'ssGSEA.txt', 'r') + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + + df = pd.DataFrame(total_dict) + + df.to_csv(params['data_dir'] + '/' + 'exp_file') + def candle_main(anl): params = initialize_parameters() From d09d21afb2740e84bf6f3839a7a6258ae3bb5b9a Mon Sep 17 00:00:00 2001 From: liuy12 Date: Wed, 26 Jul 2023 14:33:45 -0700 Subject: [PATCH 015/254] add definition file --- PathDSP.def | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 PathDSP.def diff --git a/PathDSP.def b/PathDSP.def new file mode 100644 index 0000000..48d2687 --- /dev/null +++ b/PathDSP.def @@ -0,0 +1,53 @@ +Bootstrap: docker +From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + +%labels + MANTAINER Yuanhang Liu + +%setup + cp ./src/Singularity_gpu_fix.sh $SINGULARITY_ROOTFS + # add local url of this repository for testing + + +%environment + PATH=$PATH:/usr/local/PathDSP + MODEL_DIR=/usr/local/PathDSP + CANDLE_DATA_DIR=/candle_data_dir + +%post + apt-get update -y + apt-get install wget -y + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80 + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC + + apt-get install build-essential -y + apt-get install git -y + apt-get install vim -y + apt-get install subversion -y + + # install gpu fix and clean up + cd / + chmod +x Singularity_gpu_fix.sh + ./Singularity_gpu_fix.sh + rm Singularity_gpu_fix.sh + + # these three need to be compiled and linked to the cuda libs. + # at the moment, what works for me is to build these in a + # singularity shell in a sandbox with the --nv flag to singularity set. 
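+    # As an illustration only (an assumed workflow, not commands run by this
+    # recipe), that sandbox route might look like:
+    #   singularity build --sandbox pathdsp_sandbox/ PathDSP.def
+    #   singularity shell --nv --writable pathdsp_sandbox/
+    # i.e. a writable sandbox image opened with the host GPU libraries bound,
+    # so the CUDA-linked packages can be compiled interactively before the
+    # final image is built.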
+ + + # create default internal candle_data_dir, map external candle_data_dir here + mkdir /candle_data_dir + + #install python modules and model prerequites + cd /usr/local + git clone -b develop git@github.com:Liuy12/PathDSP.git + cd PathDSP + + conda env create -f environment.yml + conda activate PathDSP_env + pip install --upgrade pip + python3 -m pip install git+https://github.com/ECP-CANDLE/candle_lib@develop + + + chmod a+x *.sh \ No newline at end of file From f58939ae817585c09e77c0031ef385281414a638 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 26 Jul 2023 18:54:55 -0700 Subject: [PATCH 016/254] update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0c33311..f72ee49 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .ipynb_checkpoints/ PathDSP/__pycache__/ __pycache__/ +EDA.ipynb From 06384a5d05b01cc8a6b4a1f1d31edbf274c09178 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 26 Jul 2023 18:55:22 -0700 Subject: [PATCH 017/254] update filename for ssGSEA --- preprocess_new.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocess_new.py b/preprocess_new.py index 82ecbe4..98b1dbd 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -298,7 +298,7 @@ def run_ssgsea(params): gct = expMat.T # gene (rows) cell lines (columns) pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' gmt = pathway_path - tmp_str = '/ssgsea/' + tmp_str = params['data_dir'] if not os.path.isdir(tmp_str): os.mkdir(tmp_str) @@ -330,7 +330,7 @@ def run_ssgsea(params): df = pd.DataFrame(total_dict) - df.to_csv(params['data_dir'] + '/' + 'exp_file') + df.to_csv(params['exp_file']) def candle_main(anl): From 6780bbd844c94e08f2cf7ab2a8a6770fa6f9df2a Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 26 Jul 2023 18:55:45 -0700 Subject: [PATCH 018/254] add FNN_new --- PathDSP/FNN_new.py | 272 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 PathDSP/FNN_new.py diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py new file mode 100644 index 0000000..ec72577 --- /dev/null +++ b/PathDSP/FNN_new.py @@ -0,0 +1,272 @@ +""" +Train a neural network for regression task: + cv: 10 + batch size: 8 + initializer: He normal initializer + optimizer: AdamMax + learning rate: 0.0004 + loss: RMSE + +Calculate RMSE at once, Oct. 
3, 2020 revised +""" + + +import argparse +import numpy as np +import pandas as pd +import scipy.stats as scistat +from datetime import datetime + +import sklearn.preprocessing as skpre +import sklearn.model_selection as skms +import sklearn.metrics as skmts +import sklearn.utils as skut + +import torch as tch +import torch.utils.data as tchud + +import myModel as mynet +import myFit as myfit +import myDataloader as mydl +import myDatasplit as mysplit +import myUtility as myutil +#import myPlotter as myplot +import myMetrics as mymts + +import shap as sp + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=30, verbose=True) # initialize the early_stopping + # repeat the training for EPOCH times + for epoch in range(epochs): + ## training phase + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # train result + train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the 
validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + # load the last checkpoint with the best model + net.load_state_dict(tch.load('checkpoint.pt')) + + return net, trainloss_list, validloss_list + +def predict(net, test_dl, device): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + # return + return prediction_list + + +def main(params): + start_time = datetime.now() + # load data + df = pd.read_csv(params.final_input_file, header=0, index_col=[0,1], sep="\t") + + # shuffle + sdf = skut.shuffle(df, random_state=params.seed_int) + + # set parameters + myutil.set_seed(params.seed_int) + device = myutil.get_device(uth=params.gpu_int) + kFold = params.cv_int + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + opt_fn = tch.optim.Adam + + # create result list + loss_df_list = [] + score_df_list = [] + ytest_df_list = [] + shap_df_list = [] + # train with cross-validation + kf = skms.KFold(n_splits=kFold, random_state=params['seed_int'], shuffle=True) + X_df = sdf.iloc[:, 0:-1] + y_df = sdf.iloc[:, -1] + # save best model with lowest RMSE + best_rmse = 10000 + best_model = None + best_fold = 0 + for i, (train_index, test_index) in enumerate(kf.split(X_df, y_df)): + n_fold = i+1 + print('Fold={:}/{:}'.format(n_fold, params['cv_int'])) + # get train/test splits + Xtrain_arr = X_df.values[train_index] + Xtest_arr = X_df.values[test_index] + ytrain_arr = y_df.values[train_index] + ytest_arr = y_df.values[test_index] + # get train/valid splits from train + Xtrain_arr, Xvalid_arr, ytrain_arr, yvalid_arr = skms.train_test_split(Xtrain_arr, ytrain_arr, + test_size=0.1, random_state=params['seed_int']) + print(' train={:}, valid={:}, test={:}'.format(Xtrain_arr.shape, Xvalid_arr.shape, Xtest_arr.shape)) + # prepare dataframe for output + ytest_df = y_df.iloc[test_index].to_frame() + # convert to numpy array + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) + test_dl = 
tchud.DataLoader(test_dataset, batch_size=batch_size, shuffle=False) + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + net.apply(init_weights) + # fit data with model + trained_net, train_loss_list, valid_loss_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) + prediction_list = predict(trained_net, test_dl, device) + # evaluation metrics + mse = skmts.mean_squared_error(ytest_arr, prediction_list) + rmse = np.sqrt(mse) + if rmse <= best_rmse: + best_rmse = rmse + best_fold = n_fold + best_model = trained_net + print('best model so far at fold={:}, rmse={:}'.format(best_fold, best_rmse)) + + + if params['shap_bool'] == True: + print('calculate shapely values') + # random select 100 samples as baseline + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=200, shuffle=True) + background, lbl = next(iter(train_dl)) + explainer = sp.DeepExplainer(trained_net, background[:100].to(device)) + shap_arr = explainer.shap_values(tch.from_numpy(Xtest_arr)) + shap_df = pd.DataFrame(shap_arr, index=ytest_df.index, columns=X_df.columns) + # append to result + shap_df_list.append(shap_df) + + # collect result + loss_df = pd.DataFrame({'fold':[n_fold]*len(train_loss_list), + 'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list}) + ytest_df['prediction'] = prediction_list + ytest_df['fold'] = n_fold + loss_df_list.append(loss_df) + ytest_df_list.append(ytest_df) + # end of fold + trained_net = None + break + + # save to output + all_ytest_df = pd.concat(ytest_df_list, axis=0) + all_loss_df = pd.concat(loss_df_list, axis=0) + all_ytest_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.Prediction.txt', header=True, index=True, sep="\t") + all_loss_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.Loss.txt', header=True, index=False, sep="\t") + if params['shap_bool'] == True: + all_shap_df = pd.concat(shap_df_list, axis=0) + all_shap_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.SHAP.txt', header=True, index=True, sep="\t") + + # make train/valid loss plots + tch.save(best_model.state_dict(), params['output'] + '.FNN.cv_' + str(params['cv_int']) + 'best_model.pt') + print( '[Finished in {:}]'.format(myutil.cal_time(datetime.now(), start_time)) ) + # display evaluation metrics of all folds + mse, rmse, r_square, pccy = mymts.eval_regressor_performance(all_ytest_df, 'response', 'prediction') + + + + +if __name__ == "__main__": + main() \ No newline at end of file From dc203cfdf91afde7d0f7eb9f831ee5af97f1fb47 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 26 Jul 2023 18:56:16 -0700 Subject: [PATCH 019/254] add train/infer --- infer.py | 267 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ train.py | 68 ++++++++++++++ train.sh | 83 +++++++++++++++++ 3 files changed, 418 insertions(+) create mode 100755 infer.py create mode 100644 train.py create mode 100755 train.sh diff --git a/infer.py b/infer.py new file mode 100755 index 0000000..da26b33 --- /dev/null +++ b/infer.py @@ -0,0 +1,267 @@ +import os +import candle +import pandas as pd +import torch +import torchvision +import numpy as np +import networkx as nx +import networkx.algorithms.components.connected as nxacc +import networkx.algorithms.dag as 
nxadag +import torch.utils.data as du +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchmetrics.functional import mean_absolute_error +from scipy.stats import spearmanr +import torch.nn as nn +import torch.nn.functional as F +#from code.predict_drugcell import main +import sklearn +from code.utils.util import * +from code.drugcell_NN import * +from code.utils.util import load_mapping +from code.utils.util import load_train_data +from code.utils.util import build_input_vector +from code.utils.util import pearson_corr +from code.utils.util import prepare_predict_data +from time import time + +file_path = os.path.dirname(os.path.realpath(__file__)) +print(file_path) + +# Just because the tensorflow warnings are a bit verbose +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +# This should be set outside as a user environment variable +os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' + + +# additional definitions +additional_definitions = [ + { + "name": "batchsize", + "type": int, + "help": "...", + }, + { + "name": "gene2id", + "type": str, + "help": "path to gene2id file", + }, + { + "name": "drug2id", + "type": str, + "help": "path to drug to ID file", + }, + { + "name": "cell2id", + "type": str, + "help": "Path to cell 2 id file", + }, + { + "name": "hidden", + "type": str, + "help": "string to indicate hidden output layer ", + }, + { + "name": "cuda", + "type": int, + "help": "CUDA ID", + }, + { + "name": "result", + "type": str, + "help": "result file name", + }, +] + +# required definitions +required = [ + "genotype", + "fingerprint", +] + +# initialize class +class DrugCell_candle(candle.Benchmark): + def set_locals(self): + """ + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
+ """ + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definisions = additional_definitions + + +def initialize_parameters(): + preprocessor_bmk = DrugCell_candle(file_path, + 'DrugCell_params.txt', + 'pytorch', + prog='DrugCell_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + candle_data_dir = os.getenv("CANDLE_DATA_DIR") + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + +def load_mapping(map_file): + mapping = {} + with open(map_file) as fin: + for raw_line in fin: + line = raw_line.strip().split() + mapping[line[1]] = int(line[0]) + return mapping + +def load_train_data(drug_data, cell2id_dict, drug2id_dict): + data = [] + label = [] + with open(drug_data) as fin: + for raw_line in fin: + tokens = raw_line.strip().split('\t') + data.append([cell2id_dict[tokens[0]], drug2id_dict[tokens[1]]]) + label.append([float(tokens[2])]) + return data, label + + +def predict_dcell(predict_data, gene_dim, drug_dim, model_file, hidden_folder, + batch_size, result_file, cell_features, drug_features, CUDA_ID,output_dir): + feature_dim = gene_dim + drug_dim + device = torch.device("cuda") + model = torch.load(model_file, map_location='cuda:%d' % CUDA_ID) +# checkpoint = torch.load(model_file, map_location='cuda:%d' % CUDA_ID) + #model = torch.load(model_file, map_location='cuda:0') + model.to(device) +# model.load_state_dict(checkpoint['model_state_dict']) +# optimizer.load_state_dict(checkpoint['optimizer_state_dict']) +# epoch = checkpoint['epoch'] +# loss = checkpoint['loss'] + #model = torch.load(model_file, map_location='cuda:%d' % CUDA_ID) + + predict_feature, predict_label, feature_dict = predict_data + + predict_label_gpu = predict_label.cuda(CUDA_ID) + model.cuda(CUDA_ID) + model.eval() + + test_loader = du.DataLoader(du.TensorDataset(predict_feature,predict_label), batch_size=batch_size, shuffle=False) + model_dir = output_dir + + #Test + test_predict = torch.zeros(0,0).cuda(CUDA_ID) + term_hidden_map = {} + test_loss = 0 + batch_num = 0 + test_loss_list = [] + test_corr_list = [] + test_r2_list = [] + drug_list = [] + tissue_list = [] + print("Begin test evaluation") + for i, (inputdata, labels) in enumerate(test_loader): + # Convert torch tensor to Variable + cuda_labels = torch.autograd.Variable(labels.cuda(CUDA_ID)) + features = build_input_vector(inputdata, cell_features, drug_features) + cuda_features = Variable(features.cuda(CUDA_ID), requires_grad=False) + loss = nn.MSELoss() + values = inputdata.cpu().detach().numpy().tolist() + keys = [i for i in feature_dict for x in values if feature_dict [i]== x ] + tissue = [i.split(';')[0] for i in keys] + tissue_list.append(tissue) + drug = [i.split(';')[1] for i in keys] + drug_list.append(drug) + # make prediction for test data + aux_out_map, term_hidden_map = model(cuda_features) + if test_predict.size()[0] == 0: + test_predict = aux_out_map['final'].data + loss_a = loss(test_predict, cuda_labels) + print(loss_a) + test_loss += loss_a.item() + else: + test_predict = torch.cat([test_predict, aux_out_map['final'].data], dim=0) + loss_a = loss(test_predict, cuda_labels) + print(loss_a) + test_loss += loss_a.item() + batch_num += 1 + + predictions = np.array([p.cpu() for preds in test_predict for p in preds] ,dtype = np.float ) + predictions = predictions[0:len(predictions)] + labels = np.array([l.cpu() for label in labels for l in label],dtype = np.float) + labels = labels[0:len(labels)] + test_pearson_a = 
pearson_corr(torch.Tensor(predictions), torch.Tensor(labels)) + test_spearman_a = spearmanr(labels, predictions)[0] + test_mean_absolute_error = sklearn.metrics.mean_absolute_error(y_true=labels, y_pred=predictions) + test_r2_a = sklearn.metrics.r2_score(y_true=labels, y_pred=predictions) + test_rmse_a = np.sqrt(np.mean((predictions - labels)**2)) + test_loss_a = test_loss / len(test_loader) + epoch_end_time = time() + test_loss_a = test_loss/len(test_loader) + test_loss_list.append(test_loss_a) + test_corr_list.append(test_pearson_a.cpu().detach().numpy()) + test_r2_list.append(test_r2_a) + min_test_loss = test_loss_a + scores = {} + scores['test_loss'] = min_test_loss + scores['test_pcc'] = test_pearson_a.cpu().detach().numpy().tolist() + scores['test_MSE'] = test_mean_absolute_error + scores['test_r2'] = test_r2_a + scores['test_scc'] = test_spearman_a + test_corr = pearson_corr(test_predict, predict_label_gpu) + print("Test pearson corr\t%s\t%.6f" % (model.root, test_corr)) + cols = ['drug', 'tissue', 'test_loss', 'test_corr', 'test_r2'] + metrics_test_df = pd.DataFrame(columns=cols, index=range(len(test_loader))) + metrics_test_df['test_loss'] = test_loss_list + metrics_test_df['test_corr'] = test_corr_list + metrics_test_df['test_r2'] = test_r2_list + loss_results_name = str(result_file+'/test_metrics_results.csv') + metrics_test_df.to_csv(loss_results_name, index=False) + np.savetxt(result_file+'/drugcell.predict', test_predict.cpu().numpy(),'%.4e') + + +def run(params): + keys_parsing = ["train_data", "test_data", "val_data", + "onto", "genotype_hiddens", "fingerprint", + "genotype", "cell2id","drug2id", "drug_hiddens", + "model_name"] + model_param_key = [] + for key in params.keys(): + if key not in keys_parsing: + model_param_key.append(key) + model_params = {key: params[key] for key in model_param_key} + params['model_params'] = model_params + args = candle.ArgumentStruct(**params) + cell2id_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['cell2id'] + drug2id_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['drug2id'] + gene2id_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['gene2id'] + genotype_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['genotype'] + fingerprint_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['fingerprint'] + hidden_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['hidden'] + result_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['result'] + val_data = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['val_data'] + trained_model = params['data_model'] + hidden = params['drug_hiddens'] + batchsize = params['batch_size'] + cell_features = np.genfromtxt(genotype_path, delimiter=',') + drug_features = np.genfromtxt(fingerprint_path, delimiter=',') + CUDA_ID = params['cuda_id'] + num_cells = len(cell2id_path) + num_drugs = len(drug2id_path) + num_genes = len(gene2id_path) + drug_dim = len(drug_features[0,:]) + output_dir = params['output_dir'] + trained_model = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + os.path.join(output_dir) + "/" + "model_final.pt" + print(trained_model) + predict_data = prepare_predict_data(val_data, cell2id_path, drug2id_path) + predict_dcell(predict_data, num_genes, drug_dim, trained_model, hidden_path, batchsize, + result_path, cell_features, drug_features, CUDA_ID, output_dir) + + +def candle_main(): + params = 
initialize_parameters() + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/train.py b/train.py new file mode 100644 index 0000000..bfb1dd0 --- /dev/null +++ b/train.py @@ -0,0 +1,68 @@ +import candle +import os +import json +from json import JSONEncoder +from preprocess_new import mkdir, preprocess +from PathDSP.FNN_new import main + +file_path = os.path.dirname(os.path.realpath(__file__)) +# This should be set outside as a user environment variable +os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' +required = None +additional_definitions = None + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. + ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + +class CustomData: + def __init__(self, name, value): + self.name = name + self.value = value + +class CustomEncoder(json.JSONEncoder): + def default(self, o): + return o.__dict__ + + +def run(params): + params['data_type'] = str(params['data_type']) + json_out = params['output_dir']+'/params.json' + print(params) + + with open (json_out, 'w') as fp: + json.dump(params, fp, indent=4, cls=CustomEncoder) + + scores = main(params) + with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: + json.dump(scores, f, ensure_ascii=False, indent=4) +# print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) + + +def candle_main(): + params = initialize_parameters() + params = preprocess(params) + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/train.sh b/train.sh new file mode 100755 index 0000000..b71f3e8 --- /dev/null +++ b/train.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=train.py + +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! 
-f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi + +if [ $# -lt 2 ]; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + +if [ -d ${CANDLE_DATA_DIR} ]; then + if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then + echo "using data from ${CANDLE_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${CANDLE_DATA_DIR}" + fi +fi + +export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} +FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" +echo $FULL_DATA_DIR + +if [ -d ${FULL_DATA_DIR} ]; then + if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then + echo "using data from ${FULL_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" + fi +else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" +fi + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD From 0c43c0df8dc49d2fc061ad13e5135040e8e2acdc Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 26 Jul 2023 18:56:30 -0700 Subject: [PATCH 020/254] update params --- PathDSP_params.txt | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index ec939cb..5b32763 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -20,20 +20,21 @@ final_input_file='input.txt' output='Result/' bit_int=128 permutation_int=100 -seed_int=42 -cpu_int=20 + #Model parameter +seed_int=42 +cpu_int=20 +cv_int=1 +gpu_int=0 metric='auc1' data_type='CCLE' -CUDA_ID = 0 learning_rate = 0.001 -batch_size = 1000 +batch_size = 12 eps=0.00001 -genotype_hiddens = 6 drug_hiddens='100,50,6' final_hiddens=6 -epochs=200 +epochs=800 optimizer = 'adam' loss = 'mse' improve_analysis='no' From 8d9ebcfe2b54c655cca352c94aff9bf94251a5cc Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 26 Jul 2023 18:57:21 -0700 Subject: [PATCH 021/254] add .yml --- environment.yml | 230 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..97d231d --- /dev/null +++ b/environment.yml @@ -0,0 +1,230 @@ +name: PathDSP_env +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - anyio=3.6.2 + - appdirs=1.4.4 + - argon2-cffi=21.3.0 + - argon2-cffi-bindings=21.2.0 + - asttokens=2.2.1 + - async-lru=2.0.2 + - attrs=23.1.0 + - babel=2.12.1 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.4 + - beautifulsoup4=4.12.2 + - blas=1.0 + - bleach=6.0.0 + - boost=1.78.0 + - boost-cpp=1.78.0 + 
- bottleneck=1.3.5 + - brotli=1.0.9 + - brotli-bin=1.0.9 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.5.7 + - cairo=1.16.0 + - certifi=2023.5.7 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cloudpickle=2.2.1 + - colorama=0.4.6 + - comm=0.1.3 + - contourpy=1.0.7 + - cryptography=39.0.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - defusedxml=0.7.1 + - entrypoints=0.4 + - executing=1.2.0 + - expat=2.5.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - flit-core=3.9.0 + - fontconfig=2.14.1 + - fonttools=4.39.4 + - freetype=2.12.1 + - gettext=0.21.1 + - giflib=5.2.1 + - glib=2.76.3 + - glib-tools=2.76.3 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.2 + - icu=72.1 + - idna=3.4 + - importlib-metadata=6.6.0 + - importlib_metadata=6.6.0 + - importlib_resources=5.12.0 + - intel-openmp=2023.1.0 + - ipykernel=6.23.1 + - ipython=8.13.2 + - jedi=0.18.2 + - jinja2=3.1.2 + - joblib=1.2.0 + - jpeg=9e + - json5=0.9.5 + - jsonschema=4.17.3 + - jupyter-lsp=2.1.0 + - jupyter_client=8.2.0 + - jupyter_core=4.12.0 + - jupyter_events=0.6.3 + - jupyter_server=2.5.0 + - jupyter_server_terminals=0.4.4 + - jupyterlab=4.0.0 + - jupyterlab_pygments=0.2.2 + - jupyterlab_server=2.22.1 + - kiwisolver=1.4.4 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libblas=3.9.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcblas=3.9.0 + - libdeflate=1.17 + - libexpat=2.5.0 + - libffi=3.4.4 + - libgcc-ng=12.2.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libglib=2.76.3 + - libiconv=1.17 + - libidn2=2.3.4 + - liblapack=3.9.0 + - libllvm11=11.1.0 + - libpng=1.6.39 + - libsodium=1.0.18 + - libstdcxx-ng=12.2.0 + - libtasn1=4.19.0 + - libtiff=4.5.0 + - libunistring=0.9.10 + - libuuid=1.41.5 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.10.4 + - libzlib=1.2.13 + - llvm-openmp=16.0.4 + - llvmlite=0.39.1 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.7.1 + - matplotlib-inline=0.1.6 + - mistune=2.0.5 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - munkres=1.1.4 + - nbclient=0.8.0 + - nbconvert-core=7.4.0 + - nbformat=5.8.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=2.8.4 + - notebook-shim=0.2.3 + - numba=0.56.4 + - numexpr=2.8.4 + - numpy=1.21.6 + - openh264=2.1.1 + - openssl=1.1.1t + - packaging=23.0 + - pandas=1.5.3 + - pandocfilters=1.5.0 + - parso=0.8.3 + - patsy=0.5.3 + - pcre2=10.40 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.0.1 + - pixman=0.40.0 + - pkgutil-resolve-name=1.3.10 + - pooch=1.4.0 + - prometheus_client=0.16.0 + - prompt-toolkit=3.0.38 + - prompt_toolkit=3.0.38 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pycairo=1.23.0 + - pycparser=2.21 + - pygments=2.15.1 + - pyopenssl=23.0.0 + - pyparsing=3.0.9 + - pyrsistent=0.19.3 + - pysocks=1.7.1 + - python=3.10.11 + - python-dateutil=2.8.2 + - python-fastjsonschema=2.17.1 + - python-json-logger=2.0.7 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyyaml=6.0 + - pyzmq=25.0.2 + - rdkit=2023.03.1 + - readline=8.2 + - reportlab=3.6.13 + - requests=2.29.0 + - rfc3339-validator=0.1.4 + - rfc3986-validator=0.1.1 + - scikit-learn=1.0.2 + - scipy=1.10.1 + - seaborn=0.12.2 + - seaborn-base=0.12.2 + - send2trash=1.8.2 + - setuptools=66.0.0 + - shap=0.41.0 + - six=1.16.0 + - slicer=0.0.7 + - sniffio=1.3.0 + - soupsieve=2.3.2.post1 + - sqlalchemy=1.4.46 
+ - sqlite=3.41.2 + - stack_data=0.6.2 + - statsmodels=0.14.0 + - sympy=1.11.1 + - tbb=2021.8.0 + - terminado=0.17.1 + - threadpoolctl=3.1.0 + - tinycss2=1.2.1 + - tk=8.6.12 + - tomli=2.0.1 + - torchvision=0.15.2 + - tornado=6.3.2 + - tqdm=4.65.0 + - traitlets=5.9.0 + - typing_extensions=4.5.0 + - tzdata=2023c + - unicodedata2=15.0.0 + - urllib3=1.26.15 + - wcwidth=0.2.6 + - webencodings=0.5.1 + - websocket-client=1.5.2 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.4.2 + - yaml=0.2.5 + - zeromq=4.3.4 + - zipp=3.15.0 + - zlib=1.2.13 + - zstd=1.5.5 +prefix: /homes/ac.liu.yuanhang/miniconda3/envs/improve_env From 027117e7c1a88257887ecce9bf64fc0f84d0a1c7 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 3 Aug 2023 12:26:45 -0700 Subject: [PATCH 022/254] update params --- PathDSP/FNN_new.py | 13 +++++++------ PathDSP_params.txt | 6 +++--- preprocess_new.py | 22 +++++++++++++++------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index ec72577..210b641 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -148,15 +148,16 @@ def predict(net, test_dl, device): def main(params): start_time = datetime.now() # load data - df = pd.read_csv(params.final_input_file, header=0, index_col=[0,1], sep="\t") - + train_df = pd.read_csv(params['train_data'], header=0, index_col=[0,1], sep="\t") + val_df = pd.read_csv(params['val_data'], header=0, index_col=[0,1], sep="\t") + # shuffle - sdf = skut.shuffle(df, random_state=params.seed_int) + sdf = skut.shuffle(df, random_state=params["seed_int"]) # set parameters - myutil.set_seed(params.seed_int) - device = myutil.get_device(uth=params.gpu_int) - kFold = params.cv_int + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=params["gpu_int"]) + kFold = params["cv_int"] learning_rate = params['learning_rate'] epoch = params['epochs'] batch_size = params['batch_size'] diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 5b32763..2c0347e 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -16,10 +16,12 @@ dgnet_file='DGnet.txt' mutnet_file='MUTnet.txt' cnvnet_file='CNVnet.txt' exp_file='EXP.txt' -final_input_file='input.txt' output='Result/' bit_int=128 permutation_int=100 +metric='auc1' +data_type='CCLE' +split=0 #Model parameter @@ -27,8 +29,6 @@ seed_int=42 cpu_int=20 cv_int=1 gpu_int=0 -metric='auc1' -data_type='CCLE' learning_rate = 0.001 batch_size = 12 eps=0.00001 diff --git a/preprocess_new.py b/preprocess_new.py index 98b1dbd..e1a4848 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -268,7 +268,7 @@ def prep_input(params): MUTnet = MUTnet.add_suffix('_mutnet').reset_index().rename(columns={'index': 'sample_id'}) EXP = pd.read_csv(params['exp_file'], sep = '\t', index_col=0) EXP = EXP.add_suffix('_exp').reset_index().rename(columns={'index': 'sample_id'}) - response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=0, + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], split_type=['train', 'test', 'val'], y_col_name= params['metric']) response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) @@ -278,7 +278,6 @@ def prep_input(params): common_sample_ids = reduce(np.intersect1d, (CNVnet['sample_id'], MUTnet['sample_id'], EXP['sample_id'] , response_df['sample_id'])) response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & 
(response_df['sample_id'].isin(common_sample_ids)), :] - drug_mbit_df = drug_mbit_df.loc[drug_mbit_df['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() DGnet = DGnet.loc[DGnet['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() CNVnet = CNVnet.loc[CNVnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() @@ -287,11 +286,20 @@ def prep_input(params): drug_data = drug_mbit_df.join(DGnet) sample_data = CNVnet.join([MUTnet, EXP]) - comb_data_mtx = pd.DataFrame({'drug_id': response_df['drug_id'].values, - 'sample_id': response_df['sample_id'].values}) - comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id').reset_index() - comb_data_mtx['response'] = response_df[params['metric']] - comb_data_mtx.to_csv(params['final_input_file'], sep = '\t', header= True, index=False) + ## export train,val,test set + for i in ['train', 'test', 'val']: + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + split_type=i, + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + (response_df['sample_id'].isin(common_sample_ids)), :] + comb_data_mtx = pd.DataFrame({'drug_id': response_df['drug_id'].values, + 'sample_id': response_df['sample_id'].values}) + comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id') + comb_data_mtx['response'] = response_df[params['metric']] + comb_data_mtx.to_csv(params[i + '_data'], sep = '\t', header= True, index=False) + def run_ssgsea(params): expMat = improve_utils.load_gene_expression_data(sep='\t') From be7baabdae24eea3fe0e39f734cd33f4bfaecd87 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 10:24:45 -0700 Subject: [PATCH 023/254] update conda path --- preprocess.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/preprocess.sh b/preprocess.sh index 973d006..eb579cd 100644 --- a/preprocess.sh +++ b/preprocess.sh @@ -53,7 +53,6 @@ echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" # Set up environmental variables and execute model echo "activating environment" -. 
/homes/ac.rgnanaolivu/miniconda3/etc/profile.d/conda.sh -conda activate rohan_python +/opt/conda/bin/conda activate PathDSP_env echo "running command ${CMD}" CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD From 629f023a33018eb722ae130b56e2e44a186d6d05 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 11:33:56 -0700 Subject: [PATCH 024/254] fix conda --- preprocess.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/preprocess.sh b/preprocess.sh index eb579cd..250b3cc 100644 --- a/preprocess.sh +++ b/preprocess.sh @@ -53,6 +53,7 @@ echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" # Set up environmental variables and execute model echo "activating environment" -/opt/conda/bin/conda activate PathDSP_env +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env echo "running command ${CMD}" CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD From 5bb62b3a0943d6f0ee3df35e501d1a56e2f2e2c6 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 11:49:34 -0700 Subject: [PATCH 025/254] update preprocess.sh --- preprocess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess.sh b/preprocess.sh index 250b3cc..425d724 100644 --- a/preprocess.sh +++ b/preprocess.sh @@ -11,7 +11,7 @@ ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=preprocessing_new.py +CANDLE_MODEL=/usr/local/PathDSP/preprocessing_new.py if [ $# -lt 2 ] ; then echo "Illegal number of parameters" From d24c498df823a1073e2c318277864bd29406cdb4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 12:04:15 -0700 Subject: [PATCH 026/254] update preprocess.sh --- preprocess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess.sh b/preprocess.sh index 425d724..a20dd1d 100644 --- a/preprocess.sh +++ b/preprocess.sh @@ -11,7 +11,7 @@ ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=/usr/local/PathDSP/preprocessing_new.py +CANDLE_MODEL=/usr/local/PathDSP/preprocess_new.py if [ $# -lt 2 ] ; then echo "Illegal number of parameters" From 326c68d4d6c402eccad033affd0222a853e5a605 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 12:13:17 -0700 Subject: [PATCH 027/254] update preprocess_new.py --- preprocess_new.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess_new.py b/preprocess_new.py index e1a4848..9cead4e 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -41,7 +41,7 @@ additional_definitions = None # This should be set outside as a user environment variable -os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' # initialize class class PathDSP_candle(candle.Benchmark): From 3c6e7bfde5afc6d278010801a34ee38317827551 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 12:42:56 -0700 Subject: [PATCH 028/254] update env --- environment.yml | 9 +++++---- preprocess_new.py | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/environment.yml b/environment.yml index 97d231d..ad9d1d7 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,6 @@ name: PathDSP_env channels: + - bioconda - pytorch - conda-forge - defaults @@ -27,9 +28,9 @@ dependencies: - brotli-bin=1.0.9 - brotlipy=0.7.0 - bzip2=1.0.8 - - ca-certificates=2023.5.7 + - ca-certificates=2023.05.30 - cairo=1.16.0 - - certifi=2023.5.7 + - certifi=2023.7.22 - cffi=1.15.1 - 
charset-normalizer=2.0.4 - cloudpickle=2.2.1 @@ -58,6 +59,7 @@ dependencies: - gmpy2=2.1.2 - gnutls=3.6.15 - greenlet=2.0.2 + - gseapy=1.0.5 - icu=72.1 - idna=3.4 - importlib-metadata=6.6.0 @@ -141,7 +143,7 @@ dependencies: - numexpr=2.8.4 - numpy=1.21.6 - openh264=2.1.1 - - openssl=1.1.1t + - openssl=1.1.1v - packaging=23.0 - pandas=1.5.3 - pandocfilters=1.5.0 @@ -227,4 +229,3 @@ dependencies: - zipp=3.15.0 - zlib=1.2.13 - zstd=1.5.5 -prefix: /homes/ac.liu.yuanhang/miniconda3/envs/improve_env diff --git a/preprocess_new.py b/preprocess_new.py index 9cead4e..0fbf444 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -107,8 +107,8 @@ def download_anl_data(params): mkdir(y_data_dir) mkdir(supplementary_folder) - for improve_file in ['CCLE_all.txt', 'CCLE_split_0_test.txt', - 'CCLE_split_0_train.txt', 'CCLE_split_0_val.txt']: + for improve_file in ['CCLE_all.txt', 'CCLE_split_' + str(params['split']) + '_test.txt', + 'CCLE_split_' + str(params['split']) + '_train.txt', 'CCLE_split_' + str(params['split']) + '_val.txt']: url_dir = params['improve_data_url'] + '/splits/' candle.file_utils.get_file(improve_file, url_dir + improve_file, datadir=splits_dir, @@ -131,6 +131,8 @@ def download_anl_data(params): candle.file_utils.get_file(db_file, params['data_url'] + '/' +db_file, datadir=params['data_dir'], cache_subdir=None) + + # set timer From 21cfc163496f354d1673b65d4251e3107dd0df8a Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 13:15:31 -0700 Subject: [PATCH 029/254] update preproce_new.py --- preprocess_new.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/preprocess_new.py b/preprocess_new.py index 0fbf444..9889fbf 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -94,7 +94,7 @@ def preprocess(params, data_dir): return(params) def download_anl_data(params): - csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'] + params['model_name'], 'csa_data', 'raw_data') + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], params['model_name'], 'csa_data', 'raw_data') splits_dir = os.path.join(csa_data_folder, 'splits') x_data_dir = os.path.join(csa_data_folder, 'x_data') y_data_dir = os.path.join(csa_data_folder, 'y_data') @@ -105,7 +105,6 @@ def download_anl_data(params): mkdir(splits_dir) mkdir(x_data_dir) mkdir(y_data_dir) - mkdir(supplementary_folder) for improve_file in ['CCLE_all.txt', 'CCLE_split_' + str(params['split']) + '_test.txt', 'CCLE_split_' + str(params['split']) + '_train.txt', 'CCLE_split_' + str(params['split']) + '_val.txt']: @@ -345,7 +344,7 @@ def run_ssgsea(params): def candle_main(anl): params = initialize_parameters() - data_dir = os.environ['CANDLE_DATA_DIR'] + params['model_name'] + '/Data/' + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + params['model_name'] + '/Data/' params = preprocess(params, data_dir) if params['improve_analysis'] == 'yes' or anl: download_anl_data(params) From fcb9d3a9dc482f2ade0931495882a6e234c0a29c Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 13:35:16 -0700 Subject: [PATCH 030/254] update preproce_new.py --- preprocess_new.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess_new.py b/preprocess_new.py index 9889fbf..0741683 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -89,7 +89,7 @@ def preprocess(params, data_dir): params['data_dir'] = data_dir #args = candle.ArgumentStruct(**params) for i in ['train_data', 'test_data', 'val_data', 'drug_bits_file', 'dgnet_file', - 'mutnet_file', 'cnvnet_file', 'exp_file', 
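Both preprocess_new.py fixes above come down to how path components are combined. A minimal sketch of the difference between plain string concatenation and os.path.join (the directory and model name below are placeholders):

import os

candle_data_dir = '/candle_data_dir'   # placeholder for os.environ['CANDLE_DATA_DIR']
model_name = 'PathDSP'

# concatenation glues the pieces together with no separator ...
print(candle_data_dir + model_name)                                  # /candle_data_dirPathDSP
# ... while os.path.join inserts exactly one separator per component
print(os.path.join(candle_data_dir, model_name, 'csa_data', 'raw_data'))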
'final_input_file', 'output_dir']: + 'mutnet_file', 'cnvnet_file', 'exp_file', 'output_dir']: params[i] = params['data_dir'] + '/' + params[i] return(params) From 5ab58dcf64006a885ffbc15d3dc37d80bf772ab6 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 14:17:19 -0700 Subject: [PATCH 031/254] update files --- PathDSP_params.txt | 2 +- preprocess_new.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 2c0347e..e0de182 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -3,7 +3,7 @@ model_name='PathDSP' data_url='https://zenodo.org/record/6093818/files/' improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/' -original_data_url='https://zenodo.org/record/7532963/' +original_data_url='https://zenodo.org/record/7532963/files/' original_data='input.zip' gene_set = 'MSigdb.zip' ppi_data = 'STRING.zip' diff --git a/preprocess_new.py b/preprocess_new.py index 0741683..844a4ab 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -145,7 +145,7 @@ def cal_time(end, start): def download_author_data(params): - data_download_filepath = candle.get_file(params['original_data'], params['original_data_url'], + data_download_filepath = candle.get_file(params['original_data'], params['original_data_url'] + '/' + params['original_data'], datadir = params['data_dir'], cache_subdir = None) print('download_path: {}'.format(data_download_filepath)) From a3a76d708b21c30efc38acdb82237e7d37d20161 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 14:47:12 -0700 Subject: [PATCH 032/254] update params --- PathDSP_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index e0de182..1fa3f73 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -37,4 +37,4 @@ final_hiddens=6 epochs=800 optimizer = 'adam' loss = 'mse' -improve_analysis='no' +improve_analysis='yes' From 212560f7ae6d317421cfdf2787c41c223a73c3d5 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 15:05:02 -0700 Subject: [PATCH 033/254] fix params --- PathDSP_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 1fa3f73..1ba89b2 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -2,7 +2,7 @@ model_name='PathDSP' data_url='https://zenodo.org/record/6093818/files/' -improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/' +improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/raw_data/' original_data_url='https://zenodo.org/record/7532963/files/' original_data='input.zip' gene_set = 'MSigdb.zip' From 2c6e7783c51def0340585925b30aa2a7bc94a6b7 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 19:42:46 -0700 Subject: [PATCH 034/254] update preproce_new.py --- preprocess_new.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/preprocess_new.py b/preprocess_new.py index 844a4ab..df8b8bd 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -153,9 +153,13 @@ def download_author_data(params): def smile2bits(params): start = datetime.now() + rs_all = improve_utils.load_single_drug_response_data(source=params['source'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) smile_df = improve_utils.load_smiles_data() 
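smile2bits, continued just below, turns each SMILES string into a fixed-length bit vector of length bit_int. A minimal sketch of one such conversion with RDKit (which the environment files pin); the radius and the example molecule are illustrative assumptions rather than values taken from the repository:

from rdkit import Chem
from rdkit.Chem import AllChem

smiles = 'CC(=O)Oc1ccccc1C(=O)O'   # aspirin, used only as an example input
n_bits = 128                       # mirrors bit_int=128 in PathDSP_params.txt

mol = Chem.MolFromSmiles(smiles)   # returns None if the SMILES cannot be parsed
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits)
bits = list(fp)                    # 0/1 features, one entry per bit position
print(sum(bits), 'bits set out of', n_bits)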
smile_df.columns = ['drug', 'smile'] smile_df = smile_df.drop_duplicates(subset=['drug'], keep='first').set_index('drug') + smile_df = smile_df.loc[smile_df.index.isin(rs_all['improve_chem_id']),] bit_int = params['bit_int'] record_list = [] # smile2bits drug by drug @@ -220,12 +224,16 @@ def run_netpea(params, dtype, multiply_expression): seed_int = params['seed_int'] cpu_int = params['cpu_int'] csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'] + params['model_name'], 'csa_data', 'raw_data') + rs_all = improve_utils.load_single_drug_response_data(source=params['source'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) if dtype == 'DGnet': drug_info = pd.read_csv(csa_data_folder + '/x_data/drug_info.tsv', sep='\t') drug_info['NAME'] = drug_info['NAME'].str.upper() target_info = pd.read_csv(params['data_dir'] + '/raw_data/DB.Drug.Target.txt', sep = '\t') target_info = target_info.rename(columns={'drug': 'NAME'}) combined_df = pd.merge(drug_info, target_info, how = 'left', on = 'NAME').dropna(subset=['gene']) + combined_df = combined_df.loc[combined_df['improve_chem_id'].isin(rs_all['improve_chem_id']),] restart_path = params['data_dir'] + '/drug_target.txt' combined_df.iloc[:,-2:].to_csv(restart_path, sep = '\t', header= True, index=False) outpath = params['dgnet_file'] @@ -233,6 +241,7 @@ def run_netpea(params, dtype, multiply_expression): mutation_data = improve_utils.load_mutation_count_data(gene_system_identifier='Gene_Symbol') mutation_data = mutation_data.reset_index() mutation_data = pd.melt(mutation_data, id_vars='improve_sample_id').loc[lambda x: x['value'] > 0] + mutation_data = mutation_data.loc[mutation_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] restart_path = params['data_dir'] + '/mutation_data.txt' mutation_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) outpath = params['mutnet_file'] @@ -240,6 +249,7 @@ def run_netpea(params, dtype, multiply_expression): cnv_data = improve_utils.load_discretized_copy_number_data(gene_system_identifier='Gene_Symbol') cnv_data = cnv_data.reset_index() cnv_data = pd.melt(cnv_data, id_vars='improve_sample_id').loc[lambda x: x['value'] != 0] + cnv_data = cnv_data.loc[cnv_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] restart_path = params['data_dir'] + '/cnv_data.txt' cnv_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) outpath = params['mutnet_file'] @@ -304,6 +314,10 @@ def prep_input(params): def run_ssgsea(params): expMat = improve_utils.load_gene_expression_data(sep='\t') + rs_all = improve_utils.load_single_drug_response_data(source=params['source'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + expMat = expMat.loc[expMat.index.isin(rs_all['improve_sample_id']),] gct = expMat.T # gene (rows) cell lines (columns) pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' gmt = pathway_path @@ -317,9 +331,9 @@ def run_ssgsea(params): gene_sets=gmt, #gmt format outdir=tmp_str, scale=True, - permutation_num=2, #1000 + permutation_num=0, #1000 no_plot=True, - processes=10, + processes=params['cpu_int'], #min_size=0, format='png') @@ -336,10 +350,8 @@ def run_ssgsea(params): for i, pathway in enumerate((lines[2].split())): if i > 0: total_dict[cell_lines[i]][pathway] = float(vals[i]) - df = pd.DataFrame(total_dict) - - df.to_csv(params['exp_file']) + df.T.to_csv(params['exp_file'], header=True, index=True, sep="\t") def 
candle_main(anl): From 32a50c767d3954037e01bbf75be677db032e66c4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 20:08:56 -0700 Subject: [PATCH 035/254] update preprocess_new.py --- preprocess_new.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/preprocess_new.py b/preprocess_new.py index df8b8bd..3a6e079 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -153,7 +153,7 @@ def download_author_data(params): def smile2bits(params): start = datetime.now() - rs_all = improve_utils.load_single_drug_response_data(source=params['source'], + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], split_type=["train", "test", "val"], y_col_name=params['metric']) smile_df = improve_utils.load_smiles_data() @@ -224,7 +224,7 @@ def run_netpea(params, dtype, multiply_expression): seed_int = params['seed_int'] cpu_int = params['cpu_int'] csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'] + params['model_name'], 'csa_data', 'raw_data') - rs_all = improve_utils.load_single_drug_response_data(source=params['source'], + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], split_type=["train", "test", "val"], y_col_name=params['metric']) if dtype == 'DGnet': @@ -314,7 +314,7 @@ def prep_input(params): def run_ssgsea(params): expMat = improve_utils.load_gene_expression_data(sep='\t') - rs_all = improve_utils.load_single_drug_response_data(source=params['source'], + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], split_type=["train", "test", "val"], y_col_name=params['metric']) expMat = expMat.loc[expMat.index.isin(rs_all['improve_sample_id']),] From 30e62fa748f9b1fa2e2099c878685f4b079fd1c1 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 20:39:14 -0700 Subject: [PATCH 036/254] update preprocess_new.py --- improve_utils.py | 2 +- preprocess_new.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/improve_utils.py b/improve_utils.py index f6d2b33..a804c7d 100644 --- a/improve_utils.py +++ b/improve_utils.py @@ -35,7 +35,7 @@ # TODO: # This is CANDLE_DATA_DIR (or something...). # How this is going to be passed to the code? 
-improve_globals.main_data_dir = fdir/"csa_data" +improve_globals.main_data_dir = os.environ['CANDLE_DATA_DIR'] + "/csa_data/" # improve_globals.main_data_dir = fdir/"improve_data_dir" # imp_globals.main_data_dir = fdir/"candle_data_dir" diff --git a/preprocess_new.py b/preprocess_new.py index 3a6e079..8138e52 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -94,7 +94,7 @@ def preprocess(params, data_dir): return(params) def download_anl_data(params): - csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], params['model_name'], 'csa_data', 'raw_data') + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') splits_dir = os.path.join(csa_data_folder, 'splits') x_data_dir = os.path.join(csa_data_folder, 'x_data') y_data_dir = os.path.join(csa_data_folder, 'y_data') @@ -223,7 +223,7 @@ def run_netpea(params, dtype, multiply_expression): permutation_int = params['permutation_int'] seed_int = params['seed_int'] cpu_int = params['cpu_int'] - csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'] + params['model_name'], 'csa_data', 'raw_data') + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], split_type=["train", "test", "val"], y_col_name=params['metric']) @@ -356,7 +356,7 @@ def run_ssgsea(params): def candle_main(anl): params = initialize_parameters() - data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + params['model_name'] + '/Data/' + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' params = preprocess(params, data_dir) if params['improve_analysis'] == 'yes' or anl: download_anl_data(params) From ff0c86f459889999982aeac88edbd83b5851a2ab Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 21:02:20 -0700 Subject: [PATCH 037/254] update file --- improve_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/improve_utils.py b/improve_utils.py index a804c7d..ec3b32b 100644 --- a/improve_utils.py +++ b/improve_utils.py @@ -35,7 +35,7 @@ # TODO: # This is CANDLE_DATA_DIR (or something...). # How this is going to be passed to the code? -improve_globals.main_data_dir = os.environ['CANDLE_DATA_DIR'] + "/csa_data/" +improve_globals.main_data_dir = PosixPath(os.environ['CANDLE_DATA_DIR']) + "/csa_data/" # improve_globals.main_data_dir = fdir/"improve_data_dir" # imp_globals.main_data_dir = fdir/"candle_data_dir" From 8567eb67a97971b87e524e984b2edb224f018d07 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 21:15:05 -0700 Subject: [PATCH 038/254] update file --- improve_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/improve_utils.py b/improve_utils.py index ec3b32b..49898bd 100644 --- a/improve_utils.py +++ b/improve_utils.py @@ -35,7 +35,7 @@ # TODO: # This is CANDLE_DATA_DIR (or something...). # How this is going to be passed to the code? 
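The revisions of main_data_dir that follow are all about how pathlib combines path segments; the relevant semantics are easy to reproduce in isolation (the fallback directory below is a placeholder):

import os
from pathlib import PosixPath

base = PosixPath(os.environ.get('CANDLE_DATA_DIR', '/candle_data_dir'))

# a PosixPath cannot be concatenated with a str: + raises TypeError
try:
    base + '/csa_data/'
except TypeError as err:
    print('concatenation fails:', err)

# joining with an absolute right-hand segment throws the base away ...
print(base / '/csa_data/')    # -> /csa_data
# ... whereas a relative segment extends it
print(base / 'csa_data')      # -> <base>/csa_data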
-improve_globals.main_data_dir = PosixPath(os.environ['CANDLE_DATA_DIR']) + "/csa_data/" +improve_globals.main_data_dir = PosixPath(os.environ['CANDLE_DATA_DIR'])/"/csa_data/" # improve_globals.main_data_dir = fdir/"improve_data_dir" # imp_globals.main_data_dir = fdir/"candle_data_dir" From d50dc0410be6e1fdee288f85b4ebf8e951ef6947 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 11 Aug 2023 21:28:43 -0700 Subject: [PATCH 039/254] update file --- improve_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/improve_utils.py b/improve_utils.py index 49898bd..edc5095 100644 --- a/improve_utils.py +++ b/improve_utils.py @@ -35,7 +35,7 @@ # TODO: # This is CANDLE_DATA_DIR (or something...). # How this is going to be passed to the code? -improve_globals.main_data_dir = PosixPath(os.environ['CANDLE_DATA_DIR'])/"/csa_data/" +improve_globals.main_data_dir = PosixPath("/candle_data_dir/")/"/csa_data/" # improve_globals.main_data_dir = fdir/"improve_data_dir" # imp_globals.main_data_dir = fdir/"candle_data_dir" From fa81f1fcebc214c850aee9af127e5026ec5cffe0 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 07:44:10 -0700 Subject: [PATCH 040/254] update script --- improve_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/improve_utils.py b/improve_utils.py index edc5095..9a7676b 100644 --- a/improve_utils.py +++ b/improve_utils.py @@ -35,7 +35,7 @@ # TODO: # This is CANDLE_DATA_DIR (or something...). # How this is going to be passed to the code? -improve_globals.main_data_dir = PosixPath("/candle_data_dir/")/"/csa_data/" +improve_globals.main_data_dir = PosixPath("/candle_data_dir/csa_data/") # improve_globals.main_data_dir = fdir/"improve_data_dir" # imp_globals.main_data_dir = fdir/"candle_data_dir" From fc39696c71ad573e13648faae1aee219084916bd Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 07:45:41 -0700 Subject: [PATCH 041/254] add def --- PathDSP.def | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/PathDSP.def b/PathDSP.def index 48d2687..28cb90e 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -17,8 +17,9 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime %post apt-get update -y apt-get install wget -y - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80 - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC + apt-get install -y gnupg + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC apt-get install build-essential -y apt-get install git -y @@ -41,13 +42,14 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local - git clone -b develop git@github.com:Liuy12/PathDSP.git + git clone -b develop https://github.com/Liuy12/PathDSP.git cd PathDSP - conda env create -f environment.yml - conda activate PathDSP_env - pip install --upgrade pip - python3 -m pip install git+https://github.com/ECP-CANDLE/candle_lib@develop + # download conda + + /opt/conda/bin/conda env create -f environment.yml --prefix /usr/local/conda_envs/PathDSP_env/ + #/opt/conda/bin/conda activate PathDSP_env + /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop - - chmod a+x *.sh \ No newline at end of file + #cp *.sh /usr/local/bin + chmod a+x /usr/local/PathDSP/*.sh From d6caadad87423c12f7fa09cb1b24bbb249da7d24 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: 
Wed, 16 Aug 2023 07:58:05 -0700 Subject: [PATCH 042/254] add script --- preprocess_new.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/preprocess_new.py b/preprocess_new.py index 8138e52..abea4f3 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -380,4 +380,6 @@ def candle_main(anl): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('-a', dest='anl', default=False) args = parser.parse_args() + start = datetime.now() candle_main(args.anl) + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) From 7a5aabefd4adf8058548533b8e90a3b2553a3069 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 08:24:17 -0700 Subject: [PATCH 043/254] update file --- preprocess_new.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess_new.py b/preprocess_new.py index abea4f3..20f89e4 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -230,7 +230,7 @@ def run_netpea(params, dtype, multiply_expression): if dtype == 'DGnet': drug_info = pd.read_csv(csa_data_folder + '/x_data/drug_info.tsv', sep='\t') drug_info['NAME'] = drug_info['NAME'].str.upper() - target_info = pd.read_csv(params['data_dir'] + '/raw_data/DB.Drug.Target.txt', sep = '\t') + target_info = pd.read_csv(params['data_dir'] + '/data/DB.Drug.Target.txt', sep = '\t') target_info = target_info.rename(columns={'drug': 'NAME'}) combined_df = pd.merge(drug_info, target_info, how = 'left', on = 'NAME').dropna(subset=['gene']) combined_df = combined_df.loc[combined_df['improve_chem_id'].isin(rs_all['improve_chem_id']),] From a5e4360eb2ed8aa0ccfd1d13729b1cb0e7fa9ded Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 13:29:22 -0700 Subject: [PATCH 044/254] update FNN_new --- PathDSP/FNN_new.py | 217 +++++++++++++++++++++++++-------------------- PathDSP_params.txt | 4 +- 2 files changed, 122 insertions(+), 99 deletions(-) diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index 210b641..63c6af1 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -45,6 +45,15 @@ def forward(self,x,y): loss = tch.sqrt(criterion(x, y) + eps) return loss + +def r2_score(y_true, y_pred): + y_mean = tch.mean(y_true) + ss_tot = tch.sum((y_true - y_mean)**2) + ss_res = tch.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + + def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): """ Return train and valid performance including loss @@ -64,6 +73,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): net = net.to(device) # load the network onto the device trainloss_list = [] # metrics: MSE, size equals to EPOCH validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH early_stopping = myutil.EarlyStopping(patience=30, verbose=True) # initialize the early_stopping # repeat the training for EPOCH times for epoch in range(epochs): @@ -90,15 +100,21 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): with tch.no_grad(): net.eval() valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 for i, (X_valid, y_valid) in enumerate(valid_dl): X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device y_valid_pred = net(X_valid) # valid result valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - 
y_valid.mean())**2) + # calculate total loss of all batches, and append to result list avg_valid_loss = valid_epoch_loss / len(valid_dl) validloss_list.append( avg_valid_loss) - + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2) # display print message #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( # epoch+1, epochs, train_epoch_loss / len(train_dl), @@ -115,7 +131,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): # load the last checkpoint with the best model net.load_state_dict(tch.load('checkpoint.pt')) - return net, trainloss_list, validloss_list + return net, trainloss_list, validloss_list, validr2_list def predict(net, test_dl, device): """ @@ -150,121 +166,128 @@ def main(params): # load data train_df = pd.read_csv(params['train_data'], header=0, index_col=[0,1], sep="\t") val_df = pd.read_csv(params['val_data'], header=0, index_col=[0,1], sep="\t") + test_df = pd.read_csv(params['test_data'], header=0, index_col=[0,1], sep="\t") # shuffle - sdf = skut.shuffle(df, random_state=params["seed_int"]) + #sdf = skut.shuffle(df, random_state=params["seed_int"]) # set parameters myutil.set_seed(params["seed_int"]) device = myutil.get_device(uth=params["gpu_int"]) - kFold = params["cv_int"] + #kFold = params["cv_int"] learning_rate = params['learning_rate'] epoch = params['epochs'] batch_size = params['batch_size'] opt_fn = tch.optim.Adam # create result list - loss_df_list = [] - score_df_list = [] - ytest_df_list = [] - shap_df_list = [] - # train with cross-validation - kf = skms.KFold(n_splits=kFold, random_state=params['seed_int'], shuffle=True) - X_df = sdf.iloc[:, 0:-1] - y_df = sdf.iloc[:, -1] + # loss_df_list = [] + # score_df_list = [] + # ytest_df_list = [] + # shap_df_list = [] + # # train with cross-validation + #kf = skms.KFold(n_splits=kFold, random_state=params['seed_int'], shuffle=True) + #X_df = train_df.iloc[:, 0:-1] + #y_df = train_df.iloc[:, -1] # save best model with lowest RMSE - best_rmse = 10000 - best_model = None - best_fold = 0 - for i, (train_index, test_index) in enumerate(kf.split(X_df, y_df)): - n_fold = i+1 - print('Fold={:}/{:}'.format(n_fold, params['cv_int'])) - # get train/test splits - Xtrain_arr = X_df.values[train_index] - Xtest_arr = X_df.values[test_index] - ytrain_arr = y_df.values[train_index] - ytest_arr = y_df.values[test_index] - # get train/valid splits from train - Xtrain_arr, Xvalid_arr, ytrain_arr, yvalid_arr = skms.train_test_split(Xtrain_arr, ytrain_arr, - test_size=0.1, random_state=params['seed_int']) - print(' train={:}, valid={:}, test={:}'.format(Xtrain_arr.shape, Xvalid_arr.shape, Xtest_arr.shape)) - # prepare dataframe for output - ytest_df = y_df.iloc[test_index].to_frame() - # convert to numpy array - Xtrain_arr = np.array(Xtrain_arr).astype('float32') - Xvalid_arr = np.array(Xvalid_arr).astype('float32') - Xtest_arr = np.array(Xtest_arr).astype('float32') - ytrain_arr = np.array(ytrain_arr).astype('float32') - yvalid_arr = np.array(yvalid_arr).astype('float32') - ytest_arr = np.array(ytest_arr).astype('float32') - # create mini-batch - train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) - valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) - test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) - train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - valid_dl = tchud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) - 
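For reference alongside the validation R² bookkeeping added to fit(): the conventional coefficient of determination takes the residual sum of squares against the predictions and the total sum of squares around the mean of the observed values. A standalone NumPy version of that definition (the numbers are made up):

import numpy as np

def r_squared(y_true, y_pred):
    """1 - SS_res / SS_tot, with SS_tot taken around the mean of y_true."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1.0 - ss_res / ss_tot

print(r_squared([0.2, 0.5, 0.9], [0.25, 0.40, 0.95]))  # close to 1 for a good fit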
test_dl = tchud.DataLoader(test_dataset, batch_size=batch_size, shuffle=False) - # initial weight - def init_weights(m): - if type(m) == tch.nn.Linear: - tch.nn.init.kaiming_uniform_(m.weight) - m.bias.data.fill_(0.01) - # load model - n_features = Xtrain_arr.shape[1] - net = mynet.FNN(n_features) - net.apply(init_weights) - # fit data with model - trained_net, train_loss_list, valid_loss_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) - prediction_list = predict(trained_net, test_dl, device) - # evaluation metrics - mse = skmts.mean_squared_error(ytest_arr, prediction_list) - rmse = np.sqrt(mse) - if rmse <= best_rmse: - best_rmse = rmse - best_fold = n_fold - best_model = trained_net - print('best model so far at fold={:}, rmse={:}'.format(best_fold, best_rmse)) +# best_rmse = 10000 +# best_model = None +# best_fold = 0 +# # for i, (train_index, test_index) in enumerate(kf.split(X_df, y_df)): + #n_fold = i+1 + #print('Fold={:}/{:}'.format(n_fold, params['cv_int'])) + # get train/test splits + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + Xtest_arr = test_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + ytest_arr = test_df.iloc[:, -1].values + + # get train/valid splits from train + #Xtrain_arr, Xvalid_arr, ytrain_arr, yvalid_arr = skms.train_test_split(Xtrain_arr, ytrain_arr, + # test_size=0.1, random_state=params['seed_int']) + #print(' train={:}, valid={:}, test={:}'.format(Xtrain_arr.shape, Xvalid_arr.shape, Xtest_arr.shape)) + # prepare dataframe for output + ytest_df = test_df.iloc[:, -1].to_frame() + # convert to numpy array + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) + test_dl = tchud.DataLoader(test_dataset, batch_size=batch_size, shuffle=False) + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + net.apply(init_weights) + # fit data with model + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) + prediction_list = predict(trained_net, test_dl, device) + # evaluation metrics + mse = skmts.mean_squared_error(ytest_arr, prediction_list) + rmse = np.sqrt(mse) + r2_pred = r2_score(ytest_arr, prediction_list) + loss_pred = pd.DataFrame({'rmse': rmse, + 'r2': r2_pred}) + loss_pred.to_csv(params['output'] + '/Loss_pred.txt', header=True, index=False, sep="\t") + # if rmse <= best_rmse: + # best_rmse = rmse + # best_fold = n_fold + # best_model = trained_net + # print('best model so far at fold={:}, rmse={:}'.format(best_fold, best_rmse)) + + + # if params['shap_bool'] == True: + # 
print('calculate shapely values') + # # random select 100 samples as baseline + # train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + # train_dl = tchud.DataLoader(train_dataset, batch_size=200, shuffle=True) + # background, lbl = next(iter(train_dl)) + # explainer = sp.DeepExplainer(trained_net, background[:100].to(device)) + # shap_arr = explainer.shap_values(tch.from_numpy(Xtest_arr)) + # shap_df = pd.DataFrame(shap_arr, index=ytest_df.index, columns=X_df.columns) + # # append to result + # shap_df_list.append(shap_df) - - if params['shap_bool'] == True: - print('calculate shapely values') - # random select 100 samples as baseline - train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) - train_dl = tchud.DataLoader(train_dataset, batch_size=200, shuffle=True) - background, lbl = next(iter(train_dl)) - explainer = sp.DeepExplainer(trained_net, background[:100].to(device)) - shap_arr = explainer.shap_values(tch.from_numpy(Xtest_arr)) - shap_df = pd.DataFrame(shap_arr, index=ytest_df.index, columns=X_df.columns) - # append to result - shap_df_list.append(shap_df) - - # collect result - loss_df = pd.DataFrame({'fold':[n_fold]*len(train_loss_list), - 'epoch':[i+1 for i in range(len(train_loss_list))], - 'train loss':train_loss_list, - 'valid loss': valid_loss_list}) - ytest_df['prediction'] = prediction_list - ytest_df['fold'] = n_fold - loss_df_list.append(loss_df) - ytest_df_list.append(ytest_df) - # end of fold - trained_net = None - break + # collect result + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + ytest_df['prediction'] = prediction_list + #loss_df_list.append(loss_df) + #ytest_df_list.append(ytest_df) + # end of fold + #trained_net = None # save to output - all_ytest_df = pd.concat(ytest_df_list, axis=0) - all_loss_df = pd.concat(loss_df_list, axis=0) - all_ytest_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.Prediction.txt', header=True, index=True, sep="\t") - all_loss_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.Loss.txt', header=True, index=False, sep="\t") - if params['shap_bool'] == True: - all_shap_df = pd.concat(shap_df_list, axis=0) - all_shap_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.SHAP.txt', header=True, index=True, sep="\t") + #all_ytest_df = pd.concat(ytest_df_list, axis=0) + #all_loss_df = pd.concat(loss_df_list, axis=0) + ytest_df.to_csv(params['output'] + '/Prediction.txt', header=True, index=True, sep="\t") + loss_df.to_csv(params['output'] + '/Loss.txt', header=True, index=False, sep="\t") + # if params['shap_bool'] == True: + # all_shap_df = pd.concat(shap_df_list, axis=0) + # all_shap_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.SHAP.txt', header=True, index=True, sep="\t") # make train/valid loss plots - tch.save(best_model.state_dict(), params['output'] + '.FNN.cv_' + str(params['cv_int']) + 'best_model.pt') + best_model = trained_net + tch.save(best_model.state_dict(), params['output'] + '/model.pt') print( '[Finished in {:}]'.format(myutil.cal_time(datetime.now(), start_time)) ) # display evaluation metrics of all folds - mse, rmse, r_square, pccy = mymts.eval_regressor_performance(all_ytest_df, 'response', 'prediction') + #mse, rmse, r_square, pccy = mymts.eval_regressor_performance(all_ytest_df, 'response', 'prediction') diff --git a/PathDSP_params.txt 
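The checkpoint written above holds only a state_dict, so whatever consumes model.pt has to rebuild the network before loading the weights. A self-contained sketch of that round trip, using a stand-in module where FNN_new.py would construct mynet.FNN(n_features):

import torch as tch

n_features = 8                                           # placeholder; must match training
net = tch.nn.Sequential(tch.nn.Linear(n_features, 1))    # stand-in architecture
tch.save(net.state_dict(), 'model.pt')                   # what the training script does at the end

# inference side: rebuild the same architecture, then load the saved weights
net2 = tch.nn.Sequential(tch.nn.Linear(n_features, 1))
net2.load_state_dict(tch.load('model.pt', map_location='cpu'))
net2.eval()                                              # turn off dropout / batch-norm updates
with tch.no_grad():
    print(net2(tch.randn(2, n_features)))                # dummy batch, just to show the call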
b/PathDSP_params.txt index 1ba89b2..970ec95 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -25,9 +25,9 @@ split=0 #Model parameter -seed_int=42 +#seed_int=42 cpu_int=20 -cv_int=1 +#cv_int=1 gpu_int=0 learning_rate = 0.001 batch_size = 12 From 09e739fe8f63252c61a5592b0273d294f8b81616 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 13:54:11 -0700 Subject: [PATCH 045/254] update FNN --- PathDSP/FNN_new.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index 63c6af1..a3ed150 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -47,9 +47,9 @@ def forward(self,x,y): def r2_score(y_true, y_pred): - y_mean = tch.mean(y_true) - ss_tot = tch.sum((y_true - y_mean)**2) - ss_res = tch.sum((y_true - y_pred)**2) + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) r2 = 1 - ss_res / ss_tot return r2 @@ -240,8 +240,8 @@ def init_weights(m): mse = skmts.mean_squared_error(ytest_arr, prediction_list) rmse = np.sqrt(mse) r2_pred = r2_score(ytest_arr, prediction_list) - loss_pred = pd.DataFrame({'rmse': rmse, - 'r2': r2_pred}) + loss_pred = pd.DataFrame({'metric': ['rmse', 'r2'], + 'value': [rmse, r2_pred]}) loss_pred.to_csv(params['output'] + '/Loss_pred.txt', header=True, index=False, sep="\t") # if rmse <= best_rmse: # best_rmse = rmse From 3d014a5561b32788b4963955894465b38278a4e4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 14:43:24 -0700 Subject: [PATCH 046/254] update params --- PathDSP_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 970ec95..b4cdc8d 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -18,7 +18,7 @@ cnvnet_file='CNVnet.txt' exp_file='EXP.txt' output='Result/' bit_int=128 -permutation_int=100 +permutation_int=3 metric='auc1' data_type='CCLE' split=0 From 8db2812b38cb3e3e9cce1610842899d7114fb2fe Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 15:08:29 -0700 Subject: [PATCH 047/254] fix param --- PathDSP_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index b4cdc8d..f731fba 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -25,7 +25,7 @@ split=0 #Model parameter -#seed_int=42 +seed_int=42 cpu_int=20 #cv_int=1 gpu_int=0 From fdf2d8c43aad382fe224eae72e413da5e7586316 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 16 Aug 2023 16:13:45 -0700 Subject: [PATCH 048/254] fix bug --- preprocess_new.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess_new.py b/preprocess_new.py index 20f89e4..9afa6b7 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -252,7 +252,7 @@ def run_netpea(params, dtype, multiply_expression): cnv_data = cnv_data.loc[cnv_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] restart_path = params['data_dir'] + '/cnv_data.txt' cnv_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) - outpath = params['mutnet_file'] + outpath = params['cnvnet_file'] # perform Random Walk print(datetime.now(), 'performing random walk with restart') rwr_df = rwr.RWR(ppi_path, restart_path, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True).get_prob() From 9e179ae4d5d90c9dc68e288e56c538273b64f5a4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 07:48:16 -0700 Subject: [PATCH 049/254] add time --- NetPEA.py | 2 +- PathDSP/FNN_new.py | 17 
++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/NetPEA.py b/NetPEA.py index d30599f..5ec48fc 100644 --- a/NetPEA.py +++ b/NetPEA.py @@ -75,7 +75,7 @@ def netpea_parallel(self, rwrDf, pathwayDictList, n_cpu, out_path): all_cell_zscore_df = pd.concat(df_list, axis=0) zscore_fname = self.out_path all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") - print(all_cell_zscore_df) + #print(all_cell_zscore_df) def netpea(self, rwrDf, pathwayDictList): diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index a3ed150..5413bad 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -53,6 +53,15 @@ def r2_score(y_true, y_pred): r2 = 1 - ss_res / ss_tot return r2 +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): """ @@ -76,8 +85,10 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): validr2_list = [] # metrics: r2, size equals to EPOCH early_stopping = myutil.EarlyStopping(patience=30, verbose=True) # initialize the early_stopping # repeat the training for EPOCH times + start_total = datetime.now() for epoch in range(epochs): ## training phase + start = datetime.now() net.train() # initial loss train_epoch_loss = 0.0 # save loss for each epoch, batch by batch @@ -96,6 +107,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): # calculate total loss of all batches avg_train_loss = train_epoch_loss / len(train_dl) trainloss_list.append( avg_train_loss ) + print('epoch ' + str(i) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) ## validation phase with tch.no_grad(): net.eval() @@ -127,7 +139,8 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): if early_stopping.early_stop: print("Early stopping") break - + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) # load the last checkpoint with the best model net.load_state_dict(tch.load('checkpoint.pt')) @@ -235,7 +248,9 @@ def init_weights(m): net.apply(init_weights) # fit data with model trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) + start = datetime.now() prediction_list = predict(trained_net, test_dl, device) + print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) # evaluation metrics mse = skmts.mean_squared_error(ytest_arr, prediction_list) rmse = np.sqrt(mse) From 4f4d94fe62ab31599b53b5364c37b43ad0d12f81 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 09:14:12 -0700 Subject: [PATCH 050/254] update def --- PathDSP.def | 2 +- PathDSP/FNN_new.py | 2 +- environment_081723.yml | 158 +++++++++++++++++++++++++++++++++++++++++ preprocess_new.py | 20 +++--- 4 files changed, 170 insertions(+), 12 deletions(-) create mode 100644 environment_081723.yml diff --git a/PathDSP.def b/PathDSP.def index 28cb90e..3f1f0b0 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -47,7 +47,7 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime # download conda - /opt/conda/bin/conda env create -f environment.yml --prefix /usr/local/conda_envs/PathDSP_env/ + /opt/conda/bin/conda env create -f environment_081723.yml --prefix /usr/local/conda_envs/PathDSP_env/ #/opt/conda/bin/conda 
activate PathDSP_env /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index 5413bad..ab150e2 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -33,7 +33,7 @@ #import myPlotter as myplot import myMetrics as mymts -import shap as sp +#import shap as sp class RMSELoss(tch.nn.Module): def __init__(self): diff --git a/environment_081723.yml b/environment_081723.yml new file mode 100644 index 0000000..c66449f --- /dev/null +++ b/environment_081723.yml @@ -0,0 +1,158 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.7.22 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - giflib=5.2.1 + - glib=2.69.1 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - intel-openmp=2023.1.0 + - jbig=2.1 + - jinja2=3.1.2 + - jpeg=9e + - kiwisolver=1.4.4 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libsqlite=3.42.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - pandas=1.5.3 + - pcre=8.45 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - pthread-stubs=0.4 + - pycairo=1.24.0 + - pycparser=2.21 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - rdkit=2022.03.2 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - setuptools=68.0.0 + - six=1.16.0 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - candle==0.0.1 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - packaging==23.1 + - patsy==0.5.3 + - protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - 
statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 +prefix: /homes/ac.liu.yuanhang/miniconda3/envs/improve_env_081723 diff --git a/preprocess_new.py b/preprocess_new.py index 9afa6b7..2549ee6 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -3,21 +3,21 @@ import sys import os import numpy as np -import torch -import torch.utils.data as du -from torch.autograd import Variable -import torch.nn as nn -import torch.nn.functional as F +#import torch +#import torch.utils.data as du +#from torch.autograd import Variable +#import torch.nn as nn +#import torch.nn.functional as F #from code.drugcell_NN import * import argparse import numpy as np import pandas as pd import candle -import time -import logging -import networkx as nx -import networkx.algorithms.components.connected as nxacc -import networkx.algorithms.dag as nxadag +#import time +#import logging +#import networkx as nx +#import networkx.algorithms.components.connected as nxacc +#import networkx.algorithms.dag as nxadag #from pathlib import Path from functools import reduce import improve_utils From 9767ff68bde6c6fbcffe7ce044eb2aa780c267fa Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 09:33:04 -0700 Subject: [PATCH 051/254] update yml --- environment_081723.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/environment_081723.yml b/environment_081723.yml index c66449f..870884e 100644 --- a/environment_081723.yml +++ b/environment_081723.yml @@ -140,7 +140,6 @@ dependencies: - zstd=1.5.2 - pip: - astropy==5.3.2 - - candle==0.0.1 - contourpy==1.1.0 - fonttools==4.42.0 - joblib==1.3.2 @@ -155,4 +154,3 @@ dependencies: - statsmodels==0.14.0 - threadpoolctl==3.2.0 - tqdm==4.66.1 -prefix: /homes/ac.liu.yuanhang/miniconda3/envs/improve_env_081723 From 0588da41926b5f77cba18d485030a6ea94eed819 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 09:51:40 -0700 Subject: [PATCH 052/254] update train.sh --- train.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/train.sh b/train.sh index b71f3e8..165eac6 100755 --- a/train.sh +++ b/train.sh @@ -79,5 +79,8 @@ echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" # Set up environmental variables and execute model +echo "activating environment" +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env echo "running command ${CMD}" CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD From e6bd11573c754952d3f87bbcc581795d04618ed4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 11:14:45 -0700 Subject: [PATCH 053/254] update train.sh --- train.py | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/train.py b/train.py index bfb1dd0..9f352b1 100644 --- a/train.py +++ b/train.py @@ -1,13 +1,13 @@ import candle import os -import json -from json import JSONEncoder +#import json +#from json import JSONEncoder from preprocess_new import mkdir, preprocess from PathDSP.FNN_new import main file_path = os.path.dirname(os.path.realpath(__file__)) # This should be set outside as a user environment variable -os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' required = None additional_definitions = None @@ -35,34 +35,36 @@ def initialize_parameters(): gParameters = candle.finalize_parameters(preprocessor_bmk) return gParameters -class CustomData: - def __init__(self, name, value): - self.name 
= name - self.value = value +# class CustomData: +# def __init__(self, name, value): +# self.name = name +# self.value = value -class CustomEncoder(json.JSONEncoder): - def default(self, o): - return o.__dict__ +# class CustomEncoder(json.JSONEncoder): +# def default(self, o): +# return o.__dict__ -def run(params): - params['data_type'] = str(params['data_type']) - json_out = params['output_dir']+'/params.json' - print(params) +# def run(params): +# params['data_type'] = str(params['data_type']) +# json_out = params['output_dir']+'/params.json' +# print(params) - with open (json_out, 'w') as fp: - json.dump(params, fp, indent=4, cls=CustomEncoder) +# with open (json_out, 'w') as fp: +# json.dump(params, fp, indent=4, cls=CustomEncoder) - scores = main(params) - with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: - json.dump(scores, f, ensure_ascii=False, indent=4) -# print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) +# scores = main(params) +# with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: +# json.dump(scores, f, ensure_ascii=False, indent=4) +# # print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) def candle_main(): params = initialize_parameters() - params = preprocess(params) - run(params) + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + main(params) + if __name__ == "__main__": candle_main() From c590e8811490d7f34e9bab9c3fa361e77b62648d Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 11:31:34 -0700 Subject: [PATCH 054/254] update train.py --- train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train.py b/train.py index 9f352b1..512c885 100644 --- a/train.py +++ b/train.py @@ -1,7 +1,9 @@ import candle import os +import sys #import json #from json import JSONEncoder +sys.path.append("./PathDSP/") from preprocess_new import mkdir, preprocess from PathDSP.FNN_new import main From 9c6afe3e6c277fa3e8bf8efe7047e9dd6e8d9343 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 11:52:56 -0700 Subject: [PATCH 055/254] update train --- train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index 512c885..0517863 100644 --- a/train.py +++ b/train.py @@ -5,7 +5,7 @@ #from json import JSONEncoder sys.path.append("./PathDSP/") from preprocess_new import mkdir, preprocess -from PathDSP.FNN_new import main +import FNN_new file_path = os.path.dirname(os.path.realpath(__file__)) # This should be set outside as a user environment variable @@ -65,7 +65,7 @@ def candle_main(): params = initialize_parameters() data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' params = preprocess(params, data_dir) - main(params) + FNN_new.main(params) if __name__ == "__main__": From 679a2f0a5c3ce92f71211f78e1d8335b37e0f78a Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 17 Aug 2023 15:42:13 -0700 Subject: [PATCH 056/254] fix bug --- PathDSP/FNN_new.py | 13 +++++++------ PathDSP_params.txt | 2 +- preprocess_new.py | 2 +- train.py | 3 ++- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index ab150e2..588fb08 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -107,7 +107,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): # calculate total loss of all batches avg_train_loss = train_epoch_loss / len(train_dl) trainloss_list.append( avg_train_loss ) - print('epoch ' + str(i) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + 
print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) ## validation phase with tch.no_grad(): net.eval() @@ -126,7 +126,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): avg_valid_loss = valid_epoch_loss / len(valid_dl) validloss_list.append( avg_valid_loss) valid_r2 = 1 - ss_res / ss_tot - validr2_list.append(valid_r2) + validr2_list.append(valid_r2.cpu().numpy()) # display print message #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( # epoch+1, epochs, train_epoch_loss / len(train_dl), @@ -257,7 +257,7 @@ def init_weights(m): r2_pred = r2_score(ytest_arr, prediction_list) loss_pred = pd.DataFrame({'metric': ['rmse', 'r2'], 'value': [rmse, r2_pred]}) - loss_pred.to_csv(params['output'] + '/Loss_pred.txt', header=True, index=False, sep="\t") + loss_pred.to_csv(params['data_dir'] + '/Loss_pred.txt', header=True, index=False, sep="\t") # if rmse <= best_rmse: # best_rmse = rmse # best_fold = n_fold @@ -282,6 +282,7 @@ def init_weights(m): 'train loss':train_loss_list, 'valid loss': valid_loss_list, 'valid r2': valid_r2_list}) + ytest_df['prediction'] = prediction_list #loss_df_list.append(loss_df) #ytest_df_list.append(ytest_df) @@ -291,15 +292,15 @@ def init_weights(m): # save to output #all_ytest_df = pd.concat(ytest_df_list, axis=0) #all_loss_df = pd.concat(loss_df_list, axis=0) - ytest_df.to_csv(params['output'] + '/Prediction.txt', header=True, index=True, sep="\t") - loss_df.to_csv(params['output'] + '/Loss.txt', header=True, index=False, sep="\t") + ytest_df.to_csv(params['data_dir'] + '/Prediction.txt', header=True, index=True, sep="\t") + loss_df.to_csv(params['data_dir'] + '/Loss.txt', header=True, index=False, sep="\t") # if params['shap_bool'] == True: # all_shap_df = pd.concat(shap_df_list, axis=0) # all_shap_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.SHAP.txt', header=True, index=True, sep="\t") # make train/valid loss plots best_model = trained_net - tch.save(best_model.state_dict(), params['output'] + '/model.pt') + tch.save(best_model.state_dict(), params['data_dir'] + '/model.pt') print( '[Finished in {:}]'.format(myutil.cal_time(datetime.now(), start_time)) ) # display evaluation metrics of all folds #mse, rmse, r_square, pccy = mymts.eval_regressor_performance(all_ytest_df, 'response', 'prediction') diff --git a/PathDSP_params.txt b/PathDSP_params.txt index f731fba..6839b6a 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -16,7 +16,7 @@ dgnet_file='DGnet.txt' mutnet_file='MUTnet.txt' cnvnet_file='CNVnet.txt' exp_file='EXP.txt' -output='Result/' +#output='Result/' bit_int=128 permutation_int=3 metric='auc1' diff --git a/preprocess_new.py b/preprocess_new.py index 2549ee6..6537ddf 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -89,7 +89,7 @@ def preprocess(params, data_dir): params['data_dir'] = data_dir #args = candle.ArgumentStruct(**params) for i in ['train_data', 'test_data', 'val_data', 'drug_bits_file', 'dgnet_file', - 'mutnet_file', 'cnvnet_file', 'exp_file', 'output_dir']: + 'mutnet_file', 'cnvnet_file', 'exp_file']: params[i] = params['data_dir'] + '/' + params[i] return(params) diff --git a/train.py b/train.py index 0517863..28b97b1 100644 --- a/train.py +++ b/train.py @@ -3,8 +3,9 @@ import sys #import json #from json import JSONEncoder -sys.path.append("./PathDSP/") from preprocess_new import mkdir, preprocess +#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") import FNN_new file_path = 
os.path.dirname(os.path.realpath(__file__)) From b13eab86a1641a6617ff8e0a6a228420d6d9b3ad Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Tue, 22 Aug 2023 11:39:27 -0700 Subject: [PATCH 057/254] update file --- PathDSP_params.txt | 4 ++-- preprocess_new.py | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 6839b6a..fbe7355 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -19,8 +19,8 @@ exp_file='EXP.txt' #output='Result/' bit_int=128 permutation_int=3 -metric='auc1' -data_type='CCLE' +metric='auc' +data_type='CTRPv2' split=0 diff --git a/preprocess_new.py b/preprocess_new.py index 6537ddf..a08f808 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -106,8 +106,18 @@ def download_anl_data(params): mkdir(x_data_dir) mkdir(y_data_dir) - for improve_file in ['CCLE_all.txt', 'CCLE_split_' + str(params['split']) + '_test.txt', - 'CCLE_split_' + str(params['split']) + '_train.txt', 'CCLE_split_' + str(params['split']) + '_val.txt']: + for improve_file in ['CCLE_all.txt', + 'CCLE_split_' + str(params['split']) + '_test.txt', + 'CCLE_split_' + str(params['split']) + '_train.txt', + 'CCLE_split_' + str(params['split']) + '_val.txt', + 'CTRPv2_all.txt', + 'CTRPv2_split' + str(params['split']) + '_test.txt', + 'CTRPv2_split' + str(params['split']) + '_train.txt', + 'CTRPv2_split' + str(params['split']) + '_val.txt', + 'gCSI_all.txt', + 'GDSCv1_all.txt', + 'GDSCv2_all.txt' + ]: url_dir = params['improve_data_url'] + '/splits/' candle.file_utils.get_file(improve_file, url_dir + improve_file, datadir=splits_dir, @@ -308,7 +318,7 @@ def prep_input(params): comb_data_mtx = pd.DataFrame({'drug_id': response_df['drug_id'].values, 'sample_id': response_df['sample_id'].values}) comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id') - comb_data_mtx['response'] = response_df[params['metric']] + comb_data_mtx['response'] = response_df[params['metric']].values comb_data_mtx.to_csv(params[i + '_data'], sep = '\t', header= True, index=False) From c7bd594726eb886549cd404dce8c7c3ef84a1cf8 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Tue, 22 Aug 2023 12:22:15 -0700 Subject: [PATCH 058/254] update file --- preprocess_new.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/preprocess_new.py b/preprocess_new.py index a08f808..41cc63c 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -111,9 +111,9 @@ def download_anl_data(params): 'CCLE_split_' + str(params['split']) + '_train.txt', 'CCLE_split_' + str(params['split']) + '_val.txt', 'CTRPv2_all.txt', - 'CTRPv2_split' + str(params['split']) + '_test.txt', - 'CTRPv2_split' + str(params['split']) + '_train.txt', - 'CTRPv2_split' + str(params['split']) + '_val.txt', + 'CTRPv2_split_' + str(params['split']) + '_test.txt', + 'CTRPv2_split_' + str(params['split']) + '_train.txt', + 'CTRPv2_split_' + str(params['split']) + '_val.txt', 'gCSI_all.txt', 'GDSCv1_all.txt', 'GDSCv2_all.txt' From 0c45b0914ba7677c061ad747a70810b9d9b1c3e3 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Tue, 22 Aug 2023 17:15:20 -0700 Subject: [PATCH 059/254] use polars --- NetPEA.py | 4 ++-- PathDSP/FNN_new.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/NetPEA.py b/NetPEA.py index 5ec48fc..d7006ef 100644 --- a/NetPEA.py +++ b/NetPEA.py @@ -166,8 +166,8 @@ def _cal_zscore(self, score, scoreList): if np.std(scoreList) != 0: zscore = (score - np.mean(scoreList) ) / 
np.std(scoreList) pvalue = scistat.norm.sf(abs(zscore)) # not pdf - print('score={:}, scoreList={:}, zscore={:}, pvalue={:}'.format( - score, scoreList[:10], zscore, pvalue)) + #print('score={:}, scoreList={:}, zscore={:}, pvalue={:}'.format( + # score, scoreList[:10], zscore, pvalue)) else: zscore, pvalue = np.nan, np.nan return [zscore, pvalue] diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index 588fb08..8349072 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -32,6 +32,7 @@ import myUtility as myutil #import myPlotter as myplot import myMetrics as mymts +import polars as pl #import shap as sp @@ -177,9 +178,13 @@ def predict(net, test_dl, device): def main(params): start_time = datetime.now() # load data - train_df = pd.read_csv(params['train_data'], header=0, index_col=[0,1], sep="\t") - val_df = pd.read_csv(params['val_data'], header=0, index_col=[0,1], sep="\t") - test_df = pd.read_csv(params['test_data'], header=0, index_col=[0,1], sep="\t") + print('loadinig data') + # train_df = pd.read_csv(params['train_data'], header=0, index_col=[0,1], sep="\t") + # val_df = pd.read_csv(params['val_data'], header=0, index_col=[0,1], sep="\t") + # test_df = pd.read_csv(params['test_data'], header=0, index_col=[0,1], sep="\t") + train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() + val_df = pl.read_csv(params['val_data'], separator = "\t").to_pandas() + test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() # shuffle #sdf = skut.shuffle(df, random_state=params["seed_int"]) @@ -247,6 +252,7 @@ def init_weights(m): net = mynet.FNN(n_features) net.apply(init_weights) # fit data with model + print('start training process') trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) start = datetime.now() prediction_list = predict(trained_net, test_dl, device) From 8419839a115efc257edf949bf721fcca9d3f1645 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Tue, 22 Aug 2023 17:19:37 -0700 Subject: [PATCH 060/254] update files --- PathDSP.def | 2 +- environment_082223.yml | 218 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 environment_082223.yml diff --git a/PathDSP.def b/PathDSP.def index 3f1f0b0..ae656c7 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -47,7 +47,7 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime # download conda - /opt/conda/bin/conda env create -f environment_081723.yml --prefix /usr/local/conda_envs/PathDSP_env/ + /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ #/opt/conda/bin/conda activate PathDSP_env /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop diff --git a/environment_082223.yml b/environment_082223.yml new file mode 100644 index 0000000..0f91f17 --- /dev/null +++ b/environment_082223.yml @@ -0,0 +1,218 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - abseil-cpp=20211102.0 + - arrow-cpp=11.0.0 + - asttokens=2.2.1 + - aws-c-common=0.6.8 + - aws-c-event-stream=0.1.6 + - aws-checksums=0.1.11 + - aws-sdk-cpp=1.8.185 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - c-ares=1.19.0 + - ca-certificates=2023.05.30 + - cairo=1.16.0 + - 
certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - comm=0.1.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - executing=1.2.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - gflags=2.2.2 + - giflib=5.2.1 + - glib=2.69.1 + - glog=0.5.0 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - grpc-cpp=1.48.2 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - importlib-metadata=6.8.0 + - importlib_metadata=6.8.0 + - intel-openmp=2023.1.0 + - ipykernel=6.25.1 + - ipython=8.14.0 + - jbig=2.1 + - jedi=0.19.0 + - jinja2=3.1.2 + - jpeg=9e + - jupyter_client=8.3.0 + - jupyter_core=4.12.0 + - kiwisolver=1.4.4 + - krb5=1.20.1 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcurl=8.1.1 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libedit=3.1.20221030 + - libev=4.33 + - libevent=2.1.12 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnghttp2=1.52.0 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libprotobuf=3.20.3 + - libsodium=1.0.18 + - libsqlite=3.42.0 + - libssh2=1.10.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libthrift=0.15.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - matplotlib-inline=0.1.6 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - orc=1.7.4 + - packaging=23.1 + - pandas=1.5.3 + - parso=0.8.3 + - pcre=8.45 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - polars=0.18.15 + - prompt-toolkit=3.0.39 + - prompt_toolkit=3.0.39 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pyarrow=11.0.0 + - pycairo=1.24.0 + - pycparser=2.21 + - pygments=2.16.1 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyzmq=25.1.0 + - rdkit=2022.03.2 + - re2=2022.04.01 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - seaborn=0.12.2 + - setuptools=68.0.0 + - six=1.16.0 + - snappy=1.1.9 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - stack_data=0.6.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - traitlets=5.9.0 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - utf8proc=2.6.1 + - wcwidth=0.2.6 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zeromq=4.3.4 + - zipp=3.16.2 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - patsy==0.5.3 + - 
protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 From 2a767eaef137a6934a2a6ff8ec59c13851823179 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 23 Aug 2023 09:21:14 -0700 Subject: [PATCH 061/254] update preprocess --- preprocess_new.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/preprocess_new.py b/preprocess_new.py index 41cc63c..c9162e0 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -3,6 +3,7 @@ import sys import os import numpy as np +import polars as pl #import torch #import torch.utils.data as du #from torch.autograd import Variable @@ -319,7 +320,8 @@ def prep_input(params): 'sample_id': response_df['sample_id'].values}) comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id') comb_data_mtx['response'] = response_df[params['metric']].values - comb_data_mtx.to_csv(params[i + '_data'], sep = '\t', header= True, index=False) + comb_data_mtx = comb_data_mtx.dropna() + pl.from_pandas(comb_data_mtx).write_csv(params[i + '_data'], separator = '\t', has_header = True) def run_ssgsea(params): From 4564d8ebfd12c6ec068d6f6397117bd4e7bf7f98 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 30 Aug 2023 13:53:03 -0700 Subject: [PATCH 062/254] update infer.sh --- PathDSP/FNN_new.py | 21 +--- PathDSP/infer.sh | 86 ++++++++++++++ infer.py | 285 ++++++++------------------------------------- 3 files changed, 138 insertions(+), 254 deletions(-) create mode 100644 PathDSP/infer.sh diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index 8349072..00c8879 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -184,7 +184,6 @@ def main(params): # test_df = pd.read_csv(params['test_data'], header=0, index_col=[0,1], sep="\t") train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() val_df = pl.read_csv(params['val_data'], separator = "\t").to_pandas() - test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() # shuffle #sdf = skut.shuffle(df, random_state=params["seed_int"]) @@ -217,31 +216,25 @@ def main(params): # get train/test splits Xtrain_arr = train_df.iloc[:, 0:-1].values Xvalid_arr = val_df.iloc[:, 0:-1].values - Xtest_arr = test_df.iloc[:, 0:-1].values ytrain_arr = train_df.iloc[:, -1].values yvalid_arr = val_df.iloc[:, -1].values - ytest_arr = test_df.iloc[:, -1].values # get train/valid splits from train #Xtrain_arr, Xvalid_arr, ytrain_arr, yvalid_arr = skms.train_test_split(Xtrain_arr, ytrain_arr, # test_size=0.1, random_state=params['seed_int']) #print(' train={:}, valid={:}, test={:}'.format(Xtrain_arr.shape, Xvalid_arr.shape, Xtest_arr.shape)) # prepare dataframe for output - ytest_df = test_df.iloc[:, -1].to_frame() + #ytest_df = test_df.iloc[:, -1].to_frame() # convert to numpy array Xtrain_arr = np.array(Xtrain_arr).astype('float32') Xvalid_arr = np.array(Xvalid_arr).astype('float32') - Xtest_arr = np.array(Xtest_arr).astype('float32') ytrain_arr = np.array(ytrain_arr).astype('float32') yvalid_arr = np.array(yvalid_arr).astype('float32') - ytest_arr = np.array(ytest_arr).astype('float32') # create mini-batch train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) - test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) train_dl = tchud.DataLoader(train_dataset, 
batch_size=batch_size, shuffle=True) valid_dl = tchud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) - test_dl = tchud.DataLoader(test_dataset, batch_size=batch_size, shuffle=False) # initial weight def init_weights(m): if type(m) == tch.nn.Linear: @@ -254,16 +247,6 @@ def init_weights(m): # fit data with model print('start training process') trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) - start = datetime.now() - prediction_list = predict(trained_net, test_dl, device) - print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) - # evaluation metrics - mse = skmts.mean_squared_error(ytest_arr, prediction_list) - rmse = np.sqrt(mse) - r2_pred = r2_score(ytest_arr, prediction_list) - loss_pred = pd.DataFrame({'metric': ['rmse', 'r2'], - 'value': [rmse, r2_pred]}) - loss_pred.to_csv(params['data_dir'] + '/Loss_pred.txt', header=True, index=False, sep="\t") # if rmse <= best_rmse: # best_rmse = rmse # best_fold = n_fold @@ -289,7 +272,6 @@ def init_weights(m): 'valid loss': valid_loss_list, 'valid r2': valid_r2_list}) - ytest_df['prediction'] = prediction_list #loss_df_list.append(loss_df) #ytest_df_list.append(ytest_df) # end of fold @@ -298,7 +280,6 @@ def init_weights(m): # save to output #all_ytest_df = pd.concat(ytest_df_list, axis=0) #all_loss_df = pd.concat(loss_df_list, axis=0) - ytest_df.to_csv(params['data_dir'] + '/Prediction.txt', header=True, index=True, sep="\t") loss_df.to_csv(params['data_dir'] + '/Loss.txt', header=True, index=False, sep="\t") # if params['shap_bool'] == True: # all_shap_df = pd.concat(shap_df_list, axis=0) diff --git a/PathDSP/infer.sh b/PathDSP/infer.sh new file mode 100644 index 0000000..aa14ebe --- /dev/null +++ b/PathDSP/infer.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=infer.py + +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! 
-f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi + +if [ $# -lt 2 ]; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + +if [ -d ${CANDLE_DATA_DIR} ]; then + if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then + echo "using data from ${CANDLE_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${CANDLE_DATA_DIR}" + fi +fi + +export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} +FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" +echo $FULL_DATA_DIR + +if [ -d ${FULL_DATA_DIR} ]; then + if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then + echo "using data from ${FULL_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" + fi +else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" +fi + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD diff --git a/infer.py b/infer.py index da26b33..babd421 100755 --- a/infer.py +++ b/infer.py @@ -1,266 +1,83 @@ -import os import candle -import pandas as pd -import torch -import torchvision +import os +import sys +#import json +#from json import JSONEncoder +from preprocess_new import mkdir, preprocess import numpy as np -import networkx as nx -import networkx.algorithms.components.connected as nxacc -import networkx.algorithms.dag as nxadag -import torch.utils.data as du -from torch.autograd import Variable -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torchmetrics.functional import mean_absolute_error -from scipy.stats import spearmanr -import torch.nn as nn -import torch.nn.functional as F -#from code.predict_drugcell import main -import sklearn -from code.utils.util import * -from code.drugcell_NN import * -from code.utils.util import load_mapping -from code.utils.util import load_train_data -from code.utils.util import build_input_vector -from code.utils.util import pearson_corr -from code.utils.util import prepare_predict_data -from time import time - -file_path = os.path.dirname(os.path.realpath(__file__)) -print(file_path) - -# Just because the tensorflow warnings are a bit verbose -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - -# This should be set outside as a user environment variable -os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import sklearn.metrics as skmts 
+#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") +import FNN_new -# additional definitions -additional_definitions = [ - { - "name": "batchsize", - "type": int, - "help": "...", - }, - { - "name": "gene2id", - "type": str, - "help": "path to gene2id file", - }, - { - "name": "drug2id", - "type": str, - "help": "path to drug to ID file", - }, - { - "name": "cell2id", - "type": str, - "help": "Path to cell 2 id file", - }, - { - "name": "hidden", - "type": str, - "help": "string to indicate hidden output layer ", - }, - { - "name": "cuda", - "type": int, - "help": "CUDA ID", - }, - { - "name": "result", - "type": str, - "help": "result file name", - }, -] +file_path = os.path.dirname(os.path.realpath(__file__)) +required = None +additional_definitions = None -# required definitions -required = [ - "genotype", - "fingerprint", -] # initialize class -class DrugCell_candle(candle.Benchmark): +class PathDSP_candle(candle.Benchmark): def set_locals(self): - """ + ''' Functionality to set variables specific for the benchmark - required: set of required parameters for the benchmark. - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. - """ + ''' if required is not None: self.required = set(required) if additional_definitions is not None: - self.additional_definisions = additional_definitions - + self.additional_definitions = additional_definitions def initialize_parameters(): - preprocessor_bmk = DrugCell_candle(file_path, - 'DrugCell_params.txt', + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', 'pytorch', - prog='DrugCell_candle', + prog='PathDSP_candle', desc='Data Preprocessor' ) #Initialize parameters - candle_data_dir = os.getenv("CANDLE_DATA_DIR") gParameters = candle.finalize_parameters(preprocessor_bmk) return gParameters -def load_mapping(map_file): - mapping = {} - with open(map_file) as fin: - for raw_line in fin: - line = raw_line.strip().split() - mapping[line[1]] = int(line[0]) - return mapping - -def load_train_data(drug_data, cell2id_dict, drug2id_dict): - data = [] - label = [] - with open(drug_data) as fin: - for raw_line in fin: - tokens = raw_line.strip().split('\t') - data.append([cell2id_dict[tokens[0]], drug2id_dict[tokens[1]]]) - label.append([float(tokens[2])]) - return data, label - - -def predict_dcell(predict_data, gene_dim, drug_dim, model_file, hidden_folder, - batch_size, result_file, cell_features, drug_features, CUDA_ID,output_dir): - feature_dim = gene_dim + drug_dim - device = torch.device("cuda") - model = torch.load(model_file, map_location='cuda:%d' % CUDA_ID) -# checkpoint = torch.load(model_file, map_location='cuda:%d' % CUDA_ID) - #model = torch.load(model_file, map_location='cuda:0') - model.to(device) -# model.load_state_dict(checkpoint['model_state_dict']) -# optimizer.load_state_dict(checkpoint['optimizer_state_dict']) -# epoch = checkpoint['epoch'] -# loss = checkpoint['loss'] - #model = torch.load(model_file, map_location='cuda:%d' % CUDA_ID) - - predict_feature, predict_label, feature_dict = predict_data - - predict_label_gpu = predict_label.cuda(CUDA_ID) - model.cuda(CUDA_ID) - model.eval() - - test_loader = du.DataLoader(du.TensorDataset(predict_feature,predict_label), batch_size=batch_size, shuffle=False) - model_dir = output_dir - - #Test - test_predict = torch.zeros(0,0).cuda(CUDA_ID) - term_hidden_map = {} - test_loss = 0 - batch_num = 0 - test_loss_list = [] - test_corr_list = [] - test_r2_list = [] - drug_list = [] - 
tissue_list = [] - print("Begin test evaluation") - for i, (inputdata, labels) in enumerate(test_loader): - # Convert torch tensor to Variable - cuda_labels = torch.autograd.Variable(labels.cuda(CUDA_ID)) - features = build_input_vector(inputdata, cell_features, drug_features) - cuda_features = Variable(features.cuda(CUDA_ID), requires_grad=False) - loss = nn.MSELoss() - values = inputdata.cpu().detach().numpy().tolist() - keys = [i for i in feature_dict for x in values if feature_dict [i]== x ] - tissue = [i.split(';')[0] for i in keys] - tissue_list.append(tissue) - drug = [i.split(';')[1] for i in keys] - drug_list.append(drug) - # make prediction for test data - aux_out_map, term_hidden_map = model(cuda_features) - if test_predict.size()[0] == 0: - test_predict = aux_out_map['final'].data - loss_a = loss(test_predict, cuda_labels) - print(loss_a) - test_loss += loss_a.item() - else: - test_predict = torch.cat([test_predict, aux_out_map['final'].data], dim=0) - loss_a = loss(test_predict, cuda_labels) - print(loss_a) - test_loss += loss_a.item() - batch_num += 1 - - predictions = np.array([p.cpu() for preds in test_predict for p in preds] ,dtype = np.float ) - predictions = predictions[0:len(predictions)] - labels = np.array([l.cpu() for label in labels for l in label],dtype = np.float) - labels = labels[0:len(labels)] - test_pearson_a = pearson_corr(torch.Tensor(predictions), torch.Tensor(labels)) - test_spearman_a = spearmanr(labels, predictions)[0] - test_mean_absolute_error = sklearn.metrics.mean_absolute_error(y_true=labels, y_pred=predictions) - test_r2_a = sklearn.metrics.r2_score(y_true=labels, y_pred=predictions) - test_rmse_a = np.sqrt(np.mean((predictions - labels)**2)) - test_loss_a = test_loss / len(test_loader) - epoch_end_time = time() - test_loss_a = test_loss/len(test_loader) - test_loss_list.append(test_loss_a) - test_corr_list.append(test_pearson_a.cpu().detach().numpy()) - test_r2_list.append(test_r2_a) - min_test_loss = test_loss_a - scores = {} - scores['test_loss'] = min_test_loss - scores['test_pcc'] = test_pearson_a.cpu().detach().numpy().tolist() - scores['test_MSE'] = test_mean_absolute_error - scores['test_r2'] = test_r2_a - scores['test_scc'] = test_spearman_a - test_corr = pearson_corr(test_predict, predict_label_gpu) - print("Test pearson corr\t%s\t%.6f" % (model.root, test_corr)) - cols = ['drug', 'tissue', 'test_loss', 'test_corr', 'test_r2'] - metrics_test_df = pd.DataFrame(columns=cols, index=range(len(test_loader))) - metrics_test_df['test_loss'] = test_loss_list - metrics_test_df['test_corr'] = test_corr_list - metrics_test_df['test_r2'] = test_r2_list - loss_results_name = str(result_file+'/test_metrics_results.csv') - metrics_test_df.to_csv(loss_results_name, index=False) - np.savetxt(result_file+'/drugcell.predict', test_predict.cpu().numpy(),'%.4e') - def run(params): - keys_parsing = ["train_data", "test_data", "val_data", - "onto", "genotype_hiddens", "fingerprint", - "genotype", "cell2id","drug2id", "drug_hiddens", - "model_name"] - model_param_key = [] - for key in params.keys(): - if key not in keys_parsing: - model_param_key.append(key) - model_params = {key: params[key] for key in model_param_key} - params['model_params'] = model_params - args = candle.ArgumentStruct(**params) - cell2id_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['cell2id'] - drug2id_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['drug2id'] - gene2id_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + 
params['gene2id'] - genotype_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['genotype'] - fingerprint_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['fingerprint'] - hidden_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['hidden'] - result_path = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['result'] - val_data = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + params['val_data'] - trained_model = params['data_model'] - hidden = params['drug_hiddens'] - batchsize = params['batch_size'] - cell_features = np.genfromtxt(genotype_path, delimiter=',') - drug_features = np.genfromtxt(fingerprint_path, delimiter=',') - CUDA_ID = params['cuda_id'] - num_cells = len(cell2id_path) - num_drugs = len(drug2id_path) - num_genes = len(gene2id_path) - drug_dim = len(drug_features[0,:]) - output_dir = params['output_dir'] - trained_model = os.environ['CANDLE_DATA_DIR'] + "/DrugCell/Improve/Data/" + os.path.join(output_dir) + "/" + "model_final.pt" - print(trained_model) - predict_data = prepare_predict_data(val_data, cell2id_path, drug2id_path) - predict_dcell(predict_data, num_genes, drug_dim, trained_model, hidden_path, batchsize, - result_path, cell_features, drug_features, CUDA_ID, output_dir) + trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) + trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) + trained_net.eval() + test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + FNN_new.myutil.set_seed(params["seed_int"]) + device = FNN_new.myutil.get_device(uth=params["gpu_int"]) + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + test_dataset = FNN_new.mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False) + start = datetime.now() + prediction_list = FNN_new.predict(trained_net, test_dl, device) + print('Inference time :[Finished in {:}]'.format(FNN_new.cal_time(datetime.now(), start))) + # evaluation metrics + mse = skmts.mean_squared_error(ytest_arr, prediction_list) + rmse = np.sqrt(mse) + r2_pred = FNN_new.r2_score(ytest_arr, prediction_list) + loss_pred = pd.DataFrame({'metric': ['rmse', 'r2'], + 'value': [rmse, r2_pred]}) + loss_pred.to_csv(params['data_dir'] + '/Loss_pred.txt', header=True, index=False, sep="\t") + ytest_df = test_df.iloc[:, -1].to_frame() + ytest_df['prediction'] = prediction_list + ytest_df.to_csv(params['data_dir'] + '/Prediction.txt', header=True, index=True, sep="\t") def candle_main(): params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) run(params) if __name__ == "__main__": From 5ed9d0e6c1798ecc9680272e193ba0524b6ac9ff Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Wed, 30 Aug 2023 16:03:38 -0500 Subject: [PATCH 063/254] PathDSP in candle format (#2) * update preprocess script * update preprocess script * add improve_utils script * add nea scripts * update params * add gitignore * EXP processing * updated to integrate with prep_input * add definition file * update .gitignore * update filename for ssGSEA * add FNN_new * add train/infer * update params * add .yml * update params * update conda path * fix conda * update preprocess.sh * update preprocess.sh * update preprocess_new.py * update env * 
update preproce_new.py * update preproce_new.py * update files * update params * fix params * update preproce_new.py * update preprocess_new.py * update preprocess_new.py * update file * update file * update file * update script * add def * add script * update file * update FNN_new * update FNN * update params * fix param * fix bug * add time * update def * update yml * update train.sh * update train.sh * update train.py * update train * fix bug * update file * update file * use polars * update files * update preprocess * update infer.sh --------- Co-authored-by: willherbert27 --- .gitignore | 5 + NetPEA.py | 212 ++++++++++++ PathDSP.def | 55 +++ PathDSP/FNN_new.py | 299 +++++++++++++++++ PathDSP/infer.sh | 86 +++++ PathDSP_params.txt | 55 +-- RWR.py | 162 +++++++++ environment.yml | 231 +++++++++++++ environment_081723.yml | 156 +++++++++ environment_082223.yml | 218 ++++++++++++ improve_utils.py | 735 +++++++++++++++++++++++++++++++++++++++++ infer.py | 84 +++++ preprocess.sh | 59 ++++ preprocess_new.py | 397 ++++++++++++++++++++++ train.py | 73 ++++ train.sh | 86 +++++ 16 files changed, 2894 insertions(+), 19 deletions(-) create mode 100644 .gitignore create mode 100644 NetPEA.py create mode 100644 PathDSP.def create mode 100644 PathDSP/FNN_new.py create mode 100644 PathDSP/infer.sh create mode 100644 RWR.py create mode 100644 environment.yml create mode 100644 environment_081723.yml create mode 100644 environment_082223.yml create mode 100644 improve_utils.py create mode 100755 infer.py create mode 100644 preprocess.sh create mode 100644 preprocess_new.py create mode 100644 train.py create mode 100755 train.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f72ee49 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.ipynb_checkpoints/ +PathDSP/__pycache__/ +__pycache__/ +EDA.ipynb + diff --git a/NetPEA.py b/NetPEA.py new file mode 100644 index 0000000..d7006ef --- /dev/null +++ b/NetPEA.py @@ -0,0 +1,212 @@ +""" +Implementation of NetPEA: pathway enrichment with networks (Liu, 2017) + +Ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5664096/ +zscore >1.65, equivalent to p-value=0.05 +""" + +import os +import sys +import argparse +import numpy as np +import pandas as pd +import multiprocessing as mp +import scipy.stats as scistat +from datetime import datetime + +class NetPEA: + """ + :param rwrDf: dataframe with cell by PPI genes + :param pathwayGMT: pathway database in gmt format + :param permutation: + :param seed: + :param threshold: + """ + def __init__(self, rwrPath, pathwayGMT, log_transform=False, permutation=1000, seed=42, n_cpu=5, out_path='./'): + # load data + self.rwr_path = rwrPath #pd.read_csv(rwrDf, header=0, index_col=0, sep="\t") + self.pathway_gmt = pathwayGMT + self.permutation = int(permutation) + self.seed = int(seed) + self.out_path = out_path + + # settings + np.random.seed(self.seed) + self.n_cpu = int(n_cpu) + if len(self.rwr_path) < self.n_cpu: + self.n_cpu = len(self.rwr_path) + + # prepare pathway genes to save time + print('{:}: collect pathway genes'.format(datetime.now())) + pathway_geneList_dict = self._get_pathway_genes(pathwayGMT) # {pathway: geneList} + # obtain shared genes for calculating score of pathway genes + self.rwrDf = self.rwr_path#pd.read_csv(rwrPath, header=0, index_col=0, sep="\t") + if log_transform == True: + print('log transform input data') + self.rwrDf = np.log(self.rwrDf) + pathway_shareGeneList_dict = self._find_overlaps(self.rwrDf, pathway_geneList_dict) # {pathway: shareGeneList} + # generate random gene 
list for calculating score of random pathway genes + pathway_randomGeneListList_dict = {} + bg_gene_list = self.rwrDf.columns.tolist() # ppi genes + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + pathway_randomGeneListList_dict.update({pathway:[]}) + for p in range(self.permutation): + gene_list = np.random.choice(bg_gene_list, len(shareGeneList)).tolist() + pathway_randomGeneListList_dict[pathway].append(gene_list) + self.pathwayDictList = [pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict] + + # call function + self.netpea_parallel(self.rwrDf, self.pathwayDictList, self.n_cpu, self.out_path) + + def netpea_parallel(self, rwrDf, pathwayDictList, n_cpu, out_path): + # split dataframe + n_partitions = int(n_cpu) + split_list = np.array_split(rwrDf, n_partitions) + # parallel computing + pool = mp.Pool(int(n_cpu)) + df_list = pool.starmap(self.netpea, [(df, pathwayDictList) for df in split_list]) + pool.close() + pool.join() + print('{:}: comple {:} dfs'.format(datetime.now(), len(df_list))) + print(df_list[0]) + + # merge result of all cells and save to file + print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(df_list, axis=0) + zscore_fname = self.out_path + all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + #print(all_cell_zscore_df) + + + def netpea(self, rwrDf, pathwayDictList): + """return dataframe with cell by pathway""" + pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict = pathwayDictList + # convert to dataframe with headers=[pathway, #pathway genes, overlap genes] + pathway_df = self._merge_pathway_dict(pathway_geneList_dict, pathway_shareGeneList_dict) + # collect score of random gene list + print('{:}: collect score of random gene list'.format(datetime.now())) + cell_pathway_bgScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_bgScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, randomGeneListList in pathway_randomGeneListList_dict.items(): + bgScoreList = [rwr_df.loc[randomGeneList].mean() for randomGeneList in randomGeneListList] + cell_pathway_bgScoreList_dict[cell].update({pathway:bgScoreList}) + + # collect score of share gene list + print('{:}: collect score of share gene list'.format(datetime.now())) + cell_pathway_ScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_ScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + score = rwr_df.loc[shareGeneList].mean() + cell_pathway_ScoreList_dict[cell].update({pathway:score}) + # ztest to determin significance + print('{:}: ztest to determin significance'.format(datetime.now())) + zscore_dfs = [] + cell_pathway_zscore_dict = {} # collect zscore for each pathway + cell_pathway_ztest_dict = {} # collect zscore and pvalue for each pathway + for cell in rwrDf.index: + cell_pathway_zscore_dict.update({cell:{}}) + cell_pathway_ztest_dict.update({cell:{}}) + pathway_score_dict = cell_pathway_ScoreList_dict[cell] + pathway_bgList_dict = cell_pathway_bgScoreList_dict[cell] + for pathway in pathway_geneList_dict.keys(): + score = pathway_score_dict[pathway] + bgList = pathway_bgList_dict[pathway] + [zscore, 
pvalue] = self._cal_zscore(score, bgList) + cell_pathway_ztest_dict[cell].update({pathway: [zscore, pvalue]}) + cell_pathway_zscore_dict[cell].update({pathway:zscore}) + # save per-cell zscore + cell_zscore_df = pd.DataFrame(cell_pathway_zscore_dict[cell], index=[cell]) + zscore_dfs.append(cell_zscore_df) + # save per-cell ztest results + cell_bgtest_df = pd.DataFrame(cell_pathway_ztest_dict[cell], index=['zscore', 'pvalue']).T + cell_bgtest_df.index.name = 'pathway' + cell_bgtest_df = cell_bgtest_df.join(pathway_df) + #percell_fname = self.out_path + '.' + cell + '.NetPEA.background_result.txt' + #cell_bgtest_df.to_csv(percell_fname, header=True, index=True, sep="\t") + # merge result of all cells and save to file + #print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(zscore_dfs, axis=0) + #zscore_fname = self.out_path + '.NetPEA.zscore.txt' + #all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + + # clear space + pathwayDictList = [] + return all_cell_zscore_df + + def _merge_pathway_dict(self, pathway_geneList_dict, pathway_shareGeneList_dict): + """return dataframe with headers = [pathway, #pathway genes, overlap genes]""" + pathway_lenG_dict = {pathway: len(geneList) for pathway, geneList in pathway_geneList_dict.items()} + pathway_strG_dict = {pathway: ",".join(geneList) for pathway, geneList in pathway_shareGeneList_dict.items()} + df1 = pd.DataFrame(pathway_lenG_dict.items(), columns=['pathway', '#pathway genes']) + df2 = pd.DataFrame(pathway_strG_dict.items(), columns=['pathway', 'overlap genes']) + return df1.set_index('pathway').join(df2.set_index('pathway')) + + def _find_overlaps(self, rwrDf, pathway_dict): + """return diction with pathway:geneList""" + # create result dictionary + result_dict = {} #pathway:sharedGeneList + # get ppiGenes + ppi_gene_list = rwrDf.columns.tolist() + # find overlaps + for pathway, geneList in pathway_dict.items(): + shareGene_list = sorted(list(set(geneList) & set(ppi_gene_list))) + result_dict.update({pathway:shareGene_list}) + return result_dict + + def _cal_zscore(self, score, scoreList): + """return zscore and pvalue by lookup table""" + if np.std(scoreList) != 0: + zscore = (score - np.mean(scoreList) ) / np.std(scoreList) + pvalue = scistat.norm.sf(abs(zscore)) # not pdf + #print('score={:}, scoreList={:}, zscore={:}, pvalue={:}'.format( + # score, scoreList[:10], zscore, pvalue)) + else: + zscore, pvalue = np.nan, np.nan + return [zscore, pvalue] + + def _cal_similarity_score(self, rwrDf, geneList): + """return similarity score by taking average of rwr for given geneList""" + return rwrDf.loc[geneList].mean() + + def _get_pathway_genes(self, gmt): + """ + Return pathwayStr_geneList_dict + + :param fin: file name to pathway in gmt format + :return pathway_dict: dictionary of pathway as key, genelist as values + """ + pathwayStr_geneList_dict = {} + with open(gmt, 'r') as f: + for line in f: + # extract fields + line = line.strip('\n').split('\t') + pathway_str = line[0] + gene_list = line[2:] + # update to dict + pathwayStr_geneList_dict.update({pathway_str:gene_list}) + return pathwayStr_geneList_dict + + def _df2dict(self, df): + """return 1 by N dataframe to dictionary of N keys""" + return df.to_dict('records')[0] # keys are column names = gene nams + + +if __name__ == "__main__": + # timer + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + start_time = datetime.now() + rwr_df = 'test.txt' #'/repo4/ytang4/PHD/db/GDSC/processed/GDSC.MUTCNV.STRING.RWR.txt' + 
pathway_gmt = '/repo4/ytang4/PHD/db/MSigdb/c2.cp.pid.v7.1.symbols.gmt' + # initiate + cell_pathway_df = NetPEA(rwr_df, pathway_gmt, permutation=3, seed=42, n_cpu=5, out_path='./test_netpea/GDSC') + spend = datetime.strptime(str(datetime.now()), datetimeFormat) - datetime.strptime(str(start_time),datetimeFormat) + print( '[Finished in {:}]'.format(spend) ) + diff --git a/PathDSP.def b/PathDSP.def new file mode 100644 index 0000000..ae656c7 --- /dev/null +++ b/PathDSP.def @@ -0,0 +1,55 @@ +Bootstrap: docker +From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + +%labels + MANTAINER Yuanhang Liu + +%setup + cp ./src/Singularity_gpu_fix.sh $SINGULARITY_ROOTFS + # add local url of this repository for testing + + +%environment + PATH=$PATH:/usr/local/PathDSP + MODEL_DIR=/usr/local/PathDSP + CANDLE_DATA_DIR=/candle_data_dir + +%post + apt-get update -y + apt-get install wget -y + apt-get install -y gnupg + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC + + apt-get install build-essential -y + apt-get install git -y + apt-get install vim -y + apt-get install subversion -y + + # install gpu fix and clean up + cd / + chmod +x Singularity_gpu_fix.sh + ./Singularity_gpu_fix.sh + rm Singularity_gpu_fix.sh + + # these three need to be compiled and linked to the cuda libs. + # at the moment, what works for me is to build these in a + # singularity shell in a sandbox with the --nv flag to singularity set. + + + # create default internal candle_data_dir, map external candle_data_dir here + mkdir /candle_data_dir + + #install python modules and model prerequites + cd /usr/local + git clone -b develop https://github.com/Liuy12/PathDSP.git + cd PathDSP + + # download conda + + /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ + #/opt/conda/bin/conda activate PathDSP_env + /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop + + #cp *.sh /usr/local/bin + chmod a+x /usr/local/PathDSP/*.sh diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py new file mode 100644 index 0000000..00c8879 --- /dev/null +++ b/PathDSP/FNN_new.py @@ -0,0 +1,299 @@ +""" +Train a neural network for regression task: + cv: 10 + batch size: 8 + initializer: He normal initializer + optimizer: AdamMax + learning rate: 0.0004 + loss: RMSE + +Calculate RMSE at once, Oct. 
3, 2020 revised +""" + + +import argparse +import numpy as np +import pandas as pd +import scipy.stats as scistat +from datetime import datetime + +import sklearn.preprocessing as skpre +import sklearn.model_selection as skms +import sklearn.metrics as skmts +import sklearn.utils as skut + +import torch as tch +import torch.utils.data as tchud + +import myModel as mynet +import myFit as myfit +import myDataloader as mydl +import myDatasplit as mysplit +import myUtility as myutil +#import myPlotter as myplot +import myMetrics as mymts +import polars as pl + +#import shap as sp + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=30, verbose=True) # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # train result + train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) 
in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load('checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + +def predict(net, test_dl, device): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + # return + return prediction_list + + +def main(params): + start_time = datetime.now() + # load data + print('loadinig data') + # train_df = pd.read_csv(params['train_data'], header=0, index_col=[0,1], sep="\t") + # val_df = pd.read_csv(params['val_data'], header=0, index_col=[0,1], sep="\t") + # test_df = pd.read_csv(params['test_data'], header=0, index_col=[0,1], sep="\t") + train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() + val_df = pl.read_csv(params['val_data'], separator = "\t").to_pandas() + + # shuffle + #sdf = skut.shuffle(df, random_state=params["seed_int"]) + + # set parameters + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=params["gpu_int"]) + #kFold = params["cv_int"] + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + opt_fn = tch.optim.Adam + + # create result list + # loss_df_list = [] + # score_df_list = [] + # ytest_df_list = [] + # shap_df_list = [] + # # train with cross-validation + #kf = skms.KFold(n_splits=kFold, random_state=params['seed_int'], shuffle=True) + #X_df = train_df.iloc[:, 0:-1] + #y_df = train_df.iloc[:, -1] + # save best model with lowest RMSE +# best_rmse = 10000 +# best_model = None +# best_fold = 0 +# # for i, (train_index, test_index) in enumerate(kf.split(X_df, y_df)): + #n_fold = i+1 + #print('Fold={:}/{:}'.format(n_fold, 
params['cv_int'])) + # get train/test splits + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + + # get train/valid splits from train + #Xtrain_arr, Xvalid_arr, ytrain_arr, yvalid_arr = skms.train_test_split(Xtrain_arr, ytrain_arr, + # test_size=0.1, random_state=params['seed_int']) + #print(' train={:}, valid={:}, test={:}'.format(Xtrain_arr.shape, Xvalid_arr.shape, Xtest_arr.shape)) + # prepare dataframe for output + #ytest_df = test_df.iloc[:, -1].to_frame() + # convert to numpy array + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + net.apply(init_weights) + # fit data with model + print('start training process') + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) + # if rmse <= best_rmse: + # best_rmse = rmse + # best_fold = n_fold + # best_model = trained_net + # print('best model so far at fold={:}, rmse={:}'.format(best_fold, best_rmse)) + + + # if params['shap_bool'] == True: + # print('calculate shapely values') + # # random select 100 samples as baseline + # train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + # train_dl = tchud.DataLoader(train_dataset, batch_size=200, shuffle=True) + # background, lbl = next(iter(train_dl)) + # explainer = sp.DeepExplainer(trained_net, background[:100].to(device)) + # shap_arr = explainer.shap_values(tch.from_numpy(Xtest_arr)) + # shap_df = pd.DataFrame(shap_arr, index=ytest_df.index, columns=X_df.columns) + # # append to result + # shap_df_list.append(shap_df) + + # collect result + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + + #loss_df_list.append(loss_df) + #ytest_df_list.append(ytest_df) + # end of fold + #trained_net = None + + # save to output + #all_ytest_df = pd.concat(ytest_df_list, axis=0) + #all_loss_df = pd.concat(loss_df_list, axis=0) + loss_df.to_csv(params['data_dir'] + '/Loss.txt', header=True, index=False, sep="\t") + # if params['shap_bool'] == True: + # all_shap_df = pd.concat(shap_df_list, axis=0) + # all_shap_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.SHAP.txt', header=True, index=True, sep="\t") + + # make train/valid loss plots + best_model = trained_net + tch.save(best_model.state_dict(), params['data_dir'] + '/model.pt') + print( '[Finished in {:}]'.format(myutil.cal_time(datetime.now(), start_time)) ) + # display evaluation metrics of all folds + #mse, rmse, r_square, pccy = mymts.eval_regressor_performance(all_ytest_df, 'response', 'prediction') + + + + +if __name__ == "__main__": + 
main() \ No newline at end of file diff --git a/PathDSP/infer.sh b/PathDSP/infer.sh new file mode 100644 index 0000000..aa14ebe --- /dev/null +++ b/PathDSP/infer.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=infer.py + +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! -f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi + +if [ $# -lt 2 ]; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + +if [ -d ${CANDLE_DATA_DIR} ]; then + if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then + echo "using data from ${CANDLE_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${CANDLE_DATA_DIR}" + fi +fi + +export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} +FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" +echo $FULL_DATA_DIR + +if [ -d ${FULL_DATA_DIR} ]; then + if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then + echo "using data from ${FULL_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" + fi +else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" +fi + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 07d0614..fbe7355 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,23 +1,40 @@ [Global_Params] -data_url="http://drugcell.ucsd.edu/downloads/" -original_data="data.tgz" -CUDA_ID = 0 -load = "drugcell_v1.pt" -train_data = "../data/drugcell_train.txt" -test_data = "../data/drugcell_test.txt" -val_data = "../data/drugcell_val.txt" -onto = "drugcell_ont.txt" + +model_name='PathDSP' +data_url='https://zenodo.org/record/6093818/files/' +improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/raw_data/' +original_data_url='https://zenodo.org/record/7532963/files/' +original_data='input.zip' +gene_set = 'MSigdb.zip' +ppi_data = 'STRING.zip' +drug_target = 'raw_data.zip' +train_data = 'PathDSP_train.txt' +test_data = 'PathDSP_test.txt' +val_data = 'PathDSP_val.txt' +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +#output='Result/' 
+bit_int=128 +permutation_int=3 +metric='auc' +data_type='CTRPv2' +split=0 + + +#Model parameter +seed_int=42 +cpu_int=20 +#cv_int=1 +gpu_int=0 learning_rate = 0.001 -batch_size = 1000 -genotype_hiddens = 6 +batch_size = 12 +eps=0.00001 drug_hiddens='100,50,6' final_hiddens=6 -genotype="cell2mutation.txt" -fingerprint='drug2fingerprint.txt' -cell2id='../data/cell2ind.txt' -drug2id='../data/drug2ind.txt' -output_dir = "MODEL" -epochs=200 -optimizer = "adam" -loss = "mse" -predict="drugcell_all.txt" +epochs=800 +optimizer = 'adam' +loss = 'mse' +improve_analysis='yes' diff --git a/RWR.py b/RWR.py new file mode 100644 index 0000000..5bc82b9 --- /dev/null +++ b/RWR.py @@ -0,0 +1,162 @@ +""" +Return cell by gene probability dataframe + +""" + + +import argparse +import numpy as np +import pandas as pd +import scipy.sparse as scisp +import sklearn.preprocessing as skprc +from datetime import datetime + + +class RWR: + """ + Return probability matrix where columns are PPI genes + + :param ppiPathStr: string representing path to ppi file (with three columns) + :param restartPathStr: string representing path to restart file (i.e., input gene sets) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + :param normalize: string representing normalization method (choices=['l1', 'l2']) + :param weighted: boolean indicating weither to use weighted graph or not (if False, will set weight of all edges to 1) + :param outPathStr: string representing output path + """ + def __init__(self, ppiPathStr, restartPathStr, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True, outPathStr='./'): + # initiating + self.ppiPathStr = ppiPathStr + self.restartPathStr = restartPathStr + self.restartProbFloat = float(restartProbFloat) + self.convergenceFloat = float(convergenceFloat) + self.normalize = normalize + self.weighted = weighted + self.outPathStr = outPathStr + + + + def get_prob(self): + # load PPI graph + print('loading protein-protein interaction network.....') + self.adj_mat, self.name_idx_dict = self.load_graph(self.ppiPathStr, normalize=True, weighted=True) + # mapping dictionary of node index number: node name string + self.idx_name_dict = { idx:name for name, idx in self.name_idx_dict.items() } + + # load restart list (i.e., input gene sets) + print('collecting restart list') + df = pd.read_csv(self.restartPathStr, header=0, sep="\t") + df.columns = ['group', 'gene'] + # collect gene sets by group + grps = df.groupby('group') + grps_dict = {} + for grp in df['group'].unique(): + seed_list = grps.get_group(grp)['gene'].values.tolist() #input gene set + # check if input gene set in ppi and convert name to index number + seed_idx_list = [self.name_idx_dict[i] for i in seed_list if i in self.name_idx_dict.keys()] + # update to dictionary + grps_dict.update({ grp: {'gList':seed_list, 'ppiList':seed_idx_list} }) + + # perform random walk + print('performing random walk.....') + n_grps = len(grps_dict) + grp_list = list(grps_dict.keys()) + grp_prob_dict = {} + n_grp_has_no_ppiList = 0 # number of group has restart list not found on PPI network + for i in range(n_grps): + grp = grp_list[i] + if len(grps_dict[grp]['ppiList']) > 0: # has restart list on PPI network + prob_list = self.run_single_rwr(self.adj_mat, grps_dict[grp]['ppiList'], restartProbFloat=self.restartProbFloat, convergenceFloat=self.convergenceFloat) + + else: + n_grp_has_no_ppiList += 1 + prob_list = [0.0] * 
len(self.name_idx_dict) + + # update to result + grp_prob_dict.update( {grp:prob_list} ) + + # reformat result: dict2fataframe + print('finalizing result of probability matrix.....') + result_df = pd.DataFrame(grp_prob_dict) + result_df = result_df.T + result_df.columns = list(self.name_idx_dict.keys()) + return result_df # probability matrix grp by ppi genes + + + def load_graph(self, ppiPathStr, normalize=True, weighted=True): + """ + Return a graph in adjacency matrix format and its name string and correspoing index number mapping dictionary + + :param ppiPathStr: string representing file name of a graph in edge list format + :param name2index: boolean indicating whether to convert name string to index number or not + :param normalize: boolean indicating whether to perform column-wised normalization + """ + # load data + df = pd.read_pickle(ppiPathStr) + df.columns = ['source', 'target', 'weight'] + + # convert name to index + all_nodes = sorted(list(set( df['source'] ) | set( df['target'] ))) # retrieve name strings of all nodes + + # create name:index mapping dictionary + gnm_gid_dict = { all_nodes[i]:i for i in range(len(all_nodes)) } + + # replace name string with index number + df['source'].update(df['source'].map(gnm_gid_dict)) + df['target'].update(df['target'].map(gnm_gid_dict)) + + # use weighted graph or unweighted graph + if weighted == False: + df['weight'] = 1 # unweighted graph + + # create adjancency matrix + network_matrix = scisp.csr_matrix((df['weight'].values, (df['source'].values, df['target'].values)), + shape=(len(all_nodes), len(all_nodes)), dtype=float) # Create sparse matrix + network_matrix = (network_matrix + network_matrix.T) # Make the ajdacency matrix symmetric + network_matrix.setdiag(0) # Set diagnoals to zero + + # normalization: Normalize the rows of network_matrix because we are multiplying vector by matrix (from left) + if normalize == True: + network_matrix = skprc.normalize(network_matrix, norm='l1', axis=1) + + # return + return network_matrix, gnm_gid_dict + + def run_single_rwr(self, ppiAdjMat, restartList, restartProbFloat=0.5, convergenceFloat=0.00001): + """ + Return + + :param ppiAdjMat: adjacency matrix of protein-protein interaction network + :param restartList: list of restart nodes (i.e., gene list) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + """ + # settings + convergence_criterion_float = float(convergenceFloat) # stops when vector L1 norm drops below 10^(-5) + restartProbFloat = float(restartProbFloat) + residual_float = 1.0 # difference between p^(t + 1) and p^(t) + max_iter = 1000 + + # initialze probability vector for restart nodes + prob_seed_list = [0] * ppiAdjMat.shape[0] + for idx in restartList: + prob_seed_list[idx] = 1.0 #1/float(len(restartList)) + prob_seed_arr = np.array(prob_seed_list) + steady_prob_old = prob_seed_arr + + # RWR + iter_int = 0 + #print('updating probability array.....') + while (residual_float > convergence_criterion_float): + # update vector + steady_prob_new = scisp.csr_matrix.dot(steady_prob_old, ppiAdjMat) + steady_prob_new *= (1 - restartProbFloat) + steady_prob_new += (prob_seed_arr * restartProbFloat) + + # Calculate the residual -- the sum of the absolute + # differences of the new node probability vector minus the old + # diff_norm = np.linalg.norm(np.subtract(p_t_1, p_t), 1) + residual_float = abs(steady_prob_new - steady_prob_old).sum() + steady_prob_old = 
steady_prob_new.copy() + return steady_prob_old + diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..ad9d1d7 --- /dev/null +++ b/environment.yml @@ -0,0 +1,231 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - anyio=3.6.2 + - appdirs=1.4.4 + - argon2-cffi=21.3.0 + - argon2-cffi-bindings=21.2.0 + - asttokens=2.2.1 + - async-lru=2.0.2 + - attrs=23.1.0 + - babel=2.12.1 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.4 + - beautifulsoup4=4.12.2 + - blas=1.0 + - bleach=6.0.0 + - boost=1.78.0 + - boost-cpp=1.78.0 + - bottleneck=1.3.5 + - brotli=1.0.9 + - brotli-bin=1.0.9 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.05.30 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cloudpickle=2.2.1 + - colorama=0.4.6 + - comm=0.1.3 + - contourpy=1.0.7 + - cryptography=39.0.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - defusedxml=0.7.1 + - entrypoints=0.4 + - executing=1.2.0 + - expat=2.5.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - flit-core=3.9.0 + - fontconfig=2.14.1 + - fonttools=4.39.4 + - freetype=2.12.1 + - gettext=0.21.1 + - giflib=5.2.1 + - glib=2.76.3 + - glib-tools=2.76.3 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.2 + - gseapy=1.0.5 + - icu=72.1 + - idna=3.4 + - importlib-metadata=6.6.0 + - importlib_metadata=6.6.0 + - importlib_resources=5.12.0 + - intel-openmp=2023.1.0 + - ipykernel=6.23.1 + - ipython=8.13.2 + - jedi=0.18.2 + - jinja2=3.1.2 + - joblib=1.2.0 + - jpeg=9e + - json5=0.9.5 + - jsonschema=4.17.3 + - jupyter-lsp=2.1.0 + - jupyter_client=8.2.0 + - jupyter_core=4.12.0 + - jupyter_events=0.6.3 + - jupyter_server=2.5.0 + - jupyter_server_terminals=0.4.4 + - jupyterlab=4.0.0 + - jupyterlab_pygments=0.2.2 + - jupyterlab_server=2.22.1 + - kiwisolver=1.4.4 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libblas=3.9.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcblas=3.9.0 + - libdeflate=1.17 + - libexpat=2.5.0 + - libffi=3.4.4 + - libgcc-ng=12.2.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libglib=2.76.3 + - libiconv=1.17 + - libidn2=2.3.4 + - liblapack=3.9.0 + - libllvm11=11.1.0 + - libpng=1.6.39 + - libsodium=1.0.18 + - libstdcxx-ng=12.2.0 + - libtasn1=4.19.0 + - libtiff=4.5.0 + - libunistring=0.9.10 + - libuuid=1.41.5 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.10.4 + - libzlib=1.2.13 + - llvm-openmp=16.0.4 + - llvmlite=0.39.1 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.7.1 + - matplotlib-inline=0.1.6 + - mistune=2.0.5 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - munkres=1.1.4 + - nbclient=0.8.0 + - nbconvert-core=7.4.0 + - nbformat=5.8.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=2.8.4 + - notebook-shim=0.2.3 + - numba=0.56.4 + - numexpr=2.8.4 + - numpy=1.21.6 + - openh264=2.1.1 + - openssl=1.1.1v + - packaging=23.0 + - pandas=1.5.3 + - pandocfilters=1.5.0 + - parso=0.8.3 + - patsy=0.5.3 + - pcre2=10.40 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.0.1 + - pixman=0.40.0 + - pkgutil-resolve-name=1.3.10 + - pooch=1.4.0 + - prometheus_client=0.16.0 + - prompt-toolkit=3.0.38 + - prompt_toolkit=3.0.38 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pycairo=1.23.0 + - 
pycparser=2.21 + - pygments=2.15.1 + - pyopenssl=23.0.0 + - pyparsing=3.0.9 + - pyrsistent=0.19.3 + - pysocks=1.7.1 + - python=3.10.11 + - python-dateutil=2.8.2 + - python-fastjsonschema=2.17.1 + - python-json-logger=2.0.7 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyyaml=6.0 + - pyzmq=25.0.2 + - rdkit=2023.03.1 + - readline=8.2 + - reportlab=3.6.13 + - requests=2.29.0 + - rfc3339-validator=0.1.4 + - rfc3986-validator=0.1.1 + - scikit-learn=1.0.2 + - scipy=1.10.1 + - seaborn=0.12.2 + - seaborn-base=0.12.2 + - send2trash=1.8.2 + - setuptools=66.0.0 + - shap=0.41.0 + - six=1.16.0 + - slicer=0.0.7 + - sniffio=1.3.0 + - soupsieve=2.3.2.post1 + - sqlalchemy=1.4.46 + - sqlite=3.41.2 + - stack_data=0.6.2 + - statsmodels=0.14.0 + - sympy=1.11.1 + - tbb=2021.8.0 + - terminado=0.17.1 + - threadpoolctl=3.1.0 + - tinycss2=1.2.1 + - tk=8.6.12 + - tomli=2.0.1 + - torchvision=0.15.2 + - tornado=6.3.2 + - tqdm=4.65.0 + - traitlets=5.9.0 + - typing_extensions=4.5.0 + - tzdata=2023c + - unicodedata2=15.0.0 + - urllib3=1.26.15 + - wcwidth=0.2.6 + - webencodings=0.5.1 + - websocket-client=1.5.2 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.4.2 + - yaml=0.2.5 + - zeromq=4.3.4 + - zipp=3.15.0 + - zlib=1.2.13 + - zstd=1.5.5 diff --git a/environment_081723.yml b/environment_081723.yml new file mode 100644 index 0000000..870884e --- /dev/null +++ b/environment_081723.yml @@ -0,0 +1,156 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.7.22 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - giflib=5.2.1 + - glib=2.69.1 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - intel-openmp=2023.1.0 + - jbig=2.1 + - jinja2=3.1.2 + - jpeg=9e + - kiwisolver=1.4.4 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libsqlite=3.42.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - pandas=1.5.3 + - pcre=8.45 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - pthread-stubs=0.4 + - pycairo=1.24.0 + - pycparser=2.21 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - 
python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - rdkit=2022.03.2 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - setuptools=68.0.0 + - six=1.16.0 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - packaging==23.1 + - patsy==0.5.3 + - protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 diff --git a/environment_082223.yml b/environment_082223.yml new file mode 100644 index 0000000..0f91f17 --- /dev/null +++ b/environment_082223.yml @@ -0,0 +1,218 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - abseil-cpp=20211102.0 + - arrow-cpp=11.0.0 + - asttokens=2.2.1 + - aws-c-common=0.6.8 + - aws-c-event-stream=0.1.6 + - aws-checksums=0.1.11 + - aws-sdk-cpp=1.8.185 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - c-ares=1.19.0 + - ca-certificates=2023.05.30 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - comm=0.1.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - executing=1.2.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - gflags=2.2.2 + - giflib=5.2.1 + - glib=2.69.1 + - glog=0.5.0 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - grpc-cpp=1.48.2 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - importlib-metadata=6.8.0 + - importlib_metadata=6.8.0 + - intel-openmp=2023.1.0 + - ipykernel=6.25.1 + - ipython=8.14.0 + - jbig=2.1 + - jedi=0.19.0 + - jinja2=3.1.2 + - jpeg=9e + - jupyter_client=8.3.0 + - jupyter_core=4.12.0 + - kiwisolver=1.4.4 + - krb5=1.20.1 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcurl=8.1.1 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libedit=3.1.20221030 + - libev=4.33 + - libevent=2.1.12 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnghttp2=1.52.0 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libprotobuf=3.20.3 + - libsodium=1.0.18 + - libsqlite=3.42.0 + - libssh2=1.10.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libthrift=0.15.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - matplotlib-inline=0.1.6 + - 
mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - orc=1.7.4 + - packaging=23.1 + - pandas=1.5.3 + - parso=0.8.3 + - pcre=8.45 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - polars=0.18.15 + - prompt-toolkit=3.0.39 + - prompt_toolkit=3.0.39 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pyarrow=11.0.0 + - pycairo=1.24.0 + - pycparser=2.21 + - pygments=2.16.1 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyzmq=25.1.0 + - rdkit=2022.03.2 + - re2=2022.04.01 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - seaborn=0.12.2 + - setuptools=68.0.0 + - six=1.16.0 + - snappy=1.1.9 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - stack_data=0.6.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - traitlets=5.9.0 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - utf8proc=2.6.1 + - wcwidth=0.2.6 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zeromq=4.3.4 + - zipp=3.16.2 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - patsy==0.5.3 + - protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 diff --git a/improve_utils.py b/improve_utils.py new file mode 100644 index 0000000..9a7676b --- /dev/null +++ b/improve_utils.py @@ -0,0 +1,735 @@ +import os +import numpy as np +import pandas as pd +from pathlib import Path, PosixPath +from math import sqrt +from scipy import stats +from typing import List, Union, Optional, Tuple + + +fdir = Path(__file__).resolve().parent + + +# ----------------------------------------------------------------------------- +# TODO +# Note! +# We need to decide how this utils file will be provided for each model. +# Meanwhile, place this .py file in the level as your data preprocessing script. +# For example: +# GraphDRP/ +# |_______ preprocess.py +# |_______ improve_utils.py +# | +# | +# ----------------------------------------------------------------------------- + + + +# ----------------------------------------------------------------------------- +# Global variables +# ---------------- +# These are globals for all models +import types +improve_globals = types.SimpleNamespace() + +# TODO: +# This is CANDLE_DATA_DIR (or something...). +# How this is going to be passed to the code? 
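# One possible answer to the TODO above (a sketch, not the behavior committed below):
# derive the root from the CANDLE_DATA_DIR environment variable and fall back to the
# hard-coded default when the variable is unset, e.g.
# improve_globals.main_data_dir = PosixPath(
#     os.environ.get("CANDLE_DATA_DIR", "/candle_data_dir")) / "csa_data"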
+improve_globals.main_data_dir = PosixPath("/candle_data_dir/csa_data/") +# improve_globals.main_data_dir = fdir/"improve_data_dir" +# imp_globals.main_data_dir = fdir/"candle_data_dir" + +# Dir names corresponding to the primary input/output blocks in the pipeline +# {}: input/output +# []: process +# train path: {raw_data} --> [preprocess] --> {ml_data} --> [train] --> {models} +# inference path: {ml_data, models} --> [inference] --> {infer} +improve_globals.raw_data_dir_name = "raw_data" # benchmark data +improve_globals.ml_data_dir_name = "ml_data" # preprocessed data for a specific ML model +improve_globals.models_dir_name = "models" # output from model training +improve_globals.infer_dir_name = "infer" # output from model inference (testing) + +# Secondary dirs in raw_data +improve_globals.x_data_dir_name = "x_data" # feature data +improve_globals.y_data_dir_name = "y_data" # target data +improve_globals.splits_dir_name = "splits" # splits files + +# Column names in the raw data files +# imp_globals.canc_col_name = "CancID" +# imp_globals.drug_col_name = "DrugID" +improve_globals.canc_col_name = "improve_sample_id" # column name that contains the cancer sample ids TODO: rename to sample_col_name +improve_globals.drug_col_name = "improve_chem_id" # column name that contains the drug ids +improve_globals.source_col_name = "source" # column name that contains source/study names (CCLE, GDSCv1, etc.) +improve_globals.pred_col_name_suffix = "_pred" # suffix to predictions col name (example of final col name: auc_pred) + +# Response data file name +improve_globals.y_file_name = "response.tsv" # response data + +# Cancer sample features file names +improve_globals.copy_number_fname = "cancer_copy_number.tsv" # cancer feature +improve_globals.discretized_copy_number_fname = "cancer_discretized_copy_number.tsv" # cancer feature +improve_globals.dna_methylation_fname = "cancer_DNA_methylation.tsv" # cancer feature +improve_globals.gene_expression_fname = "cancer_gene_expression.tsv" # cancer feature +improve_globals.miRNA_expression_fname = "cancer_miRNA_expression.tsv" # cancer feature +improve_globals.mutation_count_fname = "cancer_mutation_count.tsv" # cancer feature +improve_globals.mutation_fname = "cancer_mutation.tsv" # cancer feature +improve_globals.rppa_fname = "cancer_RPPA.tsv" # cancer feature + +# Drug features file names +improve_globals.smiles_file_name = "drug_SMILES.tsv" # drug feature +improve_globals.mordred_file_name = "drug_mordred.tsv" # drug feature +improve_globals.ecfp4_512bit_file_name = "drug_ecfp4_512bit.tsv" # drug feature + +# Globals derived from the ones defined above +improve_globals.raw_data_dir = improve_globals.main_data_dir/improve_globals.raw_data_dir_name # raw_data +improve_globals.ml_data_dir = improve_globals.main_data_dir/improve_globals.ml_data_dir_name # ml_data +improve_globals.models_dir = improve_globals.main_data_dir/improve_globals.models_dir_name # models +improve_globals.infer_dir = improve_globals.main_data_dir/improve_globals.infer_dir_name # infer +# ----- +improve_globals.x_data_dir = improve_globals.raw_data_dir/improve_globals.x_data_dir_name # x_data +improve_globals.y_data_dir = improve_globals.raw_data_dir/improve_globals.y_data_dir_name # y_data +improve_globals.splits_dir = improve_globals.raw_data_dir/improve_globals.splits_dir_name # splits + +# Response +improve_globals.y_file_path = improve_globals.y_data_dir/improve_globals.y_file_name # response.txt + +# Cancers +improve_globals.copy_number_file_path = 
improve_globals.x_data_dir/improve_globals.copy_number_fname # cancer_copy_number.txt +improve_globals.discretized_copy_number_file_path = improve_globals.x_data_dir/improve_globals.discretized_copy_number_fname # cancer_discretized_copy_number.txt +improve_globals.dna_methylation_file_path = improve_globals.x_data_dir/improve_globals.dna_methylation_fname # cancer_DNA_methylation.txt +improve_globals.gene_expression_file_path = improve_globals.x_data_dir/improve_globals.gene_expression_fname # cancer_gene_expression.txt +improve_globals.mirna_expression_file_path = improve_globals.x_data_dir/improve_globals.miRNA_expression_fname # cancer_miRNA_expression.txt +improve_globals.mutation_count_file_path = improve_globals.x_data_dir/improve_globals.mutation_count_fname # cancer_mutation_count.txt +improve_globals.mutation_file_path = improve_globals.x_data_dir/improve_globals.mutation_fname # cancer_mutation.txt +improve_globals.rppa_file_path = improve_globals.x_data_dir/improve_globals.rppa_fname # cancer_RPPA.txt + +# Drugs +improve_globals.smiles_file_path = improve_globals.x_data_dir/improve_globals.smiles_file_name # +improve_globals.mordred_file_path = improve_globals.x_data_dir/improve_globals.mordred_file_name # +improve_globals.ecfp4_512bit_file_path = improve_globals.x_data_dir/improve_globals.ecfp4_512bit_file_name # +# ----------------------------------------------------------------------------- + + +# ------------------------------------- +# Drug response loaders +# ------------------------------------- + +def load_single_drug_response_data( + # source: Union[str, List[str]], + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: at this point, this func implements the loading a single source + df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # import pdb; pdb.set_trace() + if isinstance(split, int): + # Get a subset of samples + ids = load_split_file(source, split, split_type) + df = df.loc[ids] + else: + # Get the full dataset for a given source + df = df[df[improve_globals.source_col_name].isin([source])] + + cols = [improve_globals.source_col_name, + improve_globals.drug_col_name, + improve_globals.canc_col_name, + y_col_name] + df = df[cols] # [source, drug id, cancer id, response] + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(df[[improve_globals.canc_col_name, improve_globals.drug_col_name]].nunique()) + return df + + +def load_single_drug_response_data_v2( + # source: Union[str, List[str]], + source: str, + # split: Union[int, None]=None, + # split_type: Union[str, List[str], None]=None, + split_file_name: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. 
Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: currently, this func implements loading a single data source (CCLE or CTRPv2 or ...) + df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # Get a subset of samples + if isinstance(split_file_name, list) and len(split_file_name) == 0: + raise ValueError("Empty list is passed via split_file_name.") + if isinstance(split_file_name, str): + split_file_name = [split_file_name] + ids = load_split_ids(split_file_name) + df = df.loc[ids] + # else: + # # Get the full dataset for a given source + # df = df[df[improve_globals.source_col_name].isin([source])] + + # # Get a subset of cols + # cols = [improve_globals.source_col_name, + # improve_globals.drug_col_name, + # improve_globals.canc_col_name, + # y_col_name] + # df = df[cols] # [source, drug id, cancer id, response] + + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(f"Unique cells: {df[improve_globals.canc_col_name].nunique()}") + print(f"Unique drugs: {df[improve_globals.drug_col_name].nunique()}") + return df + + +def load_split_ids(split_file_name: Union[str, List[str]]) -> List[int]: + """ Returns list of integers, representing the rows in the response dataset. + Args: + split_file_name (str or list of str): splits file name or list of file names + + Returns: + list: list of integers representing the ids + """ + ids = [] + for fname in split_file_name: + fpath = improve_globals.splits_dir/fname + assert fpath.exists(), f"split_file_name {fname} not found." + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +def load_split_file( + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None) -> List[int]: + """ + Args: + source (str): DRP source name (str) + + Returns: + ids (list): list of id integers + """ + # TODO: used in the old version of the rsp loader + if isinstance(split_type, str): + split_type = [split_type] + + # Check if the split file exists and load + ids = [] + for st in split_type: + fpath = improve_globals.splits_dir/f"{source}_split_{split}_{st}.txt" + assert fpath.exists(), f"Splits file not found: {fpath}" + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +# ------------------------------------- +# Omic feature loaders +# ------------------------------------- + +""" +Notes about omics data. + +Omics data files are multi-level tables with several column types (generally 3 +or 4), each contains gene names using a different gene identifier system: +Entrez ID, Gene Symbol, Ensembl ID, TSS + +The column levels are not organized in the same order across the different +omic files. + +The level_map dict, in each loader function, encodes the column level and the +corresponding identifier systems. 
+ +For example, in the copy number file the level_map is: +level_map = {"Entrez":0, "Gene_Symbol": 1, "Ensembl": 2} +""" + +def set_col_names_in_multilevel_dataframe( + df: pd.DataFrame, + level_map: dict, + gene_system_identifier: Union[str, List[str]]="Gene_Symbol") -> pd.DataFrame: + """ Util function that supports loading of the omic data files. + Returns the input dataframe with the multi-level column names renamed as + specified by the gene_system_identifier arg. + + Args: + df (pd.DataFrame): omics dataframe + level_map (dict): encodes the column level and the corresponding identifier systems + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: the input dataframe with the specified multi-level column names + """ + df = df.copy() + + level_names = list(level_map.keys()) + level_values = list(level_map.values()) + n_levels = len(level_names) + + if isinstance(gene_system_identifier, list) and len(gene_system_identifier) == 1: + gene_system_identifier = gene_system_identifier[0] + + # print(gene_system_identifier) + # import pdb; pdb.set_trace() + if isinstance(gene_system_identifier, str): + if gene_system_identifier == "all": + df.columns = df.columns.rename(level_names, level=level_values) # assign multi-level col names + else: + df.columns = df.columns.get_level_values(level_map[gene_system_identifier]) # retian specific column level + else: + assert len(gene_system_identifier) <= n_levels, f"'gene_system_identifier' can't contain more than {n_levels} items." + set_diff = list(set(gene_system_identifier).difference(set(level_names))) + assert len(set_diff) == 0, f"Passed unknown gene identifiers: {set_diff}" + kk = {i: level_map[i] for i in level_map if i in gene_system_identifier} + # print(list(kk.keys())) + # print(list(kk.values())) + df.columns = df.columns.rename(list(kk.keys()), level=kk.values()) # assign multi-level col names + drop_levels = list(set(level_map.values()).difference(set(kk.values()))) + df = df.droplevel(level=drop_levels, axis=1) + return df + + +def load_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns copy number data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.copy_number_file_path, sep=sep, index_col=0, header=header) + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + # Test the func + # d0 = set_col_names_in_multilevel_dataframe(df, "all") + # d1 = set_col_names_in_multilevel_dataframe(df, "Ensembl") + # d2 = set_col_names_in_multilevel_dataframe(df, ["Ensembl"]) + # d3 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Gene_Symbol", "Ensembl"]) + # d4 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Ensembl"]) + # d5 = set_col_names_in_multilevel_dataframe(df, ["Blah", "Ensembl"]) + if verbose: + print(f"Copy number data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_discretized_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns discretized copy number data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.discretized_copy_number_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Discretized copy number data: {df.shape}") + + return df + + +def load_dna_methylation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns methylation data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + level_map = {"Ensembl": 2, "Entrez": 1, "Gene_Symbol": 3, "TSS": 0} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.dna_methylation_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"DNA methylation data: {df.shape}") + # print(df.dtypes) # TODO: many column are of type 'object' + # print(df.dtypes.value_counts()) + return df + + +def load_gene_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns gene expression data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 0, "Entrez": 1, "Gene_Symbol": 2} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.gene_expression_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Gene expression data: {df.shape}") + return df + + +def load_mirna_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_mutation_count_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns mutation count data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.mutation_count_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Mutation count data: {df.shape}") + + return df + + +def load_mutation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_rppa_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + + + +# ------------------------------------- +# Drug feature loaders +# ------------------------------------- + +def load_smiles_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + df = pd.read_csv(improve_globals.smiles_file_path, sep=sep) + + # TODO: updated this after we update the data + df.columns = ["improve_chem_id", "smiles"] + + if verbose: + print(f"SMILES data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_mordred_descriptor_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Mordred descriptors data. + """ + df = pd.read_csv(improve_globals.mordred_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + if verbose: + print(f"Mordred descriptors data: {df.shape}") + return df + + +def load_morgan_fingerprint_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Morgan fingerprints data. 
+ """ + df = pd.read_csv(improve_globals.ecfp4_512bit_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + return df + + +# ------------------------------------- +# Save data functions +# ------------------------------------- + +def save_preds(df: pd.DataFrame, y_col_name: str, + outpath: Union[str, PosixPath], round_decimals: int=4) -> None: + """ Save model predictions. + This function throws errors if the dataframe does not include the expected + columns: canc_col_name, drug_col_name, y_col_name, y_col_name + "_pred" + + Args: + df (pd.DataFrame): df with model predictions + y_col_name (str): drug response col name (e.g., IC50, AUC) + outpath (str or PosixPath): outdir to save the model predictions df + round (int): round response values + + Returns: + None + """ + # Check that the 4 columns exist + assert improve_globals.canc_col_name in df.columns, f"{improve_globals.canc_col_name} was not found in columns." + assert improve_globals.drug_col_name in df.columns, f"{improve_globals.drug_col_name} was not found in columns." + assert y_col_name in df.columns, f"{y_col_name} was not found in columns." + pred_col_name = y_col_name + f"{improve_globals.pred_col_name_suffix}" + assert pred_col_name in df.columns, f"{pred_col_name} was not found in columns." + + # Round + df = df.round({y_col_name: round_decimals, pred_col_name: round_decimals}) + + # Save preds df + df.to_csv(outpath, index=False) + return None + + + + + + +# ================================================================== +# Leftovers +# ================================================================== +def get_data_splits( + src_raw_data_dir: str, + splitdir_name: str, + split_file_name: str, + rsp_df: pd.DataFrame): + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + splitdir = src_raw_data_dir/splitdir_name + if len(split_file_name) == 1 and split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + + """ + # Method 1 + splitdir = Path(os.path.join(src_raw_data_dir))/"splits" + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + outdir_name = "full" + else: + # Check if the split file exists and load + ids = [] + split_id_str = [] # e.g. split_5 + split_type_str = [] # e.g. tr, vl, te + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + # Get the ids + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + # Get the name + fname_sep = fname.split("_") + split_id_str.append("_".join([s for s in fname_sep[:2]])) + split_type_str.append(fname_sep[2]) + assert len(set(split_id_str)) == 1, "Data splits must be from the same dataset source." 
+ split_id_str = list(set(split_id_str))[0] + split_type_str = "_".join([x for x in split_type_str]) + outdir_name = f"{split_id_str}_{split_type_str}" + ML_DATADIR = main_data_dir/"ml_data" + root = ML_DATADIR/f"data.{args.source_data_name}"/outdir_name # ML data + os.makedirs(root, exist_ok=True) + """ + + """ + # Method 2 + splitdir = src_raw_data_dir/args.splitdir_name + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + """ + return ids + + +def get_common_samples( + df1: pd.DataFrame, + df2: pd.DataFrame, + ref_col: str) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Args: + df1, df2 (pd.DataFrame): dataframes + ref_col (str): the ref column to find the common values + + Returns: + df1, df2 + + Example: + TODO + """ + # Retain (canc, drug) response samples for which we have omic data + common_ids = list(set(df1[ref_col]).intersection(df2[ref_col])) + # print(df1.shape) + df1 = df1[ df1[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df1.shape) + # print(df2.shape) + df2 = df2[ df2[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df2.shape) + return df1, df2 + + +def read_df(fpath: str, sep: str=","): + """ + IMPROVE-specific func. + Load a dataframe. Supports csv and parquet files. + sep : the sepator in the csv file + """ + # TODO: this func might be available in candle + assert Path(fpath).exists(), f"File {fpath} was not found." + if "parquet" in str(fpath): + df = pd.read_parquet(fpath) + else: + df = pd.read_csv(fpath, sep=sep) + return df + + +def get_subset_df(df: pd.DataFrame, ids: list) -> pd.DataFrame: + """ Get a subset of the input dataframe based on row ids.""" + df = df.loc[ids] + return df + + +def rmse(y, f): + rmse = sqrt(((y - f)**2).mean(axis=0)) + return rmse + + +def mse(y, f): + mse = ((y - f)**2).mean(axis=0) + return mse + + +def pearson(y, f): + rp = np.corrcoef(y, f)[0, 1] + return rp + + +def spearman(y, f): + rs = stats.spearmanr(y, f)[0] + return rs + + +def r_square(y_true, y_pred): + from sklearn.metrics import r2_score + return r2_score(y_true, y_pred) diff --git a/infer.py b/infer.py new file mode 100755 index 0000000..babd421 --- /dev/null +++ b/infer.py @@ -0,0 +1,84 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from preprocess_new import mkdir, preprocess +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import sklearn.metrics as skmts +#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") +import FNN_new + + +file_path = os.path.dirname(os.path.realpath(__file__)) +required = None +additional_definitions = None + + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
+ ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def run(params): + trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) + trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) + trained_net.eval() + test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + FNN_new.myutil.set_seed(params["seed_int"]) + device = FNN_new.myutil.get_device(uth=params["gpu_int"]) + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + test_dataset = FNN_new.mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False) + start = datetime.now() + prediction_list = FNN_new.predict(trained_net, test_dl, device) + print('Inference time :[Finished in {:}]'.format(FNN_new.cal_time(datetime.now(), start))) + # evaluation metrics + mse = skmts.mean_squared_error(ytest_arr, prediction_list) + rmse = np.sqrt(mse) + r2_pred = FNN_new.r2_score(ytest_arr, prediction_list) + loss_pred = pd.DataFrame({'metric': ['rmse', 'r2'], + 'value': [rmse, r2_pred]}) + loss_pred.to_csv(params['data_dir'] + '/Loss_pred.txt', header=True, index=False, sep="\t") + ytest_df = test_df.iloc[:, -1].to_frame() + ytest_df['prediction'] = prediction_list + ytest_df.to_csv(params['data_dir'] + '/Prediction.txt', header=True, index=True, sep="\t") + + +def candle_main(): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/preprocess.sh b/preprocess.sh new file mode 100644 index 0000000..a20dd1d --- /dev/null +++ b/preprocess.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +######################################################################### +### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. 
+######################################################################### + + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### + +CANDLE_MODEL=/usr/local/PathDSP/preprocess_new.py + +if [ $# -lt 2 ] ; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD diff --git a/preprocess_new.py b/preprocess_new.py new file mode 100644 index 0000000..c9162e0 --- /dev/null +++ b/preprocess_new.py @@ -0,0 +1,397 @@ +#!/homes/ac.rgnanaolivu/miniconda3/envs/rohan_python/bin/python + +import sys +import os +import numpy as np +import polars as pl +#import torch +#import torch.utils.data as du +#from torch.autograd import Variable +#import torch.nn as nn +#import torch.nn.functional as F +#from code.drugcell_NN import * +import argparse +import numpy as np +import pandas as pd +import candle +#import time +#import logging +#import networkx as nx +#import networkx.algorithms.components.connected as nxacc +#import networkx.algorithms.dag as nxadag +#from pathlib import Path +from functools import reduce +import improve_utils +# import RDKit +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +# import NetPEA modules +import RWR as rwr +import NetPEA as pea +#import gsea module +import gseapy as gp + + + + +file_path = os.path.dirname(os.path.realpath(__file__)) +#fdir = Path('__file__').resolve().parent +#source = 'csa_data/raw_data/splits/' +required = None +additional_definitions = None + +# This should be set outside as a user environment variable +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
+ ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def mkdir(directory): + directories = directory.split('/') + + folder = '' + for d in directories: + folder += d + '/' + if not os.path.exists(folder): + print('creating folder: %s'%folder) + os.mkdir(folder) + + +def preprocess(params, data_dir): + print(os.environ['CANDLE_DATA_DIR']) + #requirements go here + #keys_parsing = ['output_dir', 'hidden', 'result', 'metric', 'data_type'] + if not os.path.exists(data_dir): + mkdir(data_dir) + params['data_dir'] = data_dir + #args = candle.ArgumentStruct(**params) + for i in ['train_data', 'test_data', 'val_data', 'drug_bits_file', 'dgnet_file', + 'mutnet_file', 'cnvnet_file', 'exp_file']: + params[i] = params['data_dir'] + '/' + params[i] + return(params) + +def download_anl_data(params): + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') + splits_dir = os.path.join(csa_data_folder, 'splits') + x_data_dir = os.path.join(csa_data_folder, 'x_data') + y_data_dir = os.path.join(csa_data_folder, 'y_data') + + if not os.path.exists(csa_data_folder): + print('creating folder: %s'%csa_data_folder) + os.makedirs(csa_data_folder) + mkdir(splits_dir) + mkdir(x_data_dir) + mkdir(y_data_dir) + + for improve_file in ['CCLE_all.txt', + 'CCLE_split_' + str(params['split']) + '_test.txt', + 'CCLE_split_' + str(params['split']) + '_train.txt', + 'CCLE_split_' + str(params['split']) + '_val.txt', + 'CTRPv2_all.txt', + 'CTRPv2_split_' + str(params['split']) + '_test.txt', + 'CTRPv2_split_' + str(params['split']) + '_train.txt', + 'CTRPv2_split_' + str(params['split']) + '_val.txt', + 'gCSI_all.txt', + 'GDSCv1_all.txt', + 'GDSCv2_all.txt' + ]: + url_dir = params['improve_data_url'] + '/splits/' + candle.file_utils.get_file(improve_file, url_dir + improve_file, + datadir=splits_dir, + cache_subdir=None) + + for improve_file in ['cancer_mutation_count.tsv', 'drug_SMILES.tsv', 'drug_info.tsv', 'cancer_discretized_copy_number.tsv', 'cancer_gene_expression.tsv']: + url_dir = params['improve_data_url'] + '/x_data/' + candle.file_utils.get_file(fname=improve_file, origin=url_dir + improve_file, + datadir=x_data_dir, + cache_subdir=None) + + url_dir = params['improve_data_url'] + '/y_data/' + response_file = 'response.tsv' + candle.file_utils.get_file(fname=response_file, origin=url_dir + response_file, + datadir=y_data_dir, + cache_subdir=None) + + ## get gene-set data and string data + for db_file in [params['gene_set'], params['ppi_data'], params['drug_target']]: + candle.file_utils.get_file(db_file, params['data_url'] + '/' +db_file, + datadir=params['data_dir'], + cache_subdir=None) + + + + +# set timer +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def download_author_data(params): + data_download_filepath = candle.get_file(params['original_data'], params['original_data_url'] + '/' + params['original_data'], + datadir = params['data_dir'], + cache_subdir = None) + 
print('download_path: {}'.format(data_download_filepath)) + + +def smile2bits(params): + start = datetime.now() + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + smile_df = improve_utils.load_smiles_data() + smile_df.columns = ['drug', 'smile'] + smile_df = smile_df.drop_duplicates(subset=['drug'], keep='first').set_index('drug') + smile_df = smile_df.loc[smile_df.index.isin(rs_all['improve_chem_id']),] + bit_int = params['bit_int'] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row['smile'] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list( AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int) ) + #drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append( tuple([drug]+mbit) ) + if len(mbit) == bit_int: + n_drug+=1 + print('total {:} drugs with bits'.format(n_drug)) + # convert dict to dataframe + colname_list = ['drug'] + ['mBit_'+str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + #drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + #drug_mbit_df.index.name = 'drug' + print('unique drugs={:}'.format(len(drug_mbit_df['drug'].unique()))) + # save to file + drug_mbit_df.to_csv(params['drug_bits_file'], header=True, index=False, sep='\t') + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) + +def times_expression(rwr, exp): + ''' + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + ''' + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns)&set(exp.columns))) + + if len(cell_list) == 0: + print('ERROR! no overlapping cell lines') + sys.exit(1) + if len(gene_list) == 0: + print('ERROR! 
no overlapping genes') + sys.exit(1) + + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list]*exp.loc[cell_list, gene_list] + + # concat with other gene + out_gene_list = list(set(rwr.columns)-set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params['data_dir'] + '/STRING/9606.protein_name.links.v11.0.pkl' + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + log_transform = False + permutation_int = params['permutation_int'] + seed_int = params['seed_int'] + cpu_int = params['cpu_int'] + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + if dtype == 'DGnet': + drug_info = pd.read_csv(csa_data_folder + '/x_data/drug_info.tsv', sep='\t') + drug_info['NAME'] = drug_info['NAME'].str.upper() + target_info = pd.read_csv(params['data_dir'] + '/data/DB.Drug.Target.txt', sep = '\t') + target_info = target_info.rename(columns={'drug': 'NAME'}) + combined_df = pd.merge(drug_info, target_info, how = 'left', on = 'NAME').dropna(subset=['gene']) + combined_df = combined_df.loc[combined_df['improve_chem_id'].isin(rs_all['improve_chem_id']),] + restart_path = params['data_dir'] + '/drug_target.txt' + combined_df.iloc[:,-2:].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['dgnet_file'] + elif dtype == 'MUTnet': + mutation_data = improve_utils.load_mutation_count_data(gene_system_identifier='Gene_Symbol') + mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars='improve_sample_id').loc[lambda x: x['value'] > 0] + mutation_data = mutation_data.loc[mutation_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] + restart_path = params['data_dir'] + '/mutation_data.txt' + mutation_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['mutnet_file'] + else: + cnv_data = improve_utils.load_discretized_copy_number_data(gene_system_identifier='Gene_Symbol') + cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars='improve_sample_id').loc[lambda x: x['value'] != 0] + cnv_data = cnv_data.loc[cnv_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] + restart_path = params['data_dir'] + '/cnv_data.txt' + cnv_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['cnvnet_file'] + # perform Random Walk + print(datetime.now(), 'performing random walk with restart') + rwr_df = rwr.RWR(ppi_path, restart_path, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True).get_prob() + # multiply with gene expression + if multiply_expression: + print(datetime.now(), 'multiplying gene expression with random walk probability for genes were expressed') + exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + rwr_df = times_expression(rwr_df, exp_df) + #rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), 'performing network-based pathway enrichment') + cell_pathway_df = pea.NetPEA(rwr_df, pathway_path, log_transform=log_transform, permutation=permutation_int, 
seed=seed_int, n_cpu=cpu_int, out_path=outpath) + print( '[Finished in {:}]'.format(cal_time(datetime.now(), start_time)) ) + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params['drug_bits_file'], sep = '\t', index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={'drug': 'drug_id'}) + DGnet = pd.read_csv(params['dgnet_file'], sep='\t', index_col=0) + DGnet = DGnet.add_suffix('_dgnet').reset_index().rename(columns={'index': 'drug_id'}) + CNVnet = pd.read_csv(params['cnvnet_file'], sep= '\t',index_col=0) + CNVnet = CNVnet.add_suffix('_cnvnet').reset_index().rename(columns={'index': 'sample_id'}) + MUTnet = pd.read_csv(params['mutnet_file'], sep='\t',index_col=0) + MUTnet = MUTnet.add_suffix('_mutnet').reset_index().rename(columns={'index': 'sample_id'}) + EXP = pd.read_csv(params['exp_file'], sep = '\t', index_col=0) + EXP = EXP.add_suffix('_exp').reset_index().rename(columns={'index': 'sample_id'}) + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + split_type=['train', 'test', 'val'], + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + # Extract relevant IDs + + common_drug_ids = reduce(np.intersect1d, (drug_mbit_df['drug_id'], DGnet['drug_id'], response_df['drug_id'])) + common_sample_ids = reduce(np.intersect1d, (CNVnet['sample_id'], MUTnet['sample_id'], EXP['sample_id'] , response_df['sample_id'])) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + (response_df['sample_id'].isin(common_sample_ids)), :] + drug_mbit_df = drug_mbit_df.loc[drug_mbit_df['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + DGnet = DGnet.loc[DGnet['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + CNVnet = CNVnet.loc[CNVnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + MUTnet = MUTnet.loc[MUTnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + EXP = EXP.loc[EXP['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + for i in ['train', 'test', 'val']: + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + split_type=i, + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + (response_df['sample_id'].isin(common_sample_ids)), :] + comb_data_mtx = pd.DataFrame({'drug_id': response_df['drug_id'].values, + 'sample_id': response_df['sample_id'].values}) + comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id') + comb_data_mtx['response'] = response_df[params['metric']].values + comb_data_mtx = comb_data_mtx.dropna() + pl.from_pandas(comb_data_mtx).write_csv(params[i + '_data'], separator = '\t', has_header = True) + + +def run_ssgsea(params): + expMat = improve_utils.load_gene_expression_data(sep='\t') + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + expMat = expMat.loc[expMat.index.isin(rs_all['improve_sample_id']),] + gct 
= expMat.T # gene (rows) cell lines (columns) + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + gmt = pathway_path + tmp_str = params['data_dir'] + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea(data=gct, #gct: a matrix of gene by sample + gene_sets=gmt, #gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, #1000 + no_plot=True, + processes=params['cpu_int'], + #min_size=0, + format='png') + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str+'ssGSEA.txt', header=True, index=True, sep="\t") + + f = open(tmp_str+'ssGSEA.txt', 'r') + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params['exp_file'], header=True, index=True, sep="\t") + + +def candle_main(anl): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + if params['improve_analysis'] == 'yes' or anl: + download_anl_data(params) + print('convert drug to bits.') + smile2bits(params) + print('compute DGnet.') + run_netpea(params, dtype = 'DGnet', multiply_expression=False) + print('compute MUTnet.') + run_netpea(params, dtype = 'MUTnet', multiply_expression=True) + print('compute CNVnet.') + run_netpea(params, dtype = 'CNVnet', multiply_expression=True) + print('compute EXP.') + run_ssgsea(params) + print('prepare final input file.') + prep_input(params) + else: + download_author_data(params) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-a', dest='anl', default=False) + args = parser.parse_args() + start = datetime.now() + candle_main(args.anl) + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) diff --git a/train.py b/train.py new file mode 100644 index 0000000..28b97b1 --- /dev/null +++ b/train.py @@ -0,0 +1,73 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from preprocess_new import mkdir, preprocess +#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") +import FNN_new + +file_path = os.path.dirname(os.path.realpath(__file__)) +# This should be set outside as a user environment variable +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' +required = None +additional_definitions = None + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
+ ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + +# class CustomData: +# def __init__(self, name, value): +# self.name = name +# self.value = value + +# class CustomEncoder(json.JSONEncoder): +# def default(self, o): +# return o.__dict__ + + +# def run(params): +# params['data_type'] = str(params['data_type']) +# json_out = params['output_dir']+'/params.json' +# print(params) + +# with open (json_out, 'w') as fp: +# json.dump(params, fp, indent=4, cls=CustomEncoder) + +# scores = main(params) +# with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: +# json.dump(scores, f, ensure_ascii=False, indent=4) +# # print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) + + +def candle_main(): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + FNN_new.main(params) + + +if __name__ == "__main__": + candle_main() diff --git a/train.sh b/train.sh new file mode 100755 index 0000000..165eac6 --- /dev/null +++ b/train.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=train.py + +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! 
-f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi + +if [ $# -lt 2 ]; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + +if [ -d ${CANDLE_DATA_DIR} ]; then + if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then + echo "using data from ${CANDLE_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${CANDLE_DATA_DIR}" + fi +fi + +export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} +FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" +echo $FULL_DATA_DIR + +if [ -d ${FULL_DATA_DIR} ]; then + if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then + echo "using data from ${FULL_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" + fi +else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" +fi + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD From 6951b093e60aa74b753bcaf0c9dacc40f20e0db1 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 2 Nov 2023 15:08:09 -0700 Subject: [PATCH 064/254] process author data --- PathDSP_params.txt | 2 +- README.md | 18 ++++++++++++++++++ preprocess_new.py | 20 ++++++++++++++------ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index fbe7355..a1b289f 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -37,4 +37,4 @@ final_hiddens=6 epochs=800 optimizer = 'adam' loss = 'mse' -improve_analysis='yes' +improve_analysis='no' diff --git a/README.md b/README.md index 6c89fee..3e5df51 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,24 @@ # PathDSP Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores +# Example usage with singularity container +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + + +# Docs from original authors (below) + # Requirments # Input format diff --git a/preprocess_new.py b/preprocess_new.py index c9162e0..f21b7e4 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -31,8 +31,7 @@ import NetPEA as pea #import gsea module import gseapy as gp - - +import sklearn.model_selection as skms file_path = os.path.dirname(os.path.realpath(__file__)) @@ -156,11 +155,19 @@ def cal_time(end, start): def download_author_data(params): - data_download_filepath = candle.get_file(params['original_data'], 
params['original_data_url'] + '/' + params['original_data'], + data_download_filepath = candle.file_utils.get_file(params['original_data'], params['original_data_url'] + '/' + params['original_data'], datadir = params['data_dir'], cache_subdir = None) print('download_path: {}'.format(data_download_filepath)) - + random_seed = 42 + df = pd.read_csv(params['data_dir'] + "/input.txt", sep='\t') # Modify the separator if needed + df = df.set_index(['drug', 'cell']) + train_data, temp_data = skms.train_test_split(df, test_size=0.2, random_state=random_seed) + val_data, test_data = skms.train_test_split(temp_data, test_size=0.5, random_state=random_seed) + pl.from_pandas(train_data).write_csv(params['train_data'], separator = '\t', has_header = True) + pl.from_pandas(val_data).write_csv(params['val_data'], separator = '\t', has_header = True) + pl.from_pandas(test_data).write_csv(params['test_data'], separator = '\t', has_header = True) + def smile2bits(params): start = datetime.now() @@ -370,7 +377,7 @@ def candle_main(anl): params = initialize_parameters() data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' params = preprocess(params, data_dir) - if params['improve_analysis'] == 'yes' or anl: + if params['improve_analysis'] == 'yes' or anl == 1: download_anl_data(params) print('convert drug to bits.') smile2bits(params) @@ -390,7 +397,8 @@ def candle_main(anl): if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('-a', dest='anl', default=False) + parser.add_argument('anl', type=int, default=0, help='''whether to perform preprocessing using anl data or directly use processed + data from the original paper, default to 0 to use processed data from original paper''') args = parser.parse_args() start = datetime.now() candle_main(args.anl) From b5d8c57d3c2e8603dd045bb05aec4d5018174cd2 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 6 Nov 2023 08:09:19 -0800 Subject: [PATCH 065/254] fix args --- README.md | 15 +++++++++++++++ preprocess_new.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3e5df51..afb9642 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Setup Singularity git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git cd Singularity ./setup +source config/improve.env ``` Build Singularity from definition file @@ -16,6 +17,20 @@ Build Singularity from definition file singularity build --fakeroot PathDSP.sif definitions/PathDSP.def ``` +Perform preprocessing step using processed data from original paper + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir 0 +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +singularity exec --nv ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir 1 +``` + + + # Docs from original authors (below) diff --git a/preprocess_new.py b/preprocess_new.py index f21b7e4..336cdb8 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -397,7 +397,7 @@ def candle_main(anl): if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('anl', type=int, default=0, help='''whether to perform preprocessing using anl data or directly use processed + parser.add_argument('-a', dest='anl', type=int, default=0, help='''whether to perform preprocessing using anl data or directly use processed data from the original paper, default to 0 to 
use processed data from original paper''') args = parser.parse_args() start = datetime.now() From 8ece3ff6636cc641b009d16b9ed5d844369ca0f2 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 6 Nov 2023 09:18:17 -0800 Subject: [PATCH 066/254] add infer.sh --- README.md | 21 ++++++++++++++++-- infer.py | 10 ++++----- infer.sh | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++ preprocess.sh | 0 test.sh | 7 ------ 5 files changed, 84 insertions(+), 14 deletions(-) create mode 100755 infer.sh mode change 100644 => 100755 preprocess.sh delete mode 100755 test.sh diff --git a/README.md b/README.md index afb9642..4e463c9 100644 --- a/README.md +++ b/README.md @@ -20,15 +20,32 @@ singularity build --fakeroot PathDSP.sif definitions/PathDSP.def Perform preprocessing step using processed data from original paper ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir 0 +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir "-a 0" ``` Alternatively, perform preprocessing step using raw data from IMPROVE project ``` -singularity exec --nv ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir 1 +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir "-a 1" ``` +Train the model + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/train.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/infer.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` diff --git a/infer.py b/infer.py index babd421..31a88a8 100755 --- a/infer.py +++ b/infer.py @@ -47,16 +47,16 @@ def initialize_parameters(): def run(params): - trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) - trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) - trained_net.eval() test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() - FNN_new.myutil.set_seed(params["seed_int"]) - device = FNN_new.myutil.get_device(uth=params["gpu_int"]) Xtest_arr = test_df.iloc[:, 0:-1].values ytest_arr = test_df.iloc[:, -1].values Xtest_arr = np.array(Xtest_arr).astype('float32') ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) + trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) + trained_net.eval() + FNN_new.myutil.set_seed(params["seed_int"]) + device = FNN_new.myutil.get_device(uth=params["gpu_int"]) test_dataset = FNN_new.mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False) start = datetime.now() diff --git a/infer.sh b/infer.sh new file mode 100755 index 0000000..19146ae --- /dev/null +++ b/infer.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +#!/bin/bash + +######################################################################### +### THIS IS A TEMPLATE FILE. 
SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. +######################################################################### + + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=infer.py + +if [ $# -lt 2 ] ; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + + + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD diff --git a/preprocess.sh b/preprocess.sh old mode 100644 new mode 100755 diff --git a/test.sh b/test.sh deleted file mode 100755 index f854b2b..0000000 --- a/test.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -#The point of this is to test if the thing works at all - -python get_test_data.py -python ./PathDSP/FNN.py -i tmp/common/input_txt_Nick.txt -o ./output_prefix - From 92b75189553d6884b0d406c10c033bb0f180bd83 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 6 Nov 2023 11:53:43 -0800 Subject: [PATCH 067/254] update doc --- PathDSP.def | 2 +- README.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++ improve_utils.py | 2 +- preprocess.sh | 2 +- preprocess_new.py | 2 -- 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/PathDSP.def b/PathDSP.def index ae656c7..8ed6d86 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -42,7 +42,7 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local - git clone -b develop https://github.com/Liuy12/PathDSP.git + git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git cd PathDSP # download conda diff --git a/README.md b/README.md index 4e463c9..d1ffbd9 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,64 @@ singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /u Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` +# Example usage with Conda +Download PathDSP + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +cd PathDSP +``` + +Create environment + +``` +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Intall CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Perform preprocessing step using processed data from original paper + +``` +export CUDA_VISIBLE_DEVICES=0 +export CANDLE_DATA_DIR=./Data/ +bash preprocess.sh 
$CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1" +``` + +Train the model + +``` +bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` # Docs from original authors (below) diff --git a/improve_utils.py b/improve_utils.py index 9a7676b..1956ee1 100644 --- a/improve_utils.py +++ b/improve_utils.py @@ -35,7 +35,7 @@ # TODO: # This is CANDLE_DATA_DIR (or something...). # How this is going to be passed to the code? -improve_globals.main_data_dir = PosixPath("/candle_data_dir/csa_data/") +improve_globals.main_data_dir = PosixPath(os.environ.get("CANDLE_DATA_DIR") + "/csa_data/") # improve_globals.main_data_dir = fdir/"improve_data_dir" # imp_globals.main_data_dir = fdir/"candle_data_dir" diff --git a/preprocess.sh b/preprocess.sh index a20dd1d..a7a435d 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -11,7 +11,7 @@ ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=/usr/local/PathDSP/preprocess_new.py +CANDLE_MODEL=preprocess_new.py if [ $# -lt 2 ] ; then echo "Illegal number of parameters" diff --git a/preprocess_new.py b/preprocess_new.py index 336cdb8..cec9e6f 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -1,5 +1,3 @@ -#!/homes/ac.rgnanaolivu/miniconda3/envs/rohan_python/bin/python - import sys import os import numpy as np From ba8eb0e43c02f00c55ce31192250a2c2d7ab9186 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 6 Nov 2023 13:52:26 -0800 Subject: [PATCH 068/254] fix path --- PathDSP/FNN_new.py | 6 +++--- README.md | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index 00c8879..0f84e5d 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -10,7 +10,7 @@ Calculate RMSE at once, Oct. 
3, 2020 revised """ - +import os import argparse import numpy as np import pandas as pd @@ -84,7 +84,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): trainloss_list = [] # metrics: MSE, size equals to EPOCH validloss_list = [] # metrics: MSE, size equals to EPOCH validr2_list = [] # metrics: r2, size equals to EPOCH - early_stopping = myutil.EarlyStopping(patience=30, verbose=True) # initialize the early_stopping + early_stopping = myutil.EarlyStopping(patience=30, verbose=True, path= os.environ.get("CANDLE_DATA_DIR") + "/Data/checkpoint.pt") # initialize the early_stopping # repeat the training for EPOCH times start_total = datetime.now() for epoch in range(epochs): @@ -143,7 +143,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) # load the last checkpoint with the best model - net.load_state_dict(tch.load('checkpoint.pt')) + net.load_state_dict(tch.load(os.environ.get('CANDLE_DATA_DIR') + '/Data/checkpoint.pt')) return net, trainloss_list, validloss_list, validr2_list diff --git a/README.md b/README.md index d1ffbd9..d8905e7 100644 --- a/README.md +++ b/README.md @@ -20,19 +20,19 @@ singularity build --fakeroot PathDSP.sif definitions/PathDSP.def Perform preprocessing step using processed data from original paper ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir "-a 0" +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0" ``` Alternatively, perform preprocessing step using raw data from IMPROVE project ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/preprocess.sh 0 /candle_data_dir "-a 1" +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1" ``` Train the model ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/train.sh 0 /candle_data_dir +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir ``` Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` @@ -41,7 +41,7 @@ Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` Perform inference on the testing data ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif /usr/local/PathDSP/infer.sh 0 /candle_data_dir +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir ``` Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` From 91b5ecb30eaa93d889f8387025ebb8a104f3df4a Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 6 Nov 2023 14:44:12 -0800 Subject: [PATCH 069/254] fix conda --- infer.py | 1 + train.py | 1 + 2 files changed, 2 insertions(+) diff --git a/infer.py b/infer.py index 31a88a8..639c121 100755 --- a/infer.py +++ b/infer.py @@ -13,6 +13,7 @@ import sklearn.metrics as skmts #sys.path.append("/usr/local/PathDSP/PathDSP") sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() + "/PathDSP") import FNN_new diff --git a/train.py b/train.py index 28b97b1..fb18f9d 100644 --- a/train.py +++ b/train.py @@ -6,6 +6,7 @@ from preprocess_new 
import mkdir, preprocess #sys.path.append("/usr/local/PathDSP/PathDSP") sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() + "/PathDSP") import FNN_new file_path = os.path.dirname(os.path.realpath(__file__)) From cd7303f2e9fa2cfe222a9ba8e1e021edcfcf5aed Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Mon, 6 Nov 2023 16:47:22 -0600 Subject: [PATCH 070/254] add documentation for container and conda usage (#6) * update preprocess script * update preprocess script * add improve_utils script * add nea scripts * update params * add gitignore * EXP processing * updated to integrate with prep_input * add definition file * update .gitignore * update filename for ssGSEA * add FNN_new * add train/infer * update params * add .yml * update params * update conda path * fix conda * update preprocess.sh * update preprocess.sh * update preprocess_new.py * update env * update preproce_new.py * update preproce_new.py * update files * update params * fix params * update preproce_new.py * update preprocess_new.py * update preprocess_new.py * update file * update file * update file * update script * add def * add script * update file * update FNN_new * update FNN * update params * fix param * fix bug * add time * update def * update yml * update train.sh * update train.sh * update train.py * update train * fix bug * update file * update file * use polars * update files * update preprocess * update infer.sh * process author data * fix args * add infer.sh * update doc * fix path * fix conda --------- Co-authored-by: willherbert27 --- PathDSP.def | 2 +- PathDSP/FNN_new.py | 6 +-- PathDSP_params.txt | 2 +- README.md | 107 +++++++++++++++++++++++++++++++++++++++++++++ improve_utils.py | 2 +- infer.py | 11 ++--- infer.sh | 60 +++++++++++++++++++++++++ preprocess.sh | 2 +- preprocess_new.py | 22 ++++++---- test.sh | 7 --- train.py | 1 + 11 files changed, 195 insertions(+), 27 deletions(-) create mode 100755 infer.sh mode change 100644 => 100755 preprocess.sh delete mode 100755 test.sh diff --git a/PathDSP.def b/PathDSP.def index ae656c7..8ed6d86 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -42,7 +42,7 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local - git clone -b develop https://github.com/Liuy12/PathDSP.git + git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git cd PathDSP # download conda diff --git a/PathDSP/FNN_new.py b/PathDSP/FNN_new.py index 00c8879..0f84e5d 100644 --- a/PathDSP/FNN_new.py +++ b/PathDSP/FNN_new.py @@ -10,7 +10,7 @@ Calculate RMSE at once, Oct. 
3, 2020 revised """ - +import os import argparse import numpy as np import pandas as pd @@ -84,7 +84,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): trainloss_list = [] # metrics: MSE, size equals to EPOCH validloss_list = [] # metrics: MSE, size equals to EPOCH validr2_list = [] # metrics: r2, size equals to EPOCH - early_stopping = myutil.EarlyStopping(patience=30, verbose=True) # initialize the early_stopping + early_stopping = myutil.EarlyStopping(patience=30, verbose=True, path= os.environ.get("CANDLE_DATA_DIR") + "/Data/checkpoint.pt") # initialize the early_stopping # repeat the training for EPOCH times start_total = datetime.now() for epoch in range(epochs): @@ -143,7 +143,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) # load the last checkpoint with the best model - net.load_state_dict(tch.load('checkpoint.pt')) + net.load_state_dict(tch.load(os.environ.get('CANDLE_DATA_DIR') + '/Data/checkpoint.pt')) return net, trainloss_list, validloss_list, validr2_list diff --git a/PathDSP_params.txt b/PathDSP_params.txt index fbe7355..a1b289f 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -37,4 +37,4 @@ final_hiddens=6 epochs=800 optimizer = 'adam' loss = 'mse' -improve_analysis='yes' +improve_analysis='no' diff --git a/README.md b/README.md index 6c89fee..d8905e7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,113 @@ # PathDSP Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores +# Example usage with singularity container +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing step using processed data from original paper + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1" +``` + +Train the model + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` + +# Example usage with Conda + +Download PathDSP + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +cd PathDSP +``` + +Create environment + +``` +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Intall CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Perform preprocessing step using processed data from original 
paper + +``` +export CUDA_VISIBLE_DEVICES=0 +export CANDLE_DATA_DIR=./Data/ +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1" +``` + +Train the model + +``` +bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` + +# Docs from original authors (below) + # Requirments # Input format diff --git a/improve_utils.py b/improve_utils.py index 9a7676b..1956ee1 100644 --- a/improve_utils.py +++ b/improve_utils.py @@ -35,7 +35,7 @@ # TODO: # This is CANDLE_DATA_DIR (or something...). # How this is going to be passed to the code? -improve_globals.main_data_dir = PosixPath("/candle_data_dir/csa_data/") +improve_globals.main_data_dir = PosixPath(os.environ.get("CANDLE_DATA_DIR") + "/csa_data/") # improve_globals.main_data_dir = fdir/"improve_data_dir" # imp_globals.main_data_dir = fdir/"candle_data_dir" diff --git a/infer.py b/infer.py index babd421..639c121 100755 --- a/infer.py +++ b/infer.py @@ -13,6 +13,7 @@ import sklearn.metrics as skmts #sys.path.append("/usr/local/PathDSP/PathDSP") sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() + "/PathDSP") import FNN_new @@ -47,16 +48,16 @@ def initialize_parameters(): def run(params): - trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) - trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) - trained_net.eval() test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() - FNN_new.myutil.set_seed(params["seed_int"]) - device = FNN_new.myutil.get_device(uth=params["gpu_int"]) Xtest_arr = test_df.iloc[:, 0:-1].values ytest_arr = test_df.iloc[:, -1].values Xtest_arr = np.array(Xtest_arr).astype('float32') ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) + trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) + trained_net.eval() + FNN_new.myutil.set_seed(params["seed_int"]) + device = FNN_new.myutil.get_device(uth=params["gpu_int"]) test_dataset = FNN_new.mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False) start = datetime.now() diff --git a/infer.sh b/infer.sh new file mode 100755 index 0000000..19146ae --- /dev/null +++ b/infer.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +#!/bin/bash + +######################################################################### +### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. 
+######################################################################### + + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=infer.py + +if [ $# -lt 2 ] ; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + + + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD diff --git a/preprocess.sh b/preprocess.sh old mode 100644 new mode 100755 index a20dd1d..a7a435d --- a/preprocess.sh +++ b/preprocess.sh @@ -11,7 +11,7 @@ ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=/usr/local/PathDSP/preprocess_new.py +CANDLE_MODEL=preprocess_new.py if [ $# -lt 2 ] ; then echo "Illegal number of parameters" diff --git a/preprocess_new.py b/preprocess_new.py index c9162e0..cec9e6f 100644 --- a/preprocess_new.py +++ b/preprocess_new.py @@ -1,5 +1,3 @@ -#!/homes/ac.rgnanaolivu/miniconda3/envs/rohan_python/bin/python - import sys import os import numpy as np @@ -31,8 +29,7 @@ import NetPEA as pea #import gsea module import gseapy as gp - - +import sklearn.model_selection as skms file_path = os.path.dirname(os.path.realpath(__file__)) @@ -156,11 +153,19 @@ def cal_time(end, start): def download_author_data(params): - data_download_filepath = candle.get_file(params['original_data'], params['original_data_url'] + '/' + params['original_data'], + data_download_filepath = candle.file_utils.get_file(params['original_data'], params['original_data_url'] + '/' + params['original_data'], datadir = params['data_dir'], cache_subdir = None) print('download_path: {}'.format(data_download_filepath)) - + random_seed = 42 + df = pd.read_csv(params['data_dir'] + "/input.txt", sep='\t') # Modify the separator if needed + df = df.set_index(['drug', 'cell']) + train_data, temp_data = skms.train_test_split(df, test_size=0.2, random_state=random_seed) + val_data, test_data = skms.train_test_split(temp_data, test_size=0.5, random_state=random_seed) + pl.from_pandas(train_data).write_csv(params['train_data'], separator = '\t', has_header = True) + pl.from_pandas(val_data).write_csv(params['val_data'], separator = '\t', has_header = True) + pl.from_pandas(test_data).write_csv(params['test_data'], separator = '\t', has_header = True) + def smile2bits(params): start = datetime.now() @@ -370,7 +375,7 @@ def candle_main(anl): params = initialize_parameters() data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' params = preprocess(params, data_dir) - if 
params['improve_analysis'] == 'yes' or anl: + if params['improve_analysis'] == 'yes' or anl == 1: download_anl_data(params) print('convert drug to bits.') smile2bits(params) @@ -390,7 +395,8 @@ def candle_main(anl): if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('-a', dest='anl', default=False) + parser.add_argument('-a', dest='anl', type=int, default=0, help='''whether to perform preprocessing using anl data or directly use processed + data from the original paper, default to 0 to use processed data from original paper''') args = parser.parse_args() start = datetime.now() candle_main(args.anl) diff --git a/test.sh b/test.sh deleted file mode 100755 index f854b2b..0000000 --- a/test.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -#The point of this is to test if the thing works at all - -python get_test_data.py -python ./PathDSP/FNN.py -i tmp/common/input_txt_Nick.txt -o ./output_prefix - diff --git a/train.py b/train.py index 28b97b1..fb18f9d 100644 --- a/train.py +++ b/train.py @@ -6,6 +6,7 @@ from preprocess_new import mkdir, preprocess #sys.path.append("/usr/local/PathDSP/PathDSP") sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() + "/PathDSP") import FNN_new file_path = os.path.dirname(os.path.realpath(__file__)) From 2ea4a3d28345067096f4c14b3592ee4be9209ac5 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 13 Nov 2023 13:47:44 -0800 Subject: [PATCH 071/254] use improve repo --- PathDSP.def | 2 +- PathDSP_params.txt | 3 + preprocess.sh | 2 +- preprocess_improve.py | 677 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 682 insertions(+), 2 deletions(-) create mode 100644 preprocess_improve.py diff --git a/PathDSP.def b/PathDSP.def index 8ed6d86..b0eb7e6 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -44,7 +44,7 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime cd /usr/local git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git cd PathDSP - + git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git # download conda /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ diff --git a/PathDSP_params.txt b/PathDSP_params.txt index a1b289f..f0b2857 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -8,6 +8,9 @@ original_data='input.zip' gene_set = 'MSigdb.zip' ppi_data = 'STRING.zip' drug_target = 'raw_data.zip' +train_split_file = "CCLE_split_0_train.txt" +val_split_file = "CCLE_split_0_val.txt" +test_split_file = "CCLE_split_0_test.txt" train_data = 'PathDSP_train.txt' test_data = 'PathDSP_test.txt' val_data = 'PathDSP_val.txt' diff --git a/preprocess.sh b/preprocess.sh index a7a435d..ab0de90 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -11,7 +11,7 @@ ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=preprocess_new.py +CANDLE_MODEL=preprocess_improve.py if [ $# -lt 2 ] ; then echo "Illegal number of parameters" diff --git a/preprocess_improve.py b/preprocess_improve.py new file mode 100644 index 0000000..fa6d6d0 --- /dev/null +++ b/preprocess_improve.py @@ -0,0 +1,677 @@ +import sys +import os +import numpy as np +import polars as pl + +# import torch +# import torch.utils.data as du +# from torch.autograd import Variable +# import torch.nn as nn +# import torch.nn.functional as F +# from code.drugcell_NN import * +import argparse +import numpy as np +import pandas as pd +import candle + +# import time +# import logging +# import networkx as nx +# import 
networkx.algorithms.components.connected as nxacc +# import networkx.algorithms.dag as nxadag +# from pathlib import Path +from functools import reduce + +# import improve_utils +# swtich from improve_utils to improve repo +sys.path.append("./IMPROVE/") +from improve import drug_resp_pred as drp +from pathlib import Path + +# import RDKit +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime + +# import NetPEA modules +import RWR as rwr +import NetPEA as pea + +# import gsea module +import gseapy as gp +import sklearn.model_selection as skms + + +file_path = os.path.dirname(os.path.realpath(__file__)) +# fdir = Path('__file__').resolve().parent +# source = 'csa_data/raw_data/splits/' +required = None +additional_definitions = None + +# This should be set outside as a user environment variable +# os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' + + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + """ + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. + """ + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle( + file_path, + "PathDSP_params.txt", + "pytorch", + prog="PathDSP_candle", + desc="Data Preprocessor", + ) + # Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def mkdir(directory): + directories = directory.split("/") + + folder = "" + for d in directories: + folder += d + "/" + if not os.path.exists(folder): + print("creating folder: %s" % folder) + os.mkdir(folder) + + +def preprocess(params, data_dir): + print(os.environ["CANDLE_DATA_DIR"]) + # requirements go here + # keys_parsing = ['output_dir', 'hidden', 'result', 'metric', 'data_type'] + if not os.path.exists(data_dir): + mkdir(data_dir) + params["data_dir"] = data_dir + # args = candle.ArgumentStruct(**params) + for i in [ + "train_data", + "test_data", + "val_data", + "drug_bits_file", + "dgnet_file", + "mutnet_file", + "cnvnet_file", + "exp_file", + ]: + params[i] = params["data_dir"] + "/" + params[i] + params["x_data_path"] = ( + os.environ["CANDLE_DATA_DIR"] + "/csa_data/raw_data/x_data/" + ) + return params + + +def download_anl_data(params): + csa_data_folder = os.path.join( + os.environ["CANDLE_DATA_DIR"], "csa_data", "raw_data" + ) + splits_dir = os.path.join(csa_data_folder, "splits") + x_data_dir = os.path.join(csa_data_folder, "x_data") + y_data_dir = os.path.join(csa_data_folder, "y_data") + + if not os.path.exists(csa_data_folder): + print("creating folder: %s" % csa_data_folder) + os.makedirs(csa_data_folder) + mkdir(splits_dir) + mkdir(x_data_dir) + mkdir(y_data_dir) + + for improve_file in [ + "CCLE_all.txt", + "CCLE_split_" + str(params["split"]) + "_test.txt", + "CCLE_split_" + str(params["split"]) + "_train.txt", + "CCLE_split_" + str(params["split"]) + "_val.txt", + "CTRPv2_all.txt", + "CTRPv2_split_" + str(params["split"]) + "_test.txt", + "CTRPv2_split_" + str(params["split"]) + "_train.txt", + "CTRPv2_split_" + str(params["split"]) + "_val.txt", + "gCSI_all.txt", + "GDSCv1_all.txt", + "GDSCv2_all.txt", + ]: + url_dir = params["improve_data_url"] + "/splits/" + candle.file_utils.get_file( + improve_file, 
url_dir + improve_file, datadir=splits_dir, cache_subdir=None + ) + + for improve_file in [ + "cancer_mutation_count.tsv", + "drug_SMILES.tsv", + "drug_info.tsv", + "cancer_discretized_copy_number.tsv", + "cancer_gene_expression.tsv", + ]: + url_dir = params["improve_data_url"] + "/x_data/" + candle.file_utils.get_file( + fname=improve_file, + origin=url_dir + improve_file, + datadir=x_data_dir, + cache_subdir=None, + ) + + url_dir = params["improve_data_url"] + "/y_data/" + response_file = "response.tsv" + candle.file_utils.get_file( + fname=response_file, + origin=url_dir + response_file, + datadir=y_data_dir, + cache_subdir=None, + ) + + ## get gene-set data and string data + for db_file in [params["gene_set"], params["ppi_data"], params["drug_target"]]: + candle.file_utils.get_file( + db_file, + params["data_url"] + "/" + db_file, + datadir=params["data_dir"], + cache_subdir=None, + ) + + +# set timer +def cal_time(end, start): + """return time spent""" + # end = datetime.now(), start = datetime.now() + datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" + spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( + str(start), datetimeFormat + ) + return spend + + +def download_author_data(params): + data_download_filepath = candle.file_utils.get_file( + params["original_data"], + params["original_data_url"] + "/" + params["original_data"], + datadir=params["data_dir"], + cache_subdir=None, + ) + print("download_path: {}".format(data_download_filepath)) + random_seed = 42 + df = pd.read_csv( + params["data_dir"] + "/input.txt", sep="\t" + ) # Modify the separator if needed + df = df.set_index(["drug", "cell"]) + train_data, temp_data = skms.train_test_split( + df, test_size=0.2, random_state=random_seed + ) + val_data, test_data = skms.train_test_split( + temp_data, test_size=0.5, random_state=random_seed + ) + pl.from_pandas(train_data).write_csv( + params["train_data"], separator="\t", has_header=True + ) + pl.from_pandas(val_data).write_csv( + params["val_data"], separator="\t", has_header=True + ) + pl.from_pandas(test_data).write_csv( + params["test_data"], separator="\t", has_header=True + ) + + +def load_smiles_data(fname: str, sep: str = "\t", verbose: bool = True) -> pd.DataFrame: + """ + IMPROVE-specific func. + Read smiles data. 
+ src_raw_data_dir : data dir where the raw DRP data is stored + """ + df = pd.read_csv(fname, sep=sep) + + # TODO: updated this after we update the data + df.columns = ["improve_chem_id", "smiles"] + + if verbose: + print(f"SMILES data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def smile2bits(params): + start = datetime.now() + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [ + drp.load_response_data( + y_data_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" + ), + split_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] + ), + ) + for file in ["train_split_file", "test_split_file", "val_split_file"] + ] + response_df = pd.concat(response_df, ignore_index=True) + # smile_df = improve_utils.load_smiles_data() + # params['x_data_path'] = params['CANDLE_DATA_DIR'] + '/x_data/' + # dd = drp.DrugsLoader(params) + smile_df = load_smiles_data( + fname=params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/x_data/drug_SMILES.tsv" + ) + smile_df.columns = ["drug", "smile"] + smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") + smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] + bit_int = params["bit_int"] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row["smile"] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) + # drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append(tuple([drug] + mbit)) + if len(mbit) == bit_int: + n_drug += 1 + print("total {:} drugs with bits".format(n_drug)) + # convert dict to dataframe + colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + # drug_mbit_df.index.name = 'drug' + print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) + # save to file + drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") + print("[Finished in {:}]".format(cal_time(datetime.now(), start))) + + +def times_expression(rwr, exp): + """ + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + """ + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) + + if len(cell_list) == 0: + print("ERROR! no overlapping cell lines") + sys.exit(1) + if len(gene_list) == 0: + print("ERROR! 
no overlapping genes") + sys.exit(1) + + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] + + # concat with other gene + out_gene_list = list(set(rwr.columns) - set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params["data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" + pathway_path = ( + params["data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + log_transform = False + permutation_int = params["permutation_int"] + seed_int = params["seed_int"] + cpu_int = params["cpu_int"] + csa_data_folder = os.path.join( + os.environ["CANDLE_DATA_DIR"], "csa_data", "raw_data" + ) + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [ + drp.load_response_data( + y_data_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" + ), + split_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] + ), + ) + for file in ["train_split_file", "test_split_file", "val_split_file"] + ] + response_df = pd.concat(response_df, ignore_index=True) + if dtype == "DGnet": + drug_info = pd.read_csv(csa_data_folder + "/x_data/drug_info.tsv", sep="\t") + drug_info["NAME"] = drug_info["NAME"].str.upper() + target_info = pd.read_csv( + params["data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" + ) + target_info = target_info.rename(columns={"drug": "NAME"}) + combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( + subset=["gene"] + ) + combined_df = combined_df.loc[ + combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), + ] + restart_path = params["data_dir"] + "/drug_target.txt" + combined_df.iloc[:, -2:].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["dgnet_file"] + elif dtype == "MUTnet": + # mutation_data = improve_utils.load_mutation_count_data(gene_system_identifier='Gene_Symbol') + mutation_data = drp.load_omics_data( + params, + omics_type="mutation_count", + canc_col_name="improve_sample_id", + gene_system_identifier="Gene_Symbol", + ) + mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] > 0 + ] + mutation_data = mutation_data.loc[ + mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["data_dir"] + "/mutation_data.txt" + mutation_data.iloc[:, 0:2].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["mutnet_file"] + else: + # cnv_data = improve_utils.load_discretized_copy_number_data(gene_system_identifier='Gene_Symbol') + cnv_data = drp.load_omics_data( + params, + omics_type="discretized_copy_number", + canc_col_name="improve_sample_id", + gene_system_identifier="Gene_Symbol", + ) + cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] != 0 + ] + cnv_data = cnv_data.loc[ + cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["data_dir"] + "/cnv_data.txt" + cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) + outpath = params["cnvnet_file"] + # perform Random Walk + 
print(datetime.now(), "performing random walk with restart") + rwr_df = rwr.RWR( + ppi_path, + restart_path, + restartProbFloat=0.5, + convergenceFloat=0.00001, + normalize="l1", + weighted=True, + ).get_prob() + # multiply with gene expression + if multiply_expression: + print( + datetime.now(), + "multiplying gene expression with random walk probability for genes were expressed", + ) + # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + exp_df = drp.load_omics_data( + params, + omics_type="gene_expression", + canc_col_name="improve_sample_id", + gene_system_identifier="Gene_Symbol", + ) + rwr_df = times_expression(rwr_df, exp_df) + # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), "performing network-based pathway enrichment") + cell_pathway_df = pea.NetPEA( + rwr_df, + pathway_path, + log_transform=log_transform, + permutation=permutation_int, + seed=seed_int, + n_cpu=cpu_int, + out_path=outpath, + ) + print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) + + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) + DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) + DGnet = ( + DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) + ) + CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) + CNVnet = ( + CNVnet.add_suffix("_cnvnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) + MUTnet = ( + MUTnet.add_suffix("_mutnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) + EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + # split_type=['train', 'test', 'val'], + # y_col_name= params['metric']) + response_df = [ + drp.load_response_data( + y_data_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" + ), + split_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] + ), + ) + for file in ["train_split_file", "test_split_file", "val_split_file"] + ] + response_df = pd.concat(response_df, ignore_index=True) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + # Extract relevant IDs + + common_drug_ids = reduce( + np.intersect1d, + (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), + ) + common_sample_ids = reduce( + np.intersect1d, + ( + CNVnet["sample_id"], + MUTnet["sample_id"], + EXP["sample_id"], + response_df["sample_id"], + ), + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + drug_mbit_df = ( + drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + DGnet = ( + DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + CNVnet = ( + CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + MUTnet = ( + MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] + 
.set_index("sample_id") + .sort_index() + ) + EXP = ( + EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + # for i in ['train', 'test', 'val']: + for i in ["train_split_file", "test_split_file", "val_split_file"]: + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + # split_type=i, + # y_col_name= params['metric']) + response_df = drp.load_response_data( + y_data_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" + ), + split_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[i] + ), + ) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + comb_data_mtx = pd.DataFrame( + { + "drug_id": response_df["drug_id"].values, + "sample_id": response_df["sample_id"].values, + } + ) + comb_data_mtx = ( + comb_data_mtx.set_index(["drug_id", "sample_id"]) + .join(drug_data, on="drug_id") + .join(sample_data, on="sample_id") + ) + comb_data_mtx["response"] = response_df[params["metric"]].values + comb_data_mtx = comb_data_mtx.dropna() + pl.from_pandas(comb_data_mtx).write_csv( + params[i + "_data"], separator="\t", has_header=True + ) + + +def run_ssgsea(params): + # expMat = improve_utils.load_gene_expression_data(sep='\t') + expMat = drp.load_omics_data( + params, + omics_type="gene_expression", + canc_col_name="improve_sample_id", + gene_system_identifier="Gene_Symbol", + ) + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [ + drp.load_response_data( + y_data_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" + ), + split_fpath=Path( + params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] + ), + ) + for file in ["train_split_file", "test_split_file", "val_split_file"] + ] + response_df = pd.concat(response_df, ignore_index=True) + expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = ( + params["data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + gmt = pathway_path + tmp_str = params["data_dir"] + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea( + data=gct, # gct: a matrix of gene by sample + gene_sets=gmt, # gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, # 1000 + no_plot=True, + processes=params["cpu_int"], + # min_size=0, + format="png", + ) + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") + + f = open(tmp_str + "ssGSEA.txt", "r") + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") + + +def candle_main(anl): + 
params = initialize_parameters() + data_dir = os.environ["CANDLE_DATA_DIR"] + "/" + "/Data/" + params = preprocess(params, data_dir) + if params["improve_analysis"] == "yes" or anl == 1: + download_anl_data(params) + print("convert drug to bits.") + smile2bits(params) + print("compute DGnet.") + run_netpea(params, dtype="DGnet", multiply_expression=False) + print("compute MUTnet.") + run_netpea(params, dtype="MUTnet", multiply_expression=True) + print("compute CNVnet.") + run_netpea(params, dtype="CNVnet", multiply_expression=True) + print("compute EXP.") + run_ssgsea(params) + print("prepare final input file.") + prep_input(params) + else: + download_author_data(params) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "-a", + dest="anl", + type=int, + default=0, + help="""whether to perform preprocessing using anl data or directly use processed + data from the original paper, default to 0 to use processed data from original paper""", + ) + args = parser.parse_args() + start = datetime.now() + candle_main(args.anl) + print("[Finished in {:}]".format(cal_time(datetime.now(), start))) From 8ee16b9228f47f14f3fecaf5c2f89c81d84f4dcd Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 6 Dec 2023 08:13:55 -0800 Subject: [PATCH 072/254] use improve module --- PathDSP.def | 3 +- PathDSP_default_model.txt | 43 +++ PathDSP_infer_improve.py | 88 +++++ PathDSP_params.txt | 16 +- PathDSP_preprocess_improve.py | 518 ++++++++++++++++++++++++++ PathDSP_train_improve.py | 310 ++++++++++++++++ README.md | 94 +++-- README_old.md | 134 +++++++ preprocess_improve.py | 677 ---------------------------------- 9 files changed, 1151 insertions(+), 732 deletions(-) create mode 100644 PathDSP_default_model.txt create mode 100755 PathDSP_infer_improve.py create mode 100644 PathDSP_preprocess_improve.py create mode 100644 PathDSP_train_improve.py create mode 100644 README_old.md delete mode 100644 preprocess_improve.py diff --git a/PathDSP.def b/PathDSP.def index b0eb7e6..61e45c2 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -43,8 +43,9 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git - cd PathDSP git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git + export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/ + cd PathDSP # download conda /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt new file mode 100644 index 0000000..5951ee3 --- /dev/null +++ b/PathDSP_default_model.txt @@ -0,0 +1,43 @@ +[Global_Params] +model_name='PathDSP' + +[Preprocess] +train_split_file = "gCSI_split_0_train.txt" +val_split_file = "gCSI_split_0_val.txt" +test_split_file = "gCSI_split_0_test.txt" +ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] +data_format = ".txt" +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +bit_int=128 +permutation_int=3 +seed_int=42 +cpu_int=20 + +[Train] +train_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" +val_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" +model_outdir = 
"./out_models/gCSI/split_0" +model_file_name = "model" +model_file_format = ".pt" +epochs=800 +batch_size = 32 +val_batch = 32 +loss = "mse" +early_stop_metric = "mse" +patience = 30 +cuda_name = "cuda:2" +learning_rate = 0.001 + +[Infer] +test_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" +model_dir = "./out_models/gCSI/split_0" +infer_outdir = "./out_infer/gCSI-gCSI/split_0" +test_batch = 256 +cuda_name = "cuda:3" \ No newline at end of file diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py new file mode 100755 index 0000000..b91c8fa --- /dev/null +++ b/PathDSP_infer_improve.py @@ -0,0 +1,88 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from PathDSP_preprocess_improve import mkdir, preprocess +from PathDSP_train_improve import predicting +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import sklearn.metrics as skmts +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append(os.getcwd() + "/PathDSP") +import myModel as mynet +import myDataloader as mydl +import myDatasplit as mysplit +import myUtility as myutil + +from improve import framework as frm +# from improve.torch_utils import TestbedDataset +from improve.metrics import compute_metrics + +from PathDSP_train_improve import ( + preprocess, + cal_time, + metrics_list, + model_preproc_params, + model_train_params, +) + +file_path = os.path.dirname(os.path.realpath(__file__)) + +# [Req] App-specific params +app_infer_params = [] + +# [PathDSP] Model-specific params (Model: PathDSP) +model_infer_params = [] + +def run(params): + frm.create_outdir(outdir=params["infer_outdir"]) + params = preprocess(params) + test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = mynet.FNN(Xtest_arr.shape[1]) + modelpath = frm.build_model_path(params, model_dir=params["model_dir"]) + trained_net.load_state_dict(tch.load(modelpath)) + trained_net.eval() + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) + start = datetime.now() + test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + frm.store_predictions_df( + params, y_true=test_true, y_pred=test_pred, stage="test", + outdir=params["infer_outdir"] + ) + test_scores = frm.compute_performace_scores( + params, y_true=test_true, y_pred=test_pred, stage="test", + outdir=params["infer_outdir"], metrics=metrics_list + ) + print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + return test_scores + +def main(): + additional_definitions = model_preproc_params + \ + model_train_params + \ + model_infer_params + \ + app_infer_params + params = frm.initialize_parameters( + file_path, + default_model="PathDSP_default_model.txt", + additional_definitions=additional_definitions, + required=None, + ) + test_scores = run(params) + print("\nFinished inference of PathDSP model.") + + +if __name__ == "__main__": + main() diff --git a/PathDSP_params.txt b/PathDSP_params.txt index f0b2857..12de5d5 100644 --- 
a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,6 +1,15 @@ [Global_Params] - model_name='PathDSP' + +[Preprocess] +train_split_file = "gCSI_split_0_train.txt" +val_split_file = "gCSI_split_0_val.txt" +test_split_file = "gCSI_split_0_test.txt" +ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] + data_url='https://zenodo.org/record/6093818/files/' improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/raw_data/' original_data_url='https://zenodo.org/record/7532963/files/' @@ -8,9 +17,7 @@ original_data='input.zip' gene_set = 'MSigdb.zip' ppi_data = 'STRING.zip' drug_target = 'raw_data.zip' -train_split_file = "CCLE_split_0_train.txt" -val_split_file = "CCLE_split_0_val.txt" -test_split_file = "CCLE_split_0_test.txt" +raw_data_dir = "raw_data" train_data = 'PathDSP_train.txt' test_data = 'PathDSP_test.txt' val_data = 'PathDSP_val.txt' @@ -22,6 +29,7 @@ exp_file='EXP.txt' #output='Result/' bit_int=128 permutation_int=3 +y_col_name = 'auc' metric='auc' data_type='CTRPv2' split=0 diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py new file mode 100644 index 0000000..1fd31f9 --- /dev/null +++ b/PathDSP_preprocess_improve.py @@ -0,0 +1,518 @@ +import sys +import os +import numpy as np +import polars as pl +import argparse +import numpy as np +import pandas as pd +import candle +from functools import reduce +from improve import drug_resp_pred as drp +from improve import framework as frm +from pathlib import Path +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +import RWR as rwr +import NetPEA as pea +import gseapy as gp +import sklearn.model_selection as skms + + +file_path = Path(__file__).resolve().parent + +app_preproc_params = [ + # These arg should be specified in the [modelname]_default_model.txt: + # y_data_files, x_data_canc_files, x_data_drug_files + {"name": "y_data_files", # default + "type": str, + "help": "List of files that contain the y (prediction variable) data. \ + Example: [['response.tsv']]", + }, + {"name": "x_data_canc_files", # [Req] + "type": str, + "help": "List of feature files including gene_system_identifer. Examples: \n\ + 1) [['cancer_gene_expression.tsv', ['Gene_Symbol']]] \n\ + 2) [['cancer_copy_number.tsv', ['Ensembl', 'Entrez']]].", + }, + {"name": "x_data_drug_files", # [Req] + "type": str, + "help": "List of feature files. 
Examples: \n\ + 1) [['drug_SMILES.tsv']] \n\ + 2) [['drug_SMILES.tsv'], ['drug_ecfp4_nbits512.tsv']]", + }, + {"name": "canc_col_name", + "default": "improve_sample_id", # default + "type": str, + "help": "Column name in the y (response) data file that contains the cancer sample ids.", + }, + {"name": "drug_col_name", # default + "default": "improve_chem_id", + "type": str, + "help": "Column name in the y (response) data file that contains the drug ids.", + }, + +] + +# [PathDSP] Model-specific params +model_preproc_params = [ + {"name": "bit_int", + "type": int, + "default": 128, + "help": "Number of bits for morgan fingerprints.", + }, + {"name": "permutation_int", + "type": int, + "default": 3, + "help": "Number of permutation for calculating enrichment scores.", + }, + {"name": "seed_int", + "type": int, + "default": 42, + "help": "Random seed for random walk algorithm.", + }, + {"name": "cpu_int", + "type": int, + "default": 20, + "help": "Number of cpus to use when calculating pathway enrichment scores.", + }, + {"name": "drug_bits_file", + "type": str, + "default": "drug_mbit_df.txt", + "help": "File name to save the drug bits file.", + }, + {"name": "dgnet_file", + "type": str, + "default": "DGnet.txt", + "help": "File name to save the drug target net file.", + }, + {"name": "mutnet_file", + "type": str, + "default": "MUTnet.txt", + "help": "File name to save the mutation net file.", + }, + {"name": "cnvnet_file", + "type": str, + "default": "CNVnet.txt", + "help": "File name to save the CNV net file.", + }, + {"name": "exp_file", + "type": str, + "default": "EXPnet.txt", + "help": "File name to save the EXP net file.", + }, +] + +preprocess_params = app_preproc_params + model_preproc_params +req_preprocess_args = [ll["name"] for ll in preprocess_params] + + +def mkdir(directory): + directories = directory.split("/") + + folder = "" + for d in directories: + folder += d + "/" + if not os.path.exists(folder): + print("creating folder: %s" % folder) + os.mkdir(folder) + + +def preprocess(params): + params["train_data"] = frm.build_ml_data_name(params, 'train') + params["val_data"] = frm.build_ml_data_name(params, 'val') + params["test_data"] = frm.build_ml_data_name(params, 'test') + params["author_data_dir"] = os.getenv("AUTHOR_DATA_DIR") + for i in [ + "train_data", + "test_data", + "val_data", + "drug_bits_file", + "dgnet_file", + "mutnet_file", + "cnvnet_file", + "exp_file", + ]: + params[i] = params["ml_data_outdir"] + "/" + params[i] + + return params + + +# set timer +def cal_time(end, start): + """return time spent""" + # end = datetime.now(), start = datetime.now() + datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" + spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( + str(start), datetimeFormat + ) + return spend + +def response_out(params, split_file): + response_df = drp.DrugResponseLoader(params, split_file=split_file, verbose=True) + return response_df.dfs["response.tsv"] + + +def smile2bits(params): + start = datetime.now() + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + + smile_df = drp.DrugsLoader(params) + + smile_df = smile_df.dfs['drug_SMILES.tsv'] + smile_df = smile_df.reset_index() + smile_df.columns = ["drug", "smile"] + smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") + smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] + bit_int = 
params["bit_int"] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row["smile"] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) + # drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append(tuple([drug] + mbit)) + if len(mbit) == bit_int: + n_drug += 1 + print("total {:} drugs with bits".format(n_drug)) + # convert dict to dataframe + colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + # drug_mbit_df.index.name = 'drug' + print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) + # save to file + drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") + print("[Finished in {:}]".format(cal_time(datetime.now(), start))) + + +def times_expression(rwr, exp): + """ + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + """ + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) + + if len(cell_list) == 0: + print("ERROR! no overlapping cell lines") + sys.exit(1) + if len(gene_list) == 0: + print("ERROR! no overlapping genes") + sys.exit(1) + + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] + + # concat with other gene + out_gene_list = list(set(rwr.columns) - set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params["author_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" + pathway_path = ( + params["author_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + log_transform = False + permutation_int = params["permutation_int"] + seed_int = params["seed_int"] + cpu_int = params["cpu_int"] + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + omics_data = drp.OmicsLoader(params) + + if dtype == "DGnet": + drug_info = pd.read_csv(os.environ["IMPROVE_DATA_DIR"] + "/raw_data/x_data/drug_info.tsv", sep="\t") + drug_info["NAME"] = drug_info["NAME"].str.upper() + target_info = pd.read_csv( + params["author_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" + ) + target_info = target_info.rename(columns={"drug": "NAME"}) + combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( + subset=["gene"] + ) + combined_df = combined_df.loc[ + combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), + ] + restart_path = params["ml_data_outdir"] + "/drug_target.txt" + combined_df.iloc[:, -2:].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["dgnet_file"] + elif dtype == "MUTnet": + mutation_data = omics_data.dfs['cancer_mutation_count.tsv'] + #mutation_data = mutation_data.reset_index() + 
mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] > 0 + ] + mutation_data = mutation_data.loc[ + mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["ml_data_outdir"] + "/mutation_data.txt" + mutation_data.iloc[:, 0:2].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["mutnet_file"] + else: + cnv_data = omics_data.dfs['cancer_discretized_copy_number.tsv'] + #cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] != 0 + ] + cnv_data = cnv_data.loc[ + cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["ml_data_outdir"] + "/cnv_data.txt" + cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) + outpath = params["cnvnet_file"] + # perform Random Walk + print(datetime.now(), "performing random walk with restart") + rwr_df = rwr.RWR( + ppi_path, + restart_path, + restartProbFloat=0.5, + convergenceFloat=0.00001, + normalize="l1", + weighted=True, + ).get_prob() + # multiply with gene expression + if multiply_expression: + print( + datetime.now(), + "multiplying gene expression with random walk probability for genes were expressed", + ) + # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + # exp_df = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + exp_df = omics_data.dfs['cancer_gene_expression.tsv'] + exp_df = exp_df.set_index(params['canc_col_name']) + rwr_df = times_expression(rwr_df, exp_df) + # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), "performing network-based pathway enrichment") + cell_pathway_df = pea.NetPEA( + rwr_df, + pathway_path, + log_transform=log_transform, + permutation=permutation_int, + seed=seed_int, + n_cpu=cpu_int, + out_path=outpath, + ) + print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) + + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) + DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) + DGnet = ( + DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) + ) + CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) + CNVnet = ( + CNVnet.add_suffix("_cnvnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) + MUTnet = ( + MUTnet.add_suffix("_mutnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) + EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + # Extract relevant IDs + + common_drug_ids = reduce( + np.intersect1d, + (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), + ) + common_sample_ids = reduce( + np.intersect1d, + ( + 
CNVnet["sample_id"], + MUTnet["sample_id"], + EXP["sample_id"], + response_df["sample_id"], + ), + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + drug_mbit_df = ( + drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + DGnet = ( + DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + CNVnet = ( + CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + MUTnet = ( + MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + EXP = ( + EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + # for i in ['train', 'test', 'val']: + for i in ["train", "test", "val"]: + response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) + response_df = response_df.dfs['response.tsv'] + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + comb_data_mtx = pd.DataFrame( + { + "drug_id": response_df["drug_id"].values, + "sample_id": response_df["sample_id"].values, + } + ) + comb_data_mtx = ( + comb_data_mtx.set_index(["drug_id", "sample_id"]) + .join(drug_data, on="drug_id") + .join(sample_data, on="sample_id") + ) + comb_data_mtx["response"] = response_df[params["y_col_name"]].values + comb_data_mtx = comb_data_mtx.dropna() + pl.from_pandas(comb_data_mtx).write_csv( + params[i + "_data"], separator="\t", has_header=True + ) + + +def run_ssgsea(params): + # expMat = improve_utils.load_gene_expression_data(sep='\t') + # expMat = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + omics_data = drp.OmicsLoader(params) + expMat = omics_data.dfs['cancer_gene_expression.tsv'] + expMat = expMat.set_index(params['canc_col_name']) + + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = ( + params["author_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + gmt = pathway_path + tmp_str = params["ml_data_outdir"] + "/tmpdir_ssgsea/" + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea( + data=gct, # gct: a matrix of gene by sample + gene_sets=gmt, # gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, # 1000 + no_plot=True, + processes=params["cpu_int"], + # min_size=0, + format="png", + ) + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") + + f = open(tmp_str + "ssGSEA.txt", 
"r") + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") + +def run(params): + params = frm.build_paths(params) + frm.create_outdir(outdir=params["ml_data_outdir"]) + params = preprocess(params) + print("convert drug to bits.") + smile2bits(params) + print("compute DGnet.") + run_netpea(params, dtype="DGnet", multiply_expression=False) + print("compute MUTnet.") + run_netpea(params, dtype="MUTnet", multiply_expression=True) + print("compute CNVnet.") + run_netpea(params, dtype="CNVnet", multiply_expression=True) + print("compute EXP.") + run_ssgsea(params) + print("prepare final input file.") + prep_input(params) + + +def main(): + params = frm.initialize_parameters( + file_path, + default_model="PathDSP_default_model.txt", + additional_definitions=preprocess_params, + required=req_preprocess_args, + ) + run(params) + + +if __name__ == "__main__": + start = datetime.now() + main() + print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py new file mode 100644 index 0000000..25f32d9 --- /dev/null +++ b/PathDSP_train_improve.py @@ -0,0 +1,310 @@ +import candle +import os +import sys +import datetime +# IMPROVE/CANDLE imports +from improve import framework as frm +from improve.metrics import compute_metrics +#from model_utils.torch_utils import predicting +#import json +#from json import JSONEncoder +from PathDSP_preprocess_improve import cal_time, preprocess, model_preproc_params, app_preproc_params, preprocess_params + +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append(os.getcwd() + "/PathDSP") +#import FNN_new +import os +import argparse +import numpy as np +import pandas as pd +import scipy.stats as scistat +from datetime import datetime + +import sklearn.preprocessing as skpre +import sklearn.model_selection as skms +import sklearn.metrics as skmts +import sklearn.utils as skut + +import torch as tch +import torch.utils.data as tchud + +import myModel as mynet +import myDataloader as mydl +import myUtility as myutil +import polars as pl + +file_path = os.path.dirname(os.path.realpath(__file__)) + +# [Req] List of metrics names to be compute performance scores +metrics_list = ["mse", "rmse", "pcc", "scc", "r2"] + +# Currently, there are no app-specific args for the train script. +app_train_params = [] + +# [PathDSP] Model-specific params (Model: PathDSP) +model_train_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1."}, + {"name": "learning_rate", + "type": float, + "default": 0.0001, + "help": "Learning rate for the optimizer." + }, + +] + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + + + +def predicting(model, device, data_loader): + """ Method to make predictions/inference. + This is used in *train.py and *infer.py + + Parameters + ---------- + model : pytorch model + Model to evaluate. 
+ device : string + Identifier for hardware that will be used to evaluate model. + data_loader : pytorch data loader. + Object to load data to evaluate. + + Returns + ------- + total_labels: numpy array + Array with ground truth. + total_preds: numpy array + Array with inferred outputs. + """ + model.to(device) + model.eval() + total_preds = tch.Tensor() + total_labels = tch.Tensor() + print("Make prediction for {} samples...".format(len(data_loader.dataset))) + with tch.no_grad(): + for i, (data_x, data_y) in enumerate(data_loader): + data_x, data_y = data_x.to(device), data_y.to(device) + data_y_pred = model(data_x) + # Is this computationally efficient? + total_preds = tch.cat((total_preds, data_y_pred.cpu()), 0) # preds to tensor + total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor + return total_labels.numpy().flatten(), total_preds.numpy().flatten() + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["model_outdir"] + "/checkpoint.pt") # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # train result + train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## 
validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load(params["model_outdir"] + '/checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + + +def run(params): + frm.create_outdir(outdir=params["model_outdir"]) + modelpath = frm.build_model_path(params, model_dir=params["model_outdir"]) + train_data_fname = frm.build_ml_data_name(params, stage="train") + val_data_fname = frm.build_ml_data_name(params, stage="val") + params = preprocess(params) + + # set parameters + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + val_batch = params['val_batch'] + opt_fn = tch.optim.Adam + + # ------------------------------------------------------ + # [PathDSP] Prepare dataloaders + # ------------------------------------------------------ + print('loadinig data') + train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() + val_df = pl.read_csv(params['val_data'], separator = "\t").to_pandas() + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=val_batch, shuffle=False) + + # ------------------------------------------------------ + # [PathDSP] Prepare model + # ------------------------------------------------------ + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + 
m.bias.data.fill_(0.01)
+    # load model
+    n_features = Xtrain_arr.shape[1]
+    net = mynet.FNN(n_features)
+    net.apply(init_weights)
+
+    # ------------------------------------------------------
+    # [PathDSP] Training
+    # ------------------------------------------------------
+    print('start training process')
+    trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn, params)
+
+    loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))],
+                            'train loss':train_loss_list,
+                            'valid loss': valid_loss_list,
+                            'valid r2': valid_r2_list})
+    loss_df.to_csv(params['model_outdir'] + '/Val_Loss_orig.txt', header=True, index=False, sep="\t")
+
+    # make train/valid loss plots
+    best_model = trained_net
+    tch.save(best_model.state_dict(), modelpath)
+    best_model.eval()
+    # Compute predictions
+    val_true, val_pred = predicting(best_model, device, data_loader=valid_dl) # (ground truth), (predictions)
+
+    # -----------------------------
+    # [Req] Save raw predictions in dataframe
+    # -----------------------------
+    # import ipdb; ipdb.set_trace()
+    frm.store_predictions_df(
+        params, y_true=val_true, y_pred=val_pred, stage="val",
+        outdir=params["model_outdir"]
+    )
+
+    # -----------------------------
+    # [Req] Compute performance scores
+    # -----------------------------
+    # import ipdb; ipdb.set_trace()
+    val_scores = frm.compute_performace_scores(
+        params, y_true=val_true, y_pred=val_pred, stage="val",
+        outdir=params["model_outdir"], metrics=metrics_list
+    )
+    return val_scores
+
+
+def main():
+    additional_definitions = model_preproc_params + \
+                             model_train_params + \
+                             app_train_params
+    params = frm.initialize_parameters(
+        file_path,
+        default_model="PathDSP_default_model.txt",
+        additional_definitions=additional_definitions,
+        required=None,
+    )
+    val_scores = run(params)
+
+
+if __name__ == "__main__":
+    start = datetime.now()
+    main()
+    print("[Training finished in {:}]".format(cal_time(datetime.now(), start)))
diff --git a/README.md b/README.md
index d8905e7..4acc239 100644
--- a/README.md
+++ b/README.md
@@ -1,61 +1,49 @@
 # PathDSP
 Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores
-# Example usage with singularity container
-Setup Singularity
+# Download benchmark data
-```
-git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git
-cd Singularity
-./setup
-source config/improve.env
-```
-
-Build Singularity from definition file
+Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/
 
 ```
-singularity build --fakeroot PathDSP.sif definitions/PathDSP.def
+mkdir process_dir
+cd process_dir
+wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data
 ```
 
-Perform preprocessing step using processed data from original paper
-
-```
-singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0"
-```
+Benchmark data will be downloaded under `process_dir/csa_data/`
 
-Alternatively, perform preprocessing step using raw data from IMPROVE project
+# Download author data
 
 ```
-singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1"
+mkdir author_data
+cd author_data
+wget https://zenodo.org/record/6093818/files/MSigdb.zip
+wget https://zenodo.org/record/6093818/files/raw_data.zip
+wget https://zenodo.org/record/6093818/files/STRING.zip
+unzip MSigdb.zip
+unzip raw_data.zip
+unzip STRING.zip
 ```
-Train the model
-
-```
-singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir
-```
-
-Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt`
-Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt`
-
-Perform inference on the testing data
-
-```
-singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir
-```
-
-Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt`
-Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt`
+Author data will be downloaded under `process_dir/author_data/`
 
 # Example usage with Conda
 
-Download PathDSP
+Download PathDSP and IMPROVE
 
 ```
+cd ../
+mkdir repo
+cd repo
 git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git
+git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git
 cd PathDSP
 ```
 
+PathDSP will be installed at `process_dir/repo/PathDSP`
+IMPROVE will be installed at `process_dir/repo/IMPROVE`
+
 Create environment
 
 ```
@@ -68,43 +56,49 @@ Activate environment
 conda activate PathDSP_env
 ```
 
-Intall CANDLE package
+Install CANDLE package
 
 ```
 pip install git+https://github.com/ECP-CANDLE/candle_lib@develop
 ```
 
-Perform preprocessing step using processed data from original paper
+Define environment variables
 
 ```
-export CUDA_VISIBLE_DEVICES=0
-export CANDLE_DATA_DIR=./Data/
-bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0"
+improve_lib="/path/to/IMPROVE/repo/"
+pathdsp_lib="/path/to/pathdsp/repo/"
+# notice the extra PathDSP folder after pathdsp_lib
+export PYTHONPATH=$PYTHONPATH:$improve_lib:$pathdsp_lib/PathDSP/
+export IMPROVE_DATA_DIR="/path/to/csa_data/"
+export AUTHOR_DATA_DIR="/path/to/author_data/"
 ```
 
-Alternatively, perform preprocessing step using raw data from IMPROVE project
+Perform preprocessing step
 
 ```
-bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1"
+# go up two levels
+cd ../../
+python repo/PathDSP/PathDSP_preprocess_improve.py
 ```
 
 Train the model
 
 ```
-bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR
+python repo/PathDSP/PathDSP_train_improve.py
 ```
 
-Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt`
-Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt`
+Metrics regarding validation scores are located at: `${train_ml_data_dir}/val_scores.json`
+Final trained model is located at: `${train_ml_data_dir}/model.pt`.
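+
+As a quick sanity check after training, the saved validation scores can be inspected directly. The snippet below is a minimal sketch, not part of the PathDSP scripts; it assumes `val_scores.json` was written to the configured output directory (e.g. `./out_models/gCSI/split_0` from `PathDSP_default_model.txt`) and is a flat JSON dictionary mapping metric names (mse, rmse, pcc, scc, r2) to values:
+
+```python
+import json
+
+# Assumed path; point this at the directory where val_scores.json was written
+scores_path = "./out_models/gCSI/split_0/val_scores.json"
+
+with open(scores_path) as f:
+    scores = json.load(f)
+
+# Print each reported metric and its value
+for metric, value in scores.items():
+    print(f"{metric}: {value}")
+```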
Parameter definitions can be found at `process_dir/repo/PathDSP/PathDSP_default_model.txt` Perform inference on the testing data ``` -bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +python PathDSP_infer_improve.py ``` -Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` -Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + # Docs from original authors (below) @@ -131,4 +125,4 @@ Pathway enrichment scores for categorical data (i.e., mutation, copy number vari # Reference -Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/README_old.md b/README_old.md new file mode 100644 index 0000000..84ea104 --- /dev/null +++ b/README_old.md @@ -0,0 +1,134 @@ +# PathDSP +Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores + +# Example usage with singularity container +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing step using processed data from original paper + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1" +``` + +Train the model + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` + +# Example usage with Conda + +Download PathDSP + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +cd PathDSP +``` + +Create environment + +``` +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Intall CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Perform preprocessing step using processed data from original paper + +``` +export CUDA_VISIBLE_DEVICES=0 +export CANDLE_DATA_DIR=./Data/ +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` 
+bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1" +``` + +Train the model + +``` +bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` + +# Docs from original authors (below) + +# Requirments + +# Input format + +|drug|cell|feature_1|....|feature_n|drug_response| +|----|----|--------|----|--------|----| +|5-FU|03|0|....|0.02|-2.3| +|5-FU|23|1|....|0.04|-3.4| + +Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing +# Usage: +```python +# run FNN +python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix + +Where input.txt should be in the input format shown above. +Example input file can be found at https://zenodo.org/record/7532963 +``` +# Data preprocessing +Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby + + +# Reference +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. 
https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/preprocess_improve.py b/preprocess_improve.py deleted file mode 100644 index fa6d6d0..0000000 --- a/preprocess_improve.py +++ /dev/null @@ -1,677 +0,0 @@ -import sys -import os -import numpy as np -import polars as pl - -# import torch -# import torch.utils.data as du -# from torch.autograd import Variable -# import torch.nn as nn -# import torch.nn.functional as F -# from code.drugcell_NN import * -import argparse -import numpy as np -import pandas as pd -import candle - -# import time -# import logging -# import networkx as nx -# import networkx.algorithms.components.connected as nxacc -# import networkx.algorithms.dag as nxadag -# from pathlib import Path -from functools import reduce - -# import improve_utils -# swtich from improve_utils to improve repo -sys.path.append("./IMPROVE/") -from improve import drug_resp_pred as drp -from pathlib import Path - -# import RDKit -from rdkit import Chem -from rdkit.Chem import AllChem -from datetime import datetime - -# import NetPEA modules -import RWR as rwr -import NetPEA as pea - -# import gsea module -import gseapy as gp -import sklearn.model_selection as skms - - -file_path = os.path.dirname(os.path.realpath(__file__)) -# fdir = Path('__file__').resolve().parent -# source = 'csa_data/raw_data/splits/' -required = None -additional_definitions = None - -# This should be set outside as a user environment variable -# os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' - - -# initialize class -class PathDSP_candle(candle.Benchmark): - def set_locals(self): - """ - Functionality to set variables specific for the benchmark - - required: set of required parameters for the benchmark. - - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
- """ - if required is not None: - self.required = set(required) - if additional_definitions is not None: - self.additional_definitions = additional_definitions - - -def initialize_parameters(): - preprocessor_bmk = PathDSP_candle( - file_path, - "PathDSP_params.txt", - "pytorch", - prog="PathDSP_candle", - desc="Data Preprocessor", - ) - # Initialize parameters - gParameters = candle.finalize_parameters(preprocessor_bmk) - return gParameters - - -def mkdir(directory): - directories = directory.split("/") - - folder = "" - for d in directories: - folder += d + "/" - if not os.path.exists(folder): - print("creating folder: %s" % folder) - os.mkdir(folder) - - -def preprocess(params, data_dir): - print(os.environ["CANDLE_DATA_DIR"]) - # requirements go here - # keys_parsing = ['output_dir', 'hidden', 'result', 'metric', 'data_type'] - if not os.path.exists(data_dir): - mkdir(data_dir) - params["data_dir"] = data_dir - # args = candle.ArgumentStruct(**params) - for i in [ - "train_data", - "test_data", - "val_data", - "drug_bits_file", - "dgnet_file", - "mutnet_file", - "cnvnet_file", - "exp_file", - ]: - params[i] = params["data_dir"] + "/" + params[i] - params["x_data_path"] = ( - os.environ["CANDLE_DATA_DIR"] + "/csa_data/raw_data/x_data/" - ) - return params - - -def download_anl_data(params): - csa_data_folder = os.path.join( - os.environ["CANDLE_DATA_DIR"], "csa_data", "raw_data" - ) - splits_dir = os.path.join(csa_data_folder, "splits") - x_data_dir = os.path.join(csa_data_folder, "x_data") - y_data_dir = os.path.join(csa_data_folder, "y_data") - - if not os.path.exists(csa_data_folder): - print("creating folder: %s" % csa_data_folder) - os.makedirs(csa_data_folder) - mkdir(splits_dir) - mkdir(x_data_dir) - mkdir(y_data_dir) - - for improve_file in [ - "CCLE_all.txt", - "CCLE_split_" + str(params["split"]) + "_test.txt", - "CCLE_split_" + str(params["split"]) + "_train.txt", - "CCLE_split_" + str(params["split"]) + "_val.txt", - "CTRPv2_all.txt", - "CTRPv2_split_" + str(params["split"]) + "_test.txt", - "CTRPv2_split_" + str(params["split"]) + "_train.txt", - "CTRPv2_split_" + str(params["split"]) + "_val.txt", - "gCSI_all.txt", - "GDSCv1_all.txt", - "GDSCv2_all.txt", - ]: - url_dir = params["improve_data_url"] + "/splits/" - candle.file_utils.get_file( - improve_file, url_dir + improve_file, datadir=splits_dir, cache_subdir=None - ) - - for improve_file in [ - "cancer_mutation_count.tsv", - "drug_SMILES.tsv", - "drug_info.tsv", - "cancer_discretized_copy_number.tsv", - "cancer_gene_expression.tsv", - ]: - url_dir = params["improve_data_url"] + "/x_data/" - candle.file_utils.get_file( - fname=improve_file, - origin=url_dir + improve_file, - datadir=x_data_dir, - cache_subdir=None, - ) - - url_dir = params["improve_data_url"] + "/y_data/" - response_file = "response.tsv" - candle.file_utils.get_file( - fname=response_file, - origin=url_dir + response_file, - datadir=y_data_dir, - cache_subdir=None, - ) - - ## get gene-set data and string data - for db_file in [params["gene_set"], params["ppi_data"], params["drug_target"]]: - candle.file_utils.get_file( - db_file, - params["data_url"] + "/" + db_file, - datadir=params["data_dir"], - cache_subdir=None, - ) - - -# set timer -def cal_time(end, start): - """return time spent""" - # end = datetime.now(), start = datetime.now() - datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" - spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( - str(start), datetimeFormat - ) - return spend - - -def download_author_data(params): - 
data_download_filepath = candle.file_utils.get_file( - params["original_data"], - params["original_data_url"] + "/" + params["original_data"], - datadir=params["data_dir"], - cache_subdir=None, - ) - print("download_path: {}".format(data_download_filepath)) - random_seed = 42 - df = pd.read_csv( - params["data_dir"] + "/input.txt", sep="\t" - ) # Modify the separator if needed - df = df.set_index(["drug", "cell"]) - train_data, temp_data = skms.train_test_split( - df, test_size=0.2, random_state=random_seed - ) - val_data, test_data = skms.train_test_split( - temp_data, test_size=0.5, random_state=random_seed - ) - pl.from_pandas(train_data).write_csv( - params["train_data"], separator="\t", has_header=True - ) - pl.from_pandas(val_data).write_csv( - params["val_data"], separator="\t", has_header=True - ) - pl.from_pandas(test_data).write_csv( - params["test_data"], separator="\t", has_header=True - ) - - -def load_smiles_data(fname: str, sep: str = "\t", verbose: bool = True) -> pd.DataFrame: - """ - IMPROVE-specific func. - Read smiles data. - src_raw_data_dir : data dir where the raw DRP data is stored - """ - df = pd.read_csv(fname, sep=sep) - - # TODO: updated this after we update the data - df.columns = ["improve_chem_id", "smiles"] - - if verbose: - print(f"SMILES data: {df.shape}") - # print(df.dtypes) - # print(df.dtypes.value_counts()) - return df - - -def smile2bits(params): - start = datetime.now() - # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], - # split=params['split'], split_type=["train", "test", "val"], - # y_col_name=params['metric']) - response_df = [ - drp.load_response_data( - y_data_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" - ), - split_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] - ), - ) - for file in ["train_split_file", "test_split_file", "val_split_file"] - ] - response_df = pd.concat(response_df, ignore_index=True) - # smile_df = improve_utils.load_smiles_data() - # params['x_data_path'] = params['CANDLE_DATA_DIR'] + '/x_data/' - # dd = drp.DrugsLoader(params) - smile_df = load_smiles_data( - fname=params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/x_data/drug_SMILES.tsv" - ) - smile_df.columns = ["drug", "smile"] - smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") - smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] - bit_int = params["bit_int"] - record_list = [] - # smile2bits drug by drug - n_drug = 1 - for idx, row in smile_df.iterrows(): - drug = idx - smile = row["smile"] - mol = Chem.MolFromSmiles(smile) - if mol is None: - continue - mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) - # drug_mbit_dict.update({drug:mbit}) - # append to result - record_list.append(tuple([drug] + mbit)) - if len(mbit) == bit_int: - n_drug += 1 - print("total {:} drugs with bits".format(n_drug)) - # convert dict to dataframe - colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] - drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) - # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) - # drug_mbit_df.index.name = 'drug' - print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) - # save to file - drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") - print("[Finished in {:}]".format(cal_time(datetime.now(), start))) - - -def 
times_expression(rwr, exp): - """ - :param rwrDf: dataframe of cell by gene probability matrix - :param expDf: dataframe of cell by gene expression matrix - :return rwr_timesexp_df: dataframe of cell by gene probability matrix, - in which genes are multiplied with expression values - - Note: this function assumes cells are all overlapped while gene maybe not - """ - cell_list = sorted(list(set(rwr.index) & set(exp.index))) - gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) - - if len(cell_list) == 0: - print("ERROR! no overlapping cell lines") - sys.exit(1) - if len(gene_list) == 0: - print("ERROR! no overlapping genes") - sys.exit(1) - - # multiply with gene expression for overlapping cell, gene - rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] - - # concat with other gene - out_gene_list = list(set(rwr.columns) - set(gene_list)) - out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) - return out_df - - -def run_netpea(params, dtype, multiply_expression): - # timer - start_time = datetime.now() - ppi_path = params["data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" - pathway_path = ( - params["data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" - ) - log_transform = False - permutation_int = params["permutation_int"] - seed_int = params["seed_int"] - cpu_int = params["cpu_int"] - csa_data_folder = os.path.join( - os.environ["CANDLE_DATA_DIR"], "csa_data", "raw_data" - ) - # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], - # split=params['split'], split_type=["train", "test", "val"], - # y_col_name=params['metric']) - response_df = [ - drp.load_response_data( - y_data_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" - ), - split_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] - ), - ) - for file in ["train_split_file", "test_split_file", "val_split_file"] - ] - response_df = pd.concat(response_df, ignore_index=True) - if dtype == "DGnet": - drug_info = pd.read_csv(csa_data_folder + "/x_data/drug_info.tsv", sep="\t") - drug_info["NAME"] = drug_info["NAME"].str.upper() - target_info = pd.read_csv( - params["data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" - ) - target_info = target_info.rename(columns={"drug": "NAME"}) - combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( - subset=["gene"] - ) - combined_df = combined_df.loc[ - combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), - ] - restart_path = params["data_dir"] + "/drug_target.txt" - combined_df.iloc[:, -2:].to_csv( - restart_path, sep="\t", header=True, index=False - ) - outpath = params["dgnet_file"] - elif dtype == "MUTnet": - # mutation_data = improve_utils.load_mutation_count_data(gene_system_identifier='Gene_Symbol') - mutation_data = drp.load_omics_data( - params, - omics_type="mutation_count", - canc_col_name="improve_sample_id", - gene_system_identifier="Gene_Symbol", - ) - mutation_data = mutation_data.reset_index() - mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ - lambda x: x["value"] > 0 - ] - mutation_data = mutation_data.loc[ - mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), - ] - restart_path = params["data_dir"] + "/mutation_data.txt" - mutation_data.iloc[:, 0:2].to_csv( - restart_path, sep="\t", header=True, index=False - ) - outpath = params["mutnet_file"] - else: - # cnv_data = 
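# Aside: each branch of run_netpea reduces its input to a small two-column "restart"
# file pairing an entity (drug or cell sample) with seed genes: drug targets for
# DGnet, mutated genes for MUTnet, genes with copy-number changes for CNVnet. Those
# pairs seed the random walk with restart over the STRING network, and NetPEA then
# turns the resulting visiting probabilities into per-pathway enrichment scores,
# optionally weighted by expression through times_expression above.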
improve_utils.load_discretized_copy_number_data(gene_system_identifier='Gene_Symbol') - cnv_data = drp.load_omics_data( - params, - omics_type="discretized_copy_number", - canc_col_name="improve_sample_id", - gene_system_identifier="Gene_Symbol", - ) - cnv_data = cnv_data.reset_index() - cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ - lambda x: x["value"] != 0 - ] - cnv_data = cnv_data.loc[ - cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), - ] - restart_path = params["data_dir"] + "/cnv_data.txt" - cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) - outpath = params["cnvnet_file"] - # perform Random Walk - print(datetime.now(), "performing random walk with restart") - rwr_df = rwr.RWR( - ppi_path, - restart_path, - restartProbFloat=0.5, - convergenceFloat=0.00001, - normalize="l1", - weighted=True, - ).get_prob() - # multiply with gene expression - if multiply_expression: - print( - datetime.now(), - "multiplying gene expression with random walk probability for genes were expressed", - ) - # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') - exp_df = drp.load_omics_data( - params, - omics_type="gene_expression", - canc_col_name="improve_sample_id", - gene_system_identifier="Gene_Symbol", - ) - rwr_df = times_expression(rwr_df, exp_df) - # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') - # perform Pathwa Enrichment Analysis - print(datetime.now(), "performing network-based pathway enrichment") - cell_pathway_df = pea.NetPEA( - rwr_df, - pathway_path, - log_transform=log_transform, - permutation=permutation_int, - seed=seed_int, - n_cpu=cpu_int, - out_path=outpath, - ) - print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) - - -def prep_input(params): - # Read data files - drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) - drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) - DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) - DGnet = ( - DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) - ) - CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) - CNVnet = ( - CNVnet.add_suffix("_cnvnet") - .reset_index() - .rename(columns={"index": "sample_id"}) - ) - MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) - MUTnet = ( - MUTnet.add_suffix("_mutnet") - .reset_index() - .rename(columns={"index": "sample_id"}) - ) - EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) - EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) - # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], - # split_type=['train', 'test', 'val'], - # y_col_name= params['metric']) - response_df = [ - drp.load_response_data( - y_data_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" - ), - split_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] - ), - ) - for file in ["train_split_file", "test_split_file", "val_split_file"] - ] - response_df = pd.concat(response_df, ignore_index=True) - response_df = response_df.rename( - columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} - ) - # Extract relevant IDs - - common_drug_ids = reduce( - np.intersect1d, - (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), - ) - common_sample_ids = reduce( - np.intersect1d, - ( - 
CNVnet["sample_id"], - MUTnet["sample_id"], - EXP["sample_id"], - response_df["sample_id"], - ), - ) - response_df = response_df.loc[ - (response_df["drug_id"].isin(common_drug_ids)) - & (response_df["sample_id"].isin(common_sample_ids)), - :, - ] - drug_mbit_df = ( - drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] - .set_index("drug_id") - .sort_index() - ) - DGnet = ( - DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] - .set_index("drug_id") - .sort_index() - ) - CNVnet = ( - CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] - .set_index("sample_id") - .sort_index() - ) - MUTnet = ( - MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] - .set_index("sample_id") - .sort_index() - ) - EXP = ( - EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] - .set_index("sample_id") - .sort_index() - ) - - drug_data = drug_mbit_df.join(DGnet) - sample_data = CNVnet.join([MUTnet, EXP]) - ## export train,val,test set - # for i in ['train', 'test', 'val']: - for i in ["train_split_file", "test_split_file", "val_split_file"]: - # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], - # split_type=i, - # y_col_name= params['metric']) - response_df = drp.load_response_data( - y_data_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" - ), - split_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[i] - ), - ) - response_df = response_df.rename( - columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} - ) - response_df = response_df.loc[ - (response_df["drug_id"].isin(common_drug_ids)) - & (response_df["sample_id"].isin(common_sample_ids)), - :, - ] - comb_data_mtx = pd.DataFrame( - { - "drug_id": response_df["drug_id"].values, - "sample_id": response_df["sample_id"].values, - } - ) - comb_data_mtx = ( - comb_data_mtx.set_index(["drug_id", "sample_id"]) - .join(drug_data, on="drug_id") - .join(sample_data, on="sample_id") - ) - comb_data_mtx["response"] = response_df[params["metric"]].values - comb_data_mtx = comb_data_mtx.dropna() - pl.from_pandas(comb_data_mtx).write_csv( - params[i + "_data"], separator="\t", has_header=True - ) - - -def run_ssgsea(params): - # expMat = improve_utils.load_gene_expression_data(sep='\t') - expMat = drp.load_omics_data( - params, - omics_type="gene_expression", - canc_col_name="improve_sample_id", - gene_system_identifier="Gene_Symbol", - ) - # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], - # split=params['split'], split_type=["train", "test", "val"], - # y_col_name=params['metric']) - response_df = [ - drp.load_response_data( - y_data_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/y_data/response.tsv" - ), - split_fpath=Path( - params["CANDLE_DATA_DIR"] + "/csa_data/raw_data/splits/" + params[file] - ), - ) - for file in ["train_split_file", "test_split_file", "val_split_file"] - ] - response_df = pd.concat(response_df, ignore_index=True) - expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] - gct = expMat.T # gene (rows) cell lines (columns) - pathway_path = ( - params["data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" - ) - gmt = pathway_path - tmp_str = params["data_dir"] - - if not os.path.isdir(tmp_str): - os.mkdir(tmp_str) - - # run enrichment - ssgsea = gp.ssgsea( - data=gct, # gct: a matrix of gene by sample - gene_sets=gmt, # gmt format - outdir=tmp_str, - scale=True, - permutation_num=0, # 1000 - 
no_plot=True, - processes=params["cpu_int"], - # min_size=0, - format="png", - ) - - result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) - result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") - - f = open(tmp_str + "ssGSEA.txt", "r") - lines = f.readlines() - total_dict = {} - for cell in set(lines[1].split()): - total_dict[cell] = {} - cell_lines = lines[1].split() - vals = lines[4].split() - for i, pathway in enumerate((lines[2].split())): - if i > 0: - total_dict[cell_lines[i]][pathway] = float(vals[i]) - df = pd.DataFrame(total_dict) - df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") - - -def candle_main(anl): - params = initialize_parameters() - data_dir = os.environ["CANDLE_DATA_DIR"] + "/" + "/Data/" - params = preprocess(params, data_dir) - if params["improve_analysis"] == "yes" or anl == 1: - download_anl_data(params) - print("convert drug to bits.") - smile2bits(params) - print("compute DGnet.") - run_netpea(params, dtype="DGnet", multiply_expression=False) - print("compute MUTnet.") - run_netpea(params, dtype="MUTnet", multiply_expression=True) - print("compute CNVnet.") - run_netpea(params, dtype="CNVnet", multiply_expression=True) - print("compute EXP.") - run_ssgsea(params) - print("prepare final input file.") - prep_input(params) - else: - download_author_data(params) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "-a", - dest="anl", - type=int, - default=0, - help="""whether to perform preprocessing using anl data or directly use processed - data from the original paper, default to 0 to use processed data from original paper""", - ) - args = parser.parse_args() - start = datetime.now() - candle_main(args.anl) - print("[Finished in {:}]".format(cal_time(datetime.now(), start))) From dc1f3b456d566c58fbb1dc6ac651d430c4609b9a Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 6 Dec 2023 09:21:46 -0800 Subject: [PATCH 073/254] update readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 4acc239..6adfdae 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,7 @@ Define enviroment variabels improve_lib="/path/to/IMPROVE/repo/" pathdsp_lib="/path/to/pathdsp/repo/" # notice the extra PathDSP folder after pathdsp_lib -export PYTHONPATH=$PYTHONPATH:improve_lib:pathdsp_lib/PathDSP/ -export IMPROVE_DATA_DIR="/path/to/csa_data/" +export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/export IMPROVE_DATA_DIR="/path/to/csa_data/" export AUTHOR_DATA_DIR="/path/to/author_data/" ``` From a216999a776dbbb8ab0f2d7124067bc419d4ecbf Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Wed, 6 Dec 2023 14:05:20 -0600 Subject: [PATCH 074/254] update to use IMPROVE library (#8) * update preprocess script * update preprocess script * add improve_utils script * add nea scripts * update params * add gitignore * EXP processing * updated to integrate with prep_input * add definition file * update .gitignore * update filename for ssGSEA * add FNN_new * add train/infer * update params * add .yml * update params * update conda path * fix conda * update preprocess.sh * update preprocess.sh * update preprocess_new.py * update env * update preproce_new.py * update preproce_new.py * update files * update params * fix params * update preproce_new.py * update preprocess_new.py * update preprocess_new.py * update file * update file * update file * update script * add def * add script * update file * update FNN_new * 
update FNN * update params * fix param * fix bug * add time * update def * update yml * update train.sh * update train.sh * update train.py * update train * fix bug * update file * update file * use polars * update files * update preprocess * update infer.sh * process author data * fix args * add infer.sh * update doc * fix path * fix conda * use improve repo * use improve module * update readme --------- Co-authored-by: willherbert27 --- PathDSP.def | 3 +- PathDSP_default_model.txt | 43 +++ PathDSP_infer_improve.py | 88 ++++++ PathDSP_params.txt | 13 +- PathDSP_preprocess_improve.py | 518 ++++++++++++++++++++++++++++++++++ PathDSP_train_improve.py | 310 ++++++++++++++++++++ README.md | 93 +++--- README_old.md | 134 +++++++++ preprocess.sh | 2 +- 9 files changed, 1151 insertions(+), 53 deletions(-) create mode 100644 PathDSP_default_model.txt create mode 100755 PathDSP_infer_improve.py create mode 100644 PathDSP_preprocess_improve.py create mode 100644 PathDSP_train_improve.py create mode 100644 README_old.md diff --git a/PathDSP.def b/PathDSP.def index 8ed6d86..61e45c2 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -43,8 +43,9 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git + git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git + export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/ cd PathDSP - # download conda /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt new file mode 100644 index 0000000..5951ee3 --- /dev/null +++ b/PathDSP_default_model.txt @@ -0,0 +1,43 @@ +[Global_Params] +model_name='PathDSP' + +[Preprocess] +train_split_file = "gCSI_split_0_train.txt" +val_split_file = "gCSI_split_0_val.txt" +test_split_file = "gCSI_split_0_test.txt" +ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] +data_format = ".txt" +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +bit_int=128 +permutation_int=3 +seed_int=42 +cpu_int=20 + +[Train] +train_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" +val_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" +model_outdir = "./out_models/gCSI/split_0" +model_file_name = "model" +model_file_format = ".pt" +epochs=800 +batch_size = 32 +val_batch = 32 +loss = "mse" +early_stop_metric = "mse" +patience = 30 +cuda_name = "cuda:2" +learning_rate = 0.001 + +[Infer] +test_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" +model_dir = "./out_models/gCSI/split_0" +infer_outdir = "./out_infer/gCSI-gCSI/split_0" +test_batch = 256 +cuda_name = "cuda:3" \ No newline at end of file diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py new file mode 100755 index 0000000..b91c8fa --- /dev/null +++ b/PathDSP_infer_improve.py @@ -0,0 +1,88 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from PathDSP_preprocess_improve import mkdir, preprocess +from PathDSP_train_improve import predicting +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import sklearn.metrics 
as skmts +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append(os.getcwd() + "/PathDSP") +import myModel as mynet +import myDataloader as mydl +import myDatasplit as mysplit +import myUtility as myutil + +from improve import framework as frm +# from improve.torch_utils import TestbedDataset +from improve.metrics import compute_metrics + +from PathDSP_train_improve import ( + preprocess, + cal_time, + metrics_list, + model_preproc_params, + model_train_params, +) + +file_path = os.path.dirname(os.path.realpath(__file__)) + +# [Req] App-specific params +app_infer_params = [] + +# [PathDSP] Model-specific params (Model: PathDSP) +model_infer_params = [] + +def run(params): + frm.create_outdir(outdir=params["infer_outdir"]) + params = preprocess(params) + test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = mynet.FNN(Xtest_arr.shape[1]) + modelpath = frm.build_model_path(params, model_dir=params["model_dir"]) + trained_net.load_state_dict(tch.load(modelpath)) + trained_net.eval() + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) + start = datetime.now() + test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + frm.store_predictions_df( + params, y_true=test_true, y_pred=test_pred, stage="test", + outdir=params["infer_outdir"] + ) + test_scores = frm.compute_performace_scores( + params, y_true=test_true, y_pred=test_pred, stage="test", + outdir=params["infer_outdir"], metrics=metrics_list + ) + print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + return test_scores + +def main(): + additional_definitions = model_preproc_params + \ + model_train_params + \ + model_infer_params + \ + app_infer_params + params = frm.initialize_parameters( + file_path, + default_model="PathDSP_default_model.txt", + additional_definitions=additional_definitions, + required=None, + ) + test_scores = run(params) + print("\nFinished inference of PathDSP model.") + + +if __name__ == "__main__": + main() diff --git a/PathDSP_params.txt b/PathDSP_params.txt index a1b289f..12de5d5 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,6 +1,15 @@ [Global_Params] - model_name='PathDSP' + +[Preprocess] +train_split_file = "gCSI_split_0_train.txt" +val_split_file = "gCSI_split_0_val.txt" +test_split_file = "gCSI_split_0_test.txt" +ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] + data_url='https://zenodo.org/record/6093818/files/' improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/raw_data/' original_data_url='https://zenodo.org/record/7532963/files/' @@ -8,6 +17,7 @@ original_data='input.zip' gene_set = 'MSigdb.zip' ppi_data = 'STRING.zip' drug_target = 'raw_data.zip' +raw_data_dir = "raw_data" train_data = 'PathDSP_train.txt' 
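# Aside: the preprocess, train, and infer scripts read overlapping keys from these
# parameter files, and each stage consumes the previous stage's output, so the
# directory keys have to stay in sync across sections. A hypothetical fragment
# retargeting a run at split 1 of gCSI (illustration only) would look like:
#   train_split_file = "gCSI_split_1_train.txt"
#   ml_data_outdir = "./ml_data/gCSI-gCSI/split_1"
#   train_ml_data_dir = "./ml_data/gCSI-gCSI/split_1"
#   val_ml_data_dir = "./ml_data/gCSI-gCSI/split_1"
#   test_ml_data_dir = "./ml_data/gCSI-gCSI/split_1"
#   model_outdir = "./out_models/gCSI/split_1"
#   model_dir = "./out_models/gCSI/split_1"
#   infer_outdir = "./out_infer/gCSI-gCSI/split_1"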
test_data = 'PathDSP_test.txt' val_data = 'PathDSP_val.txt' @@ -19,6 +29,7 @@ exp_file='EXP.txt' #output='Result/' bit_int=128 permutation_int=3 +y_col_name = 'auc' metric='auc' data_type='CTRPv2' split=0 diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py new file mode 100644 index 0000000..1fd31f9 --- /dev/null +++ b/PathDSP_preprocess_improve.py @@ -0,0 +1,518 @@ +import sys +import os +import numpy as np +import polars as pl +import argparse +import numpy as np +import pandas as pd +import candle +from functools import reduce +from improve import drug_resp_pred as drp +from improve import framework as frm +from pathlib import Path +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +import RWR as rwr +import NetPEA as pea +import gseapy as gp +import sklearn.model_selection as skms + + +file_path = Path(__file__).resolve().parent + +app_preproc_params = [ + # These arg should be specified in the [modelname]_default_model.txt: + # y_data_files, x_data_canc_files, x_data_drug_files + {"name": "y_data_files", # default + "type": str, + "help": "List of files that contain the y (prediction variable) data. \ + Example: [['response.tsv']]", + }, + {"name": "x_data_canc_files", # [Req] + "type": str, + "help": "List of feature files including gene_system_identifer. Examples: \n\ + 1) [['cancer_gene_expression.tsv', ['Gene_Symbol']]] \n\ + 2) [['cancer_copy_number.tsv', ['Ensembl', 'Entrez']]].", + }, + {"name": "x_data_drug_files", # [Req] + "type": str, + "help": "List of feature files. Examples: \n\ + 1) [['drug_SMILES.tsv']] \n\ + 2) [['drug_SMILES.tsv'], ['drug_ecfp4_nbits512.tsv']]", + }, + {"name": "canc_col_name", + "default": "improve_sample_id", # default + "type": str, + "help": "Column name in the y (response) data file that contains the cancer sample ids.", + }, + {"name": "drug_col_name", # default + "default": "improve_chem_id", + "type": str, + "help": "Column name in the y (response) data file that contains the drug ids.", + }, + +] + +# [PathDSP] Model-specific params +model_preproc_params = [ + {"name": "bit_int", + "type": int, + "default": 128, + "help": "Number of bits for morgan fingerprints.", + }, + {"name": "permutation_int", + "type": int, + "default": 3, + "help": "Number of permutation for calculating enrichment scores.", + }, + {"name": "seed_int", + "type": int, + "default": 42, + "help": "Random seed for random walk algorithm.", + }, + {"name": "cpu_int", + "type": int, + "default": 20, + "help": "Number of cpus to use when calculating pathway enrichment scores.", + }, + {"name": "drug_bits_file", + "type": str, + "default": "drug_mbit_df.txt", + "help": "File name to save the drug bits file.", + }, + {"name": "dgnet_file", + "type": str, + "default": "DGnet.txt", + "help": "File name to save the drug target net file.", + }, + {"name": "mutnet_file", + "type": str, + "default": "MUTnet.txt", + "help": "File name to save the mutation net file.", + }, + {"name": "cnvnet_file", + "type": str, + "default": "CNVnet.txt", + "help": "File name to save the CNV net file.", + }, + {"name": "exp_file", + "type": str, + "default": "EXPnet.txt", + "help": "File name to save the EXP net file.", + }, +] + +preprocess_params = app_preproc_params + model_preproc_params +req_preprocess_args = [ll["name"] for ll in preprocess_params] + + +def mkdir(directory): + directories = directory.split("/") + + folder = "" + for d in directories: + folder += d + "/" + if not os.path.exists(folder): + print("creating folder: %s" % 
folder) + os.mkdir(folder) + + +def preprocess(params): + params["train_data"] = frm.build_ml_data_name(params, 'train') + params["val_data"] = frm.build_ml_data_name(params, 'val') + params["test_data"] = frm.build_ml_data_name(params, 'test') + params["author_data_dir"] = os.getenv("AUTHOR_DATA_DIR") + for i in [ + "train_data", + "test_data", + "val_data", + "drug_bits_file", + "dgnet_file", + "mutnet_file", + "cnvnet_file", + "exp_file", + ]: + params[i] = params["ml_data_outdir"] + "/" + params[i] + + return params + + +# set timer +def cal_time(end, start): + """return time spent""" + # end = datetime.now(), start = datetime.now() + datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" + spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( + str(start), datetimeFormat + ) + return spend + +def response_out(params, split_file): + response_df = drp.DrugResponseLoader(params, split_file=split_file, verbose=True) + return response_df.dfs["response.tsv"] + + +def smile2bits(params): + start = datetime.now() + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + + smile_df = drp.DrugsLoader(params) + + smile_df = smile_df.dfs['drug_SMILES.tsv'] + smile_df = smile_df.reset_index() + smile_df.columns = ["drug", "smile"] + smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") + smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] + bit_int = params["bit_int"] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row["smile"] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) + # drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append(tuple([drug] + mbit)) + if len(mbit) == bit_int: + n_drug += 1 + print("total {:} drugs with bits".format(n_drug)) + # convert dict to dataframe + colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + # drug_mbit_df.index.name = 'drug' + print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) + # save to file + drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") + print("[Finished in {:}]".format(cal_time(datetime.now(), start))) + + +def times_expression(rwr, exp): + """ + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + """ + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) + + if len(cell_list) == 0: + print("ERROR! no overlapping cell lines") + sys.exit(1) + if len(gene_list) == 0: + print("ERROR! 
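# Aside: smile2bits above reduces each SMILES string to a fixed-length Morgan
# fingerprint. A minimal standalone sketch of that conversion, using the same radius
# and the default bit_int=128 from the config; the molecule here is a made-up
# example, real inputs come from drug_SMILES.tsv:

from rdkit import Chem
from rdkit.Chem import AllChem

smiles = "CCO"  # hypothetical example molecule
mol = Chem.MolFromSmiles(smiles)  # returns None if the SMILES cannot be parsed
if mol is not None:
    bits = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=128))
    print(len(bits))  # 128 zero/one features, written out as columns mBit_0 ... mBit_127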
no overlapping genes") + sys.exit(1) + + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] + + # concat with other gene + out_gene_list = list(set(rwr.columns) - set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params["author_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" + pathway_path = ( + params["author_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + log_transform = False + permutation_int = params["permutation_int"] + seed_int = params["seed_int"] + cpu_int = params["cpu_int"] + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + omics_data = drp.OmicsLoader(params) + + if dtype == "DGnet": + drug_info = pd.read_csv(os.environ["IMPROVE_DATA_DIR"] + "/raw_data/x_data/drug_info.tsv", sep="\t") + drug_info["NAME"] = drug_info["NAME"].str.upper() + target_info = pd.read_csv( + params["author_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" + ) + target_info = target_info.rename(columns={"drug": "NAME"}) + combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( + subset=["gene"] + ) + combined_df = combined_df.loc[ + combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), + ] + restart_path = params["ml_data_outdir"] + "/drug_target.txt" + combined_df.iloc[:, -2:].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["dgnet_file"] + elif dtype == "MUTnet": + mutation_data = omics_data.dfs['cancer_mutation_count.tsv'] + #mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] > 0 + ] + mutation_data = mutation_data.loc[ + mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["ml_data_outdir"] + "/mutation_data.txt" + mutation_data.iloc[:, 0:2].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["mutnet_file"] + else: + cnv_data = omics_data.dfs['cancer_discretized_copy_number.tsv'] + #cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] != 0 + ] + cnv_data = cnv_data.loc[ + cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["ml_data_outdir"] + "/cnv_data.txt" + cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) + outpath = params["cnvnet_file"] + # perform Random Walk + print(datetime.now(), "performing random walk with restart") + rwr_df = rwr.RWR( + ppi_path, + restart_path, + restartProbFloat=0.5, + convergenceFloat=0.00001, + normalize="l1", + weighted=True, + ).get_prob() + # multiply with gene expression + if multiply_expression: + print( + datetime.now(), + "multiplying gene expression with random walk probability for genes were expressed", + ) + # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + # exp_df = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + exp_df = omics_data.dfs['cancer_gene_expression.tsv'] + exp_df = exp_df.set_index(params['canc_col_name']) + 
rwr_df = times_expression(rwr_df, exp_df) + # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), "performing network-based pathway enrichment") + cell_pathway_df = pea.NetPEA( + rwr_df, + pathway_path, + log_transform=log_transform, + permutation=permutation_int, + seed=seed_int, + n_cpu=cpu_int, + out_path=outpath, + ) + print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) + + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) + DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) + DGnet = ( + DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) + ) + CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) + CNVnet = ( + CNVnet.add_suffix("_cnvnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) + MUTnet = ( + MUTnet.add_suffix("_mutnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) + EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + # Extract relevant IDs + + common_drug_ids = reduce( + np.intersect1d, + (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), + ) + common_sample_ids = reduce( + np.intersect1d, + ( + CNVnet["sample_id"], + MUTnet["sample_id"], + EXP["sample_id"], + response_df["sample_id"], + ), + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + drug_mbit_df = ( + drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + DGnet = ( + DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + CNVnet = ( + CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + MUTnet = ( + MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + EXP = ( + EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + # for i in ['train', 'test', 'val']: + for i in ["train", "test", "val"]: + response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) + response_df = response_df.dfs['response.tsv'] + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + comb_data_mtx = pd.DataFrame( + { + "drug_id": response_df["drug_id"].values, + "sample_id": response_df["sample_id"].values, + } + ) + comb_data_mtx = ( + comb_data_mtx.set_index(["drug_id", "sample_id"]) + .join(drug_data, on="drug_id") + 
.join(sample_data, on="sample_id") + ) + comb_data_mtx["response"] = response_df[params["y_col_name"]].values + comb_data_mtx = comb_data_mtx.dropna() + pl.from_pandas(comb_data_mtx).write_csv( + params[i + "_data"], separator="\t", has_header=True + ) + + +def run_ssgsea(params): + # expMat = improve_utils.load_gene_expression_data(sep='\t') + # expMat = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + omics_data = drp.OmicsLoader(params) + expMat = omics_data.dfs['cancer_gene_expression.tsv'] + expMat = expMat.set_index(params['canc_col_name']) + + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = ( + params["author_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + gmt = pathway_path + tmp_str = params["ml_data_outdir"] + "/tmpdir_ssgsea/" + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea( + data=gct, # gct: a matrix of gene by sample + gene_sets=gmt, # gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, # 1000 + no_plot=True, + processes=params["cpu_int"], + # min_size=0, + format="png", + ) + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") + + f = open(tmp_str + "ssGSEA.txt", "r") + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") + +def run(params): + params = frm.build_paths(params) + frm.create_outdir(outdir=params["ml_data_outdir"]) + params = preprocess(params) + print("convert drug to bits.") + smile2bits(params) + print("compute DGnet.") + run_netpea(params, dtype="DGnet", multiply_expression=False) + print("compute MUTnet.") + run_netpea(params, dtype="MUTnet", multiply_expression=True) + print("compute CNVnet.") + run_netpea(params, dtype="CNVnet", multiply_expression=True) + print("compute EXP.") + run_ssgsea(params) + print("prepare final input file.") + prep_input(params) + + +def main(): + params = frm.initialize_parameters( + file_path, + default_model="PathDSP_default_model.txt", + additional_definitions=preprocess_params, + required=req_preprocess_args, + ) + run(params) + + +if __name__ == "__main__": + start = datetime.now() + main() + print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py new file mode 100644 index 0000000..25f32d9 --- /dev/null +++ b/PathDSP_train_improve.py @@ -0,0 +1,310 @@ +import candle +import os +import sys +import datetime +# IMPROVE/CANDLE imports +from improve import framework as frm +from improve.metrics import compute_metrics +#from model_utils.torch_utils import 
predicting +#import json +#from json import JSONEncoder +from PathDSP_preprocess_improve import cal_time, preprocess, model_preproc_params, app_preproc_params, preprocess_params + +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append("/usr/local/PathDSP/PathDSP") +#sys.path.append(os.getcwd() + "/PathDSP") +#import FNN_new +import os +import argparse +import numpy as np +import pandas as pd +import scipy.stats as scistat +from datetime import datetime + +import sklearn.preprocessing as skpre +import sklearn.model_selection as skms +import sklearn.metrics as skmts +import sklearn.utils as skut + +import torch as tch +import torch.utils.data as tchud + +import myModel as mynet +import myDataloader as mydl +import myUtility as myutil +import polars as pl + +file_path = os.path.dirname(os.path.realpath(__file__)) + +# [Req] List of metrics names to be compute performance scores +metrics_list = ["mse", "rmse", "pcc", "scc", "r2"] + +# Currently, there are no app-specific args for the train script. +app_train_params = [] + +# [PathDSP] Model-specific params (Model: PathDSP) +model_train_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1."}, + {"name": "learning_rate", + "type": float, + "default": 0.0001, + "help": "Learning rate for the optimizer." + }, + +] + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + + + +def predicting(model, device, data_loader): + """ Method to make predictions/inference. + This is used in *train.py and *infer.py + + Parameters + ---------- + model : pytorch model + Model to evaluate. + device : string + Identifier for hardware that will be used to evaluate model. + data_loader : pytorch data loader. + Object to load data to evaluate. + + Returns + ------- + total_labels: numpy array + Array with ground truth. + total_preds: numpy array + Array with inferred outputs. + """ + model.to(device) + model.eval() + total_preds = tch.Tensor() + total_labels = tch.Tensor() + print("Make prediction for {} samples...".format(len(data_loader.dataset))) + with tch.no_grad(): + for i, (data_x, data_y) in enumerate(data_loader): + data_x, data_y = data_x.to(device), data_y.to(device) + data_y_pred = model(data_x) + # Is this computationally efficient? 
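# Aside, on the question above: calling tch.cat on every batch re-allocates and
# copies the growing accumulator, so total copying grows quadratically with the
# number of batches. A common alternative (illustration only, not part of this
# function) is to collect per-batch outputs in Python lists and concatenate once:
#   preds, labels = [], []
#   for data_x, data_y in data_loader:
#       preds.append(model(data_x.to(device)).cpu())
#       labels.append(data_y.view(-1, 1).cpu())
#   total_preds = tch.cat(preds, 0)
#   total_labels = tch.cat(labels, 0)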
+ total_preds = tch.cat((total_preds, data_y_pred.cpu()), 0) # preds to tensor + total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor + return total_labels.numpy().flatten(), total_preds.numpy().flatten() + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["model_outdir"] + "/checkpoint.pt") # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # train result + train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + + # calculate total loss of all batches, and append 
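# Aside: myUtility.EarlyStopping is a helper module not included in this diff. Its
# usage in fit(), constructed with a patience value and a checkpoint path, called
# each epoch with the average validation loss and the model, and exposing an
# .early_stop flag, matches the common patience-based pattern. A minimal sketch of
# that pattern, assuming lower validation loss is better:

import torch as tch

class EarlyStoppingSketch:
    def __init__(self, patience=30, path="checkpoint.pt", verbose=False):
        self.patience = patience
        self.path = path
        self.verbose = verbose
        self.best_loss = float("inf")
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            # validation loss improved: reset the counter and checkpoint the weights
            self.best_loss = val_loss
            self.counter = 0
            tch.save(model.state_dict(), self.path)
            if self.verbose:
                print(f"validation loss improved to {val_loss:.5f}; checkpoint saved")
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True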
to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load(params["model_outdir"] + '/checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + + +def run(params): + frm.create_outdir(outdir=params["model_outdir"]) + modelpath = frm.build_model_path(params, model_dir=params["model_outdir"]) + train_data_fname = frm.build_ml_data_name(params, stage="train") + val_data_fname = frm.build_ml_data_name(params, stage="val") + params = preprocess(params) + + # set parameters + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + val_batch = params['val_batch'] + opt_fn = tch.optim.Adam + + # ------------------------------------------------------ + # [PathDSP] Prepare dataloaders + # ------------------------------------------------------ + print('loadinig data') + train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() + val_df = pl.read_csv(params['val_data'], separator = "\t").to_pandas() + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=val_batch, shuffle=False) + + # ------------------------------------------------------ + # [PathDSP] Prepare model + # ------------------------------------------------------ + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + net.apply(init_weights) + + # ------------------------------------------------------ + # [PathDSP] Training + # ------------------------------------------------------ + print('start training process') + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn, params) + + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + loss_df.to_csv(params['model_outdir'] + 
'/Val_Loss_orig.txt', header=True, index=False, sep="\t") + + # make train/valid loss plots + best_model = trained_net + tch.save(best_model.state_dict(), modelpath) + best_model.eval() + # Compute predictions + val_true, val_pred = predicting(best_model, device, data_loader=valid_dl) # (groud truth), (predictions) + + # ----------------------------- + # [Req] Save raw predictions in dataframe + # ----------------------------- + # import ipdb; ipdb.set_trace() + frm.store_predictions_df( + params, y_true=val_true, y_pred=val_pred, stage="val", + outdir=params["model_outdir"] + ) + + # ----------------------------- + # [Req] Compute performance scores + # ----------------------------- + # import ipdb; ipdb.set_trace() + val_scores = frm.compute_performace_scores( + params, y_true=val_true, y_pred=val_pred, stage="val", + outdir=params["model_outdir"], metrics=metrics_list + ) + return val_scores + + +def main(): + additional_definitions = model_preproc_params + \ + model_train_params + \ + app_train_params + params = frm.initialize_parameters( + file_path, + default_model="PathDSP_default_model.txt", + additional_definitions=additional_definitions, + required=None, + ) + val_scores = run(params) + + +if __name__ == "__main__": + start = datetime.now() + main() + print("[Training finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/README.md b/README.md index d8905e7..6adfdae 100644 --- a/README.md +++ b/README.md @@ -1,61 +1,49 @@ # PathDSP Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores -# Example usage with singularity container -Setup Singularity +# Download benchmark data -``` -git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git -cd Singularity -./setup -source config/improve.env -``` - -Build Singularity from definition file +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ ``` -singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data ``` -Perform preprocessing step using processed data from original paper - -``` -singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0" -``` +Benchmarmakr data will be downladed under `process_dir/csa_data/` -Alternatively, perform preprocessing step using raw data from IMPROVE project +# Download author data ``` -singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1" +mkdir author_data +cd author_data +wget https://zenodo.org/record/6093818/files/MSigdb.zip +wget https://zenodo.org/record/6093818/files/raw_data.zip +wget https://zenodo.org/record/6093818/files/STRING.zip +unzip MSigdb.zip +unzip raw_data.zip +unzip STRING.zip ``` -Train the model - -``` -singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir -``` - -Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` -Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` - -Perform inference on the testing data - -``` -singularity exec --nv --pwd /usr/local/PathDSP/ --bind 
${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir -``` - -Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` -Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` +Author data will be downloaded under `process_dir/author_data/` # Example usage with Conda -Download PathDSP +Download PathDSP and IMPROVE ``` +cd ../ +mkdir repo +cd repo git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git cd PathDSP ``` +PathDSP will be installed at `process_dir/repo/PathDSP` +IMPROVE will be installed at `process_dir/repo/IMPROVE` + Create environment ``` @@ -68,43 +56,48 @@ Activate environment conda activate PathDSP_env ``` -Intall CANDLE package +Install CANDLE package ``` pip install git+https://github.com/ECP-CANDLE/candle_lib@develop ``` -Perform preprocessing step using processed data from original paper +Define enviroment variabels ``` -export CUDA_VISIBLE_DEVICES=0 -export CANDLE_DATA_DIR=./Data/ -bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0" +improve_lib="/path/to/IMPROVE/repo/" +pathdsp_lib="/path/to/pathdsp/repo/" +# notice the extra PathDSP folder after pathdsp_lib +export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/export IMPROVE_DATA_DIR="/path/to/csa_data/" +export AUTHOR_DATA_DIR="/path/to/author_data/" ``` -Alternatively, perform preprocessing step using raw data from IMPROVE project +Perform preprocessing step ``` -bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1" +# go two upper level +cd ../../ +python repo/PathDSP/PathDSP_preprocess_improve.py ``` Train the model ``` -bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +python repo/PathDSP/PathDSP_train_improve.py ``` -Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt` -Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt` +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. Parameter definitions can be found at `process_dir/repo/PathDSP/PathDSP_default_model.txt` Perform inference on the testing data ``` -bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +python PathDSP_infer_improve.py ``` -Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` -Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + # Docs from original authors (below) @@ -131,4 +124,4 @@ Pathway enrichment scores for categorical data (i.e., mutation, copy number vari # Reference -Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. 
https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/README_old.md b/README_old.md new file mode 100644 index 0000000..84ea104 --- /dev/null +++ b/README_old.md @@ -0,0 +1,134 @@ +# PathDSP +Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores + +# Example usage with singularity container +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing step using processed data from original paper + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1" +``` + +Train the model + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` + +# Example usage with Conda + +Download PathDSP + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +cd PathDSP +``` + +Create environment + +``` +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Intall CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Perform preprocessing step using processed data from original paper + +``` +export CUDA_VISIBLE_DEVICES=0 +export CANDLE_DATA_DIR=./Data/ +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1" +``` + +Train the model + +``` +bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` + +# Docs from original authors (below) + +# Requirments + +# Input format + +|drug|cell|feature_1|....|feature_n|drug_response| +|----|----|--------|----|--------|----| +|5-FU|03|0|....|0.02|-2.3| +|5-FU|23|1|....|0.04|-3.4| + +Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing +# Usage: +```python +# run FNN +python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix + +Where 
input.txt should be in the input format shown above. +Example input file can be found at https://zenodo.org/record/7532963 +``` +# Data preprocessing +Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby + + +# Reference +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/preprocess.sh b/preprocess.sh index a7a435d..ab0de90 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -11,7 +11,7 @@ ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=preprocess_new.py +CANDLE_MODEL=preprocess_improve.py if [ $# -lt 2 ] ; then echo "Illegal number of parameters" From d8e722253be616ced5c09697033968bbc901b382 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 3 Jan 2024 09:38:22 -0800 Subject: [PATCH 075/254] update csa --- PathDSP_cs_model.txt | 46 ++++++ PathDSP_default_model.txt | 24 +-- PathDSP_infer_improve.py | 8 +- PathDSP_preprocess_improve.py | 20 +-- PathDSP_train_improve.py | 47 +++++- README.md | 5 +- csa_wf_v3.py | 292 ++++++++++++++++++++++++++++++++++ csa_workflow_params.txt | 8 + 8 files changed, 417 insertions(+), 33 deletions(-) create mode 100644 PathDSP_cs_model.txt create mode 100644 csa_wf_v3.py create mode 100644 csa_workflow_params.txt diff --git a/PathDSP_cs_model.txt b/PathDSP_cs_model.txt new file mode 100644 index 0000000..44a4791 --- /dev/null +++ b/PathDSP_cs_model.txt @@ -0,0 +1,46 @@ +[Global_Params] +model_name='PathDSP' + +[Preprocess] +train_split_file = "GDSCv1_split_0_train.txt" +val_split_file = "GDSCv1_split_0_val.txt" +test_split_file = "GDSCv1_split_0_test.txt" +ml_data_outdir = "./ml_data/GDSCv1-GDSCv1/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] +data_format = ".txt" +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +bit_int=128 +permutation_int=3 +seed_int=42 +cpu_int=20 + +[Train] +train_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_outdir = "./out_models/GDSCv1/split_0" +model_file_name = "model" +model_file_format = ".pt" +epochs=800 +batch_size = 32 +val_batch = 32 +loss = "mse" +early_stop_metric = "mse" +patience = 30 +cuda_name = "cuda:2" +learning_rate = 0.001 + +[Infer] +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_batch = 256 +cuda_name = "cuda:3" \ No newline at end of file diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 5951ee3..a059aae 100644 --- a/PathDSP_default_model.txt +++ 
b/PathDSP_default_model.txt @@ -2,10 +2,10 @@ model_name='PathDSP' [Preprocess] -train_split_file = "gCSI_split_0_train.txt" -val_split_file = "gCSI_split_0_val.txt" -test_split_file = "gCSI_split_0_test.txt" -ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" +train_split_file = "GDSCv1_split_4_train.txt" +val_split_file = "GDSCv1_split_4_val.txt" +test_split_file = "GDSCv1_split_4_test.txt" +ml_data_outdir = "./ml_data/GDSCv1-GDSCv1/split_4" x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] y_data_files = [["response.tsv"]] @@ -21,14 +21,14 @@ seed_int=42 cpu_int=20 [Train] -train_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" -val_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" -model_outdir = "./out_models/gCSI/split_0" +train_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" +val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" +model_outdir = "./out_models/GDSCv1/split_4" model_file_name = "model" model_file_format = ".pt" epochs=800 -batch_size = 32 -val_batch = 32 +batch_size = 12 +val_batch = 12 loss = "mse" early_stop_metric = "mse" patience = 30 @@ -36,8 +36,8 @@ cuda_name = "cuda:2" learning_rate = 0.001 [Infer] -test_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" -model_dir = "./out_models/gCSI/split_0" -infer_outdir = "./out_infer/gCSI-gCSI/split_0" +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" +model_dir = "./out_models/GDSCv1/split_4" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_4" test_batch = 256 cuda_name = "cuda:3" \ No newline at end of file diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index b91c8fa..1f9aeeb 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -43,7 +43,8 @@ def run(params): frm.create_outdir(outdir=params["infer_outdir"]) params = preprocess(params) - test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + test_data_fname = frm.build_ml_data_name(params, stage="test") + test_df = pl.read_csv(params["test_ml_data_dir"] + "/" + test_data_fname, separator = "\t").to_pandas() Xtest_arr = test_df.iloc[:, 0:-1].values ytest_arr = test_df.iloc[:, -1].values Xtest_arr = np.array(Xtest_arr).astype('float32') @@ -69,7 +70,7 @@ def run(params): print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) return test_scores -def main(): +def main(args): additional_definitions = model_preproc_params + \ model_train_params + \ model_infer_params + \ @@ -77,6 +78,7 @@ def main(): params = frm.initialize_parameters( file_path, default_model="PathDSP_default_model.txt", + #default_model="PathDSP_cs_model.txt", additional_definitions=additional_definitions, required=None, ) @@ -85,4 +87,4 @@ def main(): if __name__ == "__main__": - main() + main(sys.argv[1:]) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 1fd31f9..43afbd7 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -17,6 +17,7 @@ import NetPEA as pea import gseapy as gp import sklearn.model_selection as skms +from sklearn.preprocessing import StandardScaler file_path = Path(__file__).resolve().parent @@ -119,14 +120,8 @@ def mkdir(directory): def preprocess(params): - params["train_data"] = frm.build_ml_data_name(params, 'train') - params["val_data"] = frm.build_ml_data_name(params, 'val') - params["test_data"] = frm.build_ml_data_name(params, 'test') params["author_data_dir"] = os.getenv("AUTHOR_DATA_DIR") for 
i in [ - "train_data", - "test_data", - "val_data", "drug_bits_file", "dgnet_file", "mutnet_file", @@ -420,10 +415,14 @@ def prep_input(params): .join(drug_data, on="drug_id") .join(sample_data, on="sample_id") ) - comb_data_mtx["response"] = response_df[params["y_col_name"]].values + ss = StandardScaler() + comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]] = ss.fit_transform(comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]]) + ## add 0.01 to avoid possible inf values + comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() pl.from_pandas(comb_data_mtx).write_csv( - params[i + "_data"], separator="\t", has_header=True + params["ml_data_outdir"] + "/" + frm.build_ml_data_name(params, i) +, separator="\t", has_header=True ) @@ -502,10 +501,11 @@ def run(params): prep_input(params) -def main(): +def main(args): params = frm.initialize_parameters( file_path, default_model="PathDSP_default_model.txt", + #default_model="PathDSP_cs_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args, ) @@ -514,5 +514,5 @@ def main(): if __name__ == "__main__": start = datetime.now() - main() + main(sys.argv[1:]) print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 25f32d9..49c584c 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -102,6 +102,39 @@ def predicting(model, device, data_loader): total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor return total_labels.numpy().flatten(), total_preds.numpy().flatten() +def predict(net, device, test_dl): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + true_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + true_list.append(y_test.cpu().numpy()) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + true_list = np.vstack(true_list) + true_list = np.hstack(true_list).tolist() + # return + return true_list, prediction_list + + + def r2_score(y_true, y_pred): y_mean = np.mean(y_true) ss_tot = np.sum((y_true - y_mean)**2) @@ -223,8 +256,8 @@ def run(params): # [PathDSP] Prepare dataloaders # ------------------------------------------------------ print('loadinig data') - train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() - val_df = pl.read_csv(params['val_data'], separator = "\t").to_pandas() + train_df = pl.read_csv(params["train_ml_data_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() + val_df = pl.read_csv(params["val_ml_data_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() Xtrain_arr = train_df.iloc[:, 0:-1].values Xvalid_arr = val_df.iloc[:, 0:-1].values ytrain_arr = train_df.iloc[:, -1].values @@ -267,9 +300,10 @@ def init_weights(m): # make train/valid loss plots best_model = trained_net tch.save(best_model.state_dict(), modelpath) - 
best_model.eval() + #best_model.eval() # Compute predictions - val_true, val_pred = predicting(best_model, device, data_loader=valid_dl) # (groud truth), (predictions) + #val_true, val_pred = predicting(best_model, device, valid_dl) # (groud truth), (predictions) + val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) # ----------------------------- # [Req] Save raw predictions in dataframe @@ -291,13 +325,14 @@ def init_weights(m): return val_scores -def main(): +def main(args): additional_definitions = model_preproc_params + \ model_train_params + \ app_train_params params = frm.initialize_parameters( file_path, default_model="PathDSP_default_model.txt", + #default_model="PathDSP_cs_model.txt", additional_definitions=additional_definitions, required=None, ) @@ -306,5 +341,5 @@ def main(): if __name__ == "__main__": start = datetime.now() - main() + main(sys.argv[1:]) print("[Training finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/README.md b/README.md index 6adfdae..bac8978 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,8 @@ Define enviroment variabels improve_lib="/path/to/IMPROVE/repo/" pathdsp_lib="/path/to/pathdsp/repo/" # notice the extra PathDSP folder after pathdsp_lib -export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/export IMPROVE_DATA_DIR="/path/to/csa_data/" +export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ +export IMPROVE_DATA_DIR="/path/to/csa_data/" export AUTHOR_DATA_DIR="/path/to/author_data/" ``` @@ -92,7 +93,7 @@ Final trained model is located at: `${train_ml_data_dir}/model.pt`. Parameter de Perform inference on the testing data ``` -python PathDSP_infer_improve.py +python repo/PathDSP/PathDSP_infer_improve.py ``` Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` diff --git a/csa_wf_v3.py b/csa_wf_v3.py new file mode 100644 index 0000000..066b726 --- /dev/null +++ b/csa_wf_v3.py @@ -0,0 +1,292 @@ +""" Python implementation of cross-study analysis workflow """ +# cuda_name = "cuda:6" +cuda_name = "cuda:7" + +import os +import subprocess +import warnings +from time import time +from pathlib import Path + +import pandas as pd + +# IMPROVE imports +from improve import framework as frm +# import improve_utils +# from improve_utils import improve_globals as ig + +# GraphDRP imports +# TODO: change this for your model +import PathDSP_preprocess_improve +import PathDSP_train_improve +import PathDSP_preprocess_improve + +# from ap_utils.classlogger import Logger +# from ap_utils.utils import get_print_func, Timer + + +class Timer: + """ Measure time. """ + def __init__(self): + self.start = time() + + def timer_end(self): + self.end = time() + return self.end - self.start + + def display_timer(self, print_fn=print): + time_diff = self.timer_end() + if time_diff // 3600 > 0: + print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) ) + else: + print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) ) + + +fdir = Path(__file__).resolve().parent + +y_col_name = "auc" +# y_col_name = "auc1" + +maindir = Path(f"./{y_col_name}") +MAIN_ML_DATA_DIR = Path(f"./{maindir}/ml.data") +MAIN_MODEL_DIR = Path(f"./{maindir}/models") +MAIN_INFER_OUTDIR = Path(f"./{maindir}/infer") + +# Check that environment variable "IMPROVE_DATA_DIR" has been specified +if os.getenv("IMPROVE_DATA_DIR") is None: + raise Exception("ERROR ! Required system variable not specified. \ + You must define IMPROVE_DATA_DIR ... 
Exiting.\n") +os.environ["CANDLE_DATA_DIR"] = os.environ["IMPROVE_DATA_DIR"] + +params = frm.initialize_parameters( + fdir, + default_model="csa_workflow_params.txt", +) + +main_datadir = Path(os.environ["IMPROVE_DATA_DIR"]) +raw_datadir = main_datadir / params["raw_data_dir"] +x_datadir = raw_datadir / params["x_data_dir"] +y_datadir = raw_datadir / params["y_data_dir"] +splits_dir = raw_datadir / params["splits_dir"] + +# lg = Logger(main_datadir/"csa.log") +print_fn = print +# print_fn = get_print_func(lg.logger) +print_fn(f"File path: {fdir}") + +### Source and target data sources +## Set 1 - full analysis +# source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["GDSCv1", "CTRPv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +source_datasets = ["GDSCv1"] +target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + +# only_cross_study = False +only_cross_study = True + + +## Splits +# split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 9] + +## Parameters of the experiment/run/workflow +# TODO: this should be stored as the experiment metadata that we can go back check +# epochs = 2 +# epochs = 30 +# epochs = 50 +epochs = 70 +# epochs = 100 +# epochs = 150 +# config_file_name = "csa_params.txt" +# config_file_path = fdir/config_file_name + +def build_split_fname(source, split, phasea): + """ Build split file name. If file does not exist continue """ + return f"{source_data_name}_split_{split}_{phase}.txt" + +# =============================================================== +### Generate CSA results (within- and cross-study) +# =============================================================== + +timer = Timer() +# Iterate over source datasets +# Note! The "source_data_name" iterations are independent of each other +print_fn(f"\nsource_datasets: {source_datasets}") +print_fn(f"target_datasets: {target_datasets}") +print_fn(f"split_nums: {split_nums}") +# import pdb; pdb.set_trace() +for source_data_name in source_datasets: + + # Get the split file paths + # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt + if len(split_nums) == 0: + # Get all splits + split_files = list((splits_dir).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + # num_splits = 1 + else: + # Use the specified splits + split_files = [] + for s in split_nums: + split_files.extend(list((splits_dir).glob(f"{source_data_name}_split_{s}_*.txt"))) + + files_joined = [str(s) for s in split_files] + + # -------------------- + # Preprocess and Train + # -------------------- + # import pdb; pdb.set_trace() + for split in split_nums: + print_fn(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. 
Otherwise, continue to the next split. + # split = 11 + # files_joined = [str(s) for s in split_files] + # TODO: check this! + for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + # print(f"{phase}: {fname}") + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + # import pdb; pdb.set_trace() + for target_data_name in target_datasets: + if only_cross_study and (source_data_name == target_data_name): + continue # only cross-study + print_fn(f"\nSource data: {source_data_name}") + print_fn(f"Target data: {target_data_name}") + + # EXP_ML_DATA_DIR = ig.ml_data_dir/f"{source_data_name}-{target_data_name}"/f"split_{split}" + ml_data_outdir = MAIN_ML_DATA_DIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # p1 (none): Preprocess train data + # import pdb; pdb.set_trace() + # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # TODO: placeholder for lc analysis + timer_preprocess = Timer() + # ml_data_path = graphdrp_preprocess_improve.main([ + # "--train_split_file", f"{source_data_name}_split_{split}_train.txt", + # "--val_split_file", f"{source_data_name}_split_{split}_val.txt", + # "--test_split_file", str(test_split_file_name), + # "--ml_data_outdir", str(ml_data_outdir), + # "--y_col_name", y_col_name + # ]) + print_fn("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + # test_split_file = f"{source_data_name}_split_{split}_test.txt" + print_fn(f"train_split_file: {train_split_file}") + print_fn(f"val_split_file: {val_split_file}") + print_fn(f"test_split_file: {test_split_file}") + print_fn(f"ml_data_outdir: {ml_data_outdir}") + # import pdb; pdb.set_trace() + preprocess_run = ["python", + "PathDSP_preprocess_improve.py", + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--ml_data_outdir", str(ml_data_outdir), + "--y_col_name", str(y_col_name) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_preprocess.display_timer(print_fn) + + # p2 (p1): Train model + # Train a single model for a given [source, split] pair + # Train using train samples and early stop using val samples + # import pdb; pdb.set_trace() + model_outdir = MAIN_MODEL_DIR/f"{source_data_name}"/f"split_{split}" + if model_outdir.exists() is False: + train_ml_data_dir = ml_data_outdir + val_ml_data_dir = ml_data_outdir + timer_train = Timer() + # graphdrp_train_improve.main([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir), + # "--epochs", str(epochs), # available in config_file + # # "--ckpt_directory", str(MODEL_OUTDIR), # TODO: we'll use candle known param ckpt_directory instead of model_outdir + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nTrain") + print_fn(f"train_ml_data_dir: {train_ml_data_dir}") + 
print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"model_outdir: {model_outdir}") + # import pdb; pdb.set_trace() + train_run = ["python", + "PathDSP_train_improve.py", + "--train_ml_data_dir", str(train_ml_data_dir), + "--val_ml_data_dir", str(val_ml_data_dir), + "--model_outdir", str(model_outdir), + "--epochs", str(epochs), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_train.display_timer(print_fn) + + # Infer + # p3 (p1, p2): Inference + # import pdb; pdb.set_trace() + test_ml_data_dir = ml_data_outdir + model_dir = model_outdir + infer_outdir = MAIN_INFER_OUTDIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + timer_infer = Timer() + # graphdrp_infer_improve.main([ + # "--test_ml_data_dir", str(test_ml_data_dir), + # "--model_dir", str(model_dir), + # "--infer_outdir", str(infer_outdir), + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nInfer") + print_fn(f"test_ml_data_dir: {test_ml_data_dir}") + #print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"infer_outdir: {infer_outdir}") + # import pdb; pdb.set_trace() + infer_run = ["python", + "PathDSP_infer_improve.py", + "--test_ml_data_dir", str(test_ml_data_dir), + "--model_dir", str(model_dir), + "--infer_outdir", str(infer_outdir), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(infer_run, capture_output=True, + text=True, check=True) + timer_infer.display_timer(print_fn) + +timer.display_timer(print_fn) +print_fn("Finished a full cross-study run.") diff --git a/csa_workflow_params.txt b/csa_workflow_params.txt new file mode 100644 index 0000000..66a69f8 --- /dev/null +++ b/csa_workflow_params.txt @@ -0,0 +1,8 @@ +[Global_Params] +model_name = "CSA_workflow" + +[CSA_Workflow] +raw_data_dir = "raw_data" +x_data_dir = "x_data" +y_data_dir = "y_data" +splits_dir = "splits" From ea88f9f0039f65f56ea58e353118d96766edb5bc Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Wed, 3 Jan 2024 11:53:14 -0600 Subject: [PATCH 076/254] update to csa study (#9) * update preprocess script * update preprocess script * add improve_utils script * add nea scripts * update params * add gitignore * EXP processing * updated to integrate with prep_input * add definition file * update .gitignore * update filename for ssGSEA * add FNN_new * add train/infer * update params * add .yml * update params * update conda path * fix conda * update preprocess.sh * update preprocess.sh * update preprocess_new.py * update env * update preproce_new.py * update preproce_new.py * update files * update params * fix params * update preproce_new.py * update preprocess_new.py * update preprocess_new.py * update file * update file * update file * update script * add def * add script * update file * update FNN_new * update FNN * update params * fix param * fix bug * add time * update def * update yml * update train.sh * update train.sh * update train.py * update train * fix bug * update file * update file * use polars * update files * update preprocess * update infer.sh * process author data * fix args * add infer.sh * update doc * fix path * fix conda * use improve repo * use improve module * update readme * update csa --------- Co-authored-by: willherbert27 --- PathDSP_cs_model.txt | 46 ++++++ PathDSP_default_model.txt | 24 +-- PathDSP_infer_improve.py | 8 +- PathDSP_preprocess_improve.py | 20 +-- PathDSP_train_improve.py | 46 +++++- README.md | 5 +- 
csa_wf_v3.py | 292 ++++++++++++++++++++++++++++++++++ csa_workflow_params.txt | 8 + 8 files changed, 416 insertions(+), 33 deletions(-) create mode 100644 PathDSP_cs_model.txt create mode 100644 csa_wf_v3.py create mode 100644 csa_workflow_params.txt diff --git a/PathDSP_cs_model.txt b/PathDSP_cs_model.txt new file mode 100644 index 0000000..44a4791 --- /dev/null +++ b/PathDSP_cs_model.txt @@ -0,0 +1,46 @@ +[Global_Params] +model_name='PathDSP' + +[Preprocess] +train_split_file = "GDSCv1_split_0_train.txt" +val_split_file = "GDSCv1_split_0_val.txt" +test_split_file = "GDSCv1_split_0_test.txt" +ml_data_outdir = "./ml_data/GDSCv1-GDSCv1/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] +data_format = ".txt" +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +bit_int=128 +permutation_int=3 +seed_int=42 +cpu_int=20 + +[Train] +train_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_outdir = "./out_models/GDSCv1/split_0" +model_file_name = "model" +model_file_format = ".pt" +epochs=800 +batch_size = 32 +val_batch = 32 +loss = "mse" +early_stop_metric = "mse" +patience = 30 +cuda_name = "cuda:2" +learning_rate = 0.001 + +[Infer] +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_batch = 256 +cuda_name = "cuda:3" \ No newline at end of file diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 5951ee3..a059aae 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -2,10 +2,10 @@ model_name='PathDSP' [Preprocess] -train_split_file = "gCSI_split_0_train.txt" -val_split_file = "gCSI_split_0_val.txt" -test_split_file = "gCSI_split_0_test.txt" -ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" +train_split_file = "GDSCv1_split_4_train.txt" +val_split_file = "GDSCv1_split_4_val.txt" +test_split_file = "GDSCv1_split_4_test.txt" +ml_data_outdir = "./ml_data/GDSCv1-GDSCv1/split_4" x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] y_data_files = [["response.tsv"]] @@ -21,14 +21,14 @@ seed_int=42 cpu_int=20 [Train] -train_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" -val_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" -model_outdir = "./out_models/gCSI/split_0" +train_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" +val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" +model_outdir = "./out_models/GDSCv1/split_4" model_file_name = "model" model_file_format = ".pt" epochs=800 -batch_size = 32 -val_batch = 32 +batch_size = 12 +val_batch = 12 loss = "mse" early_stop_metric = "mse" patience = 30 @@ -36,8 +36,8 @@ cuda_name = "cuda:2" learning_rate = 0.001 [Infer] -test_ml_data_dir = "./ml_data/gCSI-gCSI/split_0" -model_dir = "./out_models/gCSI/split_0" -infer_outdir = "./out_infer/gCSI-gCSI/split_0" +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" +model_dir = "./out_models/GDSCv1/split_4" +infer_outdir = 
"./out_infer/GDSCv1-GDSCv1/split_4" test_batch = 256 cuda_name = "cuda:3" \ No newline at end of file diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index b91c8fa..1f9aeeb 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -43,7 +43,8 @@ def run(params): frm.create_outdir(outdir=params["infer_outdir"]) params = preprocess(params) - test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + test_data_fname = frm.build_ml_data_name(params, stage="test") + test_df = pl.read_csv(params["test_ml_data_dir"] + "/" + test_data_fname, separator = "\t").to_pandas() Xtest_arr = test_df.iloc[:, 0:-1].values ytest_arr = test_df.iloc[:, -1].values Xtest_arr = np.array(Xtest_arr).astype('float32') @@ -69,7 +70,7 @@ def run(params): print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) return test_scores -def main(): +def main(args): additional_definitions = model_preproc_params + \ model_train_params + \ model_infer_params + \ @@ -77,6 +78,7 @@ def main(): params = frm.initialize_parameters( file_path, default_model="PathDSP_default_model.txt", + #default_model="PathDSP_cs_model.txt", additional_definitions=additional_definitions, required=None, ) @@ -85,4 +87,4 @@ def main(): if __name__ == "__main__": - main() + main(sys.argv[1:]) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 1fd31f9..43afbd7 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -17,6 +17,7 @@ import NetPEA as pea import gseapy as gp import sklearn.model_selection as skms +from sklearn.preprocessing import StandardScaler file_path = Path(__file__).resolve().parent @@ -119,14 +120,8 @@ def mkdir(directory): def preprocess(params): - params["train_data"] = frm.build_ml_data_name(params, 'train') - params["val_data"] = frm.build_ml_data_name(params, 'val') - params["test_data"] = frm.build_ml_data_name(params, 'test') params["author_data_dir"] = os.getenv("AUTHOR_DATA_DIR") for i in [ - "train_data", - "test_data", - "val_data", "drug_bits_file", "dgnet_file", "mutnet_file", @@ -420,10 +415,14 @@ def prep_input(params): .join(drug_data, on="drug_id") .join(sample_data, on="sample_id") ) - comb_data_mtx["response"] = response_df[params["y_col_name"]].values + ss = StandardScaler() + comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]] = ss.fit_transform(comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]]) + ## add 0.01 to avoid possible inf values + comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() pl.from_pandas(comb_data_mtx).write_csv( - params[i + "_data"], separator="\t", has_header=True + params["ml_data_outdir"] + "/" + frm.build_ml_data_name(params, i) +, separator="\t", has_header=True ) @@ -502,10 +501,11 @@ def run(params): prep_input(params) -def main(): +def main(args): params = frm.initialize_parameters( file_path, default_model="PathDSP_default_model.txt", + #default_model="PathDSP_cs_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args, ) @@ -514,5 +514,5 @@ def main(): if __name__ == "__main__": start = datetime.now() - main() + main(sys.argv[1:]) print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 25f32d9..e7cbf20 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -102,6 +102,38 @@ def predicting(model, device, data_loader): 
total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor return total_labels.numpy().flatten(), total_preds.numpy().flatten() + +def predict(net, device, test_dl): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + true_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + true_list.append(y_test.cpu().numpy()) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + true_list = np.vstack(true_list) + true_list = np.hstack(true_list).tolist() + # return + return true_list, prediction_list + def r2_score(y_true, y_pred): y_mean = np.mean(y_true) ss_tot = np.sum((y_true - y_mean)**2) @@ -223,8 +255,8 @@ def run(params): # [PathDSP] Prepare dataloaders # ------------------------------------------------------ print('loadinig data') - train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() - val_df = pl.read_csv(params['val_data'], separator = "\t").to_pandas() + train_df = pl.read_csv(params["train_ml_data_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() + val_df = pl.read_csv(params["val_ml_data_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() Xtrain_arr = train_df.iloc[:, 0:-1].values Xvalid_arr = val_df.iloc[:, 0:-1].values ytrain_arr = train_df.iloc[:, -1].values @@ -267,9 +299,10 @@ def init_weights(m): # make train/valid loss plots best_model = trained_net tch.save(best_model.state_dict(), modelpath) - best_model.eval() + #best_model.eval() # Compute predictions - val_true, val_pred = predicting(best_model, device, data_loader=valid_dl) # (groud truth), (predictions) + #val_true, val_pred = predicting(best_model, device, valid_dl) # (groud truth), (predictions) + val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) # ----------------------------- # [Req] Save raw predictions in dataframe @@ -291,13 +324,14 @@ def init_weights(m): return val_scores -def main(): +def main(args): additional_definitions = model_preproc_params + \ model_train_params + \ app_train_params params = frm.initialize_parameters( file_path, default_model="PathDSP_default_model.txt", + #default_model="PathDSP_cs_model.txt", additional_definitions=additional_definitions, required=None, ) @@ -306,5 +340,5 @@ def main(): if __name__ == "__main__": start = datetime.now() - main() + main(sys.argv[1:]) print("[Training finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/README.md b/README.md index 6adfdae..bac8978 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,8 @@ Define enviroment variabels improve_lib="/path/to/IMPROVE/repo/" pathdsp_lib="/path/to/pathdsp/repo/" # notice the extra PathDSP folder after pathdsp_lib -export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/export IMPROVE_DATA_DIR="/path/to/csa_data/" +export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ +export IMPROVE_DATA_DIR="/path/to/csa_data/" export AUTHOR_DATA_DIR="/path/to/author_data/" ``` @@ 
-92,7 +93,7 @@ Final trained model is located at: `${train_ml_data_dir}/model.pt`. Parameter de Perform inference on the testing data ``` -python PathDSP_infer_improve.py +python repo/PathDSP/PathDSP_infer_improve.py ``` Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` diff --git a/csa_wf_v3.py b/csa_wf_v3.py new file mode 100644 index 0000000..066b726 --- /dev/null +++ b/csa_wf_v3.py @@ -0,0 +1,292 @@ +""" Python implementation of cross-study analysis workflow """ +# cuda_name = "cuda:6" +cuda_name = "cuda:7" + +import os +import subprocess +import warnings +from time import time +from pathlib import Path + +import pandas as pd + +# IMPROVE imports +from improve import framework as frm +# import improve_utils +# from improve_utils import improve_globals as ig + +# GraphDRP imports +# TODO: change this for your model +import PathDSP_preprocess_improve +import PathDSP_train_improve +import PathDSP_preprocess_improve + +# from ap_utils.classlogger import Logger +# from ap_utils.utils import get_print_func, Timer + + +class Timer: + """ Measure time. """ + def __init__(self): + self.start = time() + + def timer_end(self): + self.end = time() + return self.end - self.start + + def display_timer(self, print_fn=print): + time_diff = self.timer_end() + if time_diff // 3600 > 0: + print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) ) + else: + print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) ) + + +fdir = Path(__file__).resolve().parent + +y_col_name = "auc" +# y_col_name = "auc1" + +maindir = Path(f"./{y_col_name}") +MAIN_ML_DATA_DIR = Path(f"./{maindir}/ml.data") +MAIN_MODEL_DIR = Path(f"./{maindir}/models") +MAIN_INFER_OUTDIR = Path(f"./{maindir}/infer") + +# Check that environment variable "IMPROVE_DATA_DIR" has been specified +if os.getenv("IMPROVE_DATA_DIR") is None: + raise Exception("ERROR ! Required system variable not specified. \ + You must define IMPROVE_DATA_DIR ... 
Exiting.\n") +os.environ["CANDLE_DATA_DIR"] = os.environ["IMPROVE_DATA_DIR"] + +params = frm.initialize_parameters( + fdir, + default_model="csa_workflow_params.txt", +) + +main_datadir = Path(os.environ["IMPROVE_DATA_DIR"]) +raw_datadir = main_datadir / params["raw_data_dir"] +x_datadir = raw_datadir / params["x_data_dir"] +y_datadir = raw_datadir / params["y_data_dir"] +splits_dir = raw_datadir / params["splits_dir"] + +# lg = Logger(main_datadir/"csa.log") +print_fn = print +# print_fn = get_print_func(lg.logger) +print_fn(f"File path: {fdir}") + +### Source and target data sources +## Set 1 - full analysis +# source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["GDSCv1", "CTRPv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +source_datasets = ["GDSCv1"] +target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + +# only_cross_study = False +only_cross_study = True + + +## Splits +# split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 9] + +## Parameters of the experiment/run/workflow +# TODO: this should be stored as the experiment metadata that we can go back check +# epochs = 2 +# epochs = 30 +# epochs = 50 +epochs = 70 +# epochs = 100 +# epochs = 150 +# config_file_name = "csa_params.txt" +# config_file_path = fdir/config_file_name + +def build_split_fname(source, split, phasea): + """ Build split file name. If file does not exist continue """ + return f"{source_data_name}_split_{split}_{phase}.txt" + +# =============================================================== +### Generate CSA results (within- and cross-study) +# =============================================================== + +timer = Timer() +# Iterate over source datasets +# Note! The "source_data_name" iterations are independent of each other +print_fn(f"\nsource_datasets: {source_datasets}") +print_fn(f"target_datasets: {target_datasets}") +print_fn(f"split_nums: {split_nums}") +# import pdb; pdb.set_trace() +for source_data_name in source_datasets: + + # Get the split file paths + # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt + if len(split_nums) == 0: + # Get all splits + split_files = list((splits_dir).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + # num_splits = 1 + else: + # Use the specified splits + split_files = [] + for s in split_nums: + split_files.extend(list((splits_dir).glob(f"{source_data_name}_split_{s}_*.txt"))) + + files_joined = [str(s) for s in split_files] + + # -------------------- + # Preprocess and Train + # -------------------- + # import pdb; pdb.set_trace() + for split in split_nums: + print_fn(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. 
Otherwise, continue to the next split. + # split = 11 + # files_joined = [str(s) for s in split_files] + # TODO: check this! + for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + # print(f"{phase}: {fname}") + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + # import pdb; pdb.set_trace() + for target_data_name in target_datasets: + if only_cross_study and (source_data_name == target_data_name): + continue # only cross-study + print_fn(f"\nSource data: {source_data_name}") + print_fn(f"Target data: {target_data_name}") + + # EXP_ML_DATA_DIR = ig.ml_data_dir/f"{source_data_name}-{target_data_name}"/f"split_{split}" + ml_data_outdir = MAIN_ML_DATA_DIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # p1 (none): Preprocess train data + # import pdb; pdb.set_trace() + # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # TODO: placeholder for lc analysis + timer_preprocess = Timer() + # ml_data_path = graphdrp_preprocess_improve.main([ + # "--train_split_file", f"{source_data_name}_split_{split}_train.txt", + # "--val_split_file", f"{source_data_name}_split_{split}_val.txt", + # "--test_split_file", str(test_split_file_name), + # "--ml_data_outdir", str(ml_data_outdir), + # "--y_col_name", y_col_name + # ]) + print_fn("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + # test_split_file = f"{source_data_name}_split_{split}_test.txt" + print_fn(f"train_split_file: {train_split_file}") + print_fn(f"val_split_file: {val_split_file}") + print_fn(f"test_split_file: {test_split_file}") + print_fn(f"ml_data_outdir: {ml_data_outdir}") + # import pdb; pdb.set_trace() + preprocess_run = ["python", + "PathDSP_preprocess_improve.py", + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--ml_data_outdir", str(ml_data_outdir), + "--y_col_name", str(y_col_name) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_preprocess.display_timer(print_fn) + + # p2 (p1): Train model + # Train a single model for a given [source, split] pair + # Train using train samples and early stop using val samples + # import pdb; pdb.set_trace() + model_outdir = MAIN_MODEL_DIR/f"{source_data_name}"/f"split_{split}" + if model_outdir.exists() is False: + train_ml_data_dir = ml_data_outdir + val_ml_data_dir = ml_data_outdir + timer_train = Timer() + # graphdrp_train_improve.main([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir), + # "--epochs", str(epochs), # available in config_file + # # "--ckpt_directory", str(MODEL_OUTDIR), # TODO: we'll use candle known param ckpt_directory instead of model_outdir + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nTrain") + print_fn(f"train_ml_data_dir: {train_ml_data_dir}") + 
print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"model_outdir: {model_outdir}") + # import pdb; pdb.set_trace() + train_run = ["python", + "PathDSP_train_improve.py", + "--train_ml_data_dir", str(train_ml_data_dir), + "--val_ml_data_dir", str(val_ml_data_dir), + "--model_outdir", str(model_outdir), + "--epochs", str(epochs), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_train.display_timer(print_fn) + + # Infer + # p3 (p1, p2): Inference + # import pdb; pdb.set_trace() + test_ml_data_dir = ml_data_outdir + model_dir = model_outdir + infer_outdir = MAIN_INFER_OUTDIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + timer_infer = Timer() + # graphdrp_infer_improve.main([ + # "--test_ml_data_dir", str(test_ml_data_dir), + # "--model_dir", str(model_dir), + # "--infer_outdir", str(infer_outdir), + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nInfer") + print_fn(f"test_ml_data_dir: {test_ml_data_dir}") + #print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"infer_outdir: {infer_outdir}") + # import pdb; pdb.set_trace() + infer_run = ["python", + "PathDSP_infer_improve.py", + "--test_ml_data_dir", str(test_ml_data_dir), + "--model_dir", str(model_dir), + "--infer_outdir", str(infer_outdir), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(infer_run, capture_output=True, + text=True, check=True) + timer_infer.display_timer(print_fn) + +timer.display_timer(print_fn) +print_fn("Finished a full cross-study run.") diff --git a/csa_workflow_params.txt b/csa_workflow_params.txt new file mode 100644 index 0000000..66a69f8 --- /dev/null +++ b/csa_workflow_params.txt @@ -0,0 +1,8 @@ +[Global_Params] +model_name = "CSA_workflow" + +[CSA_Workflow] +raw_data_dir = "raw_data" +x_data_dir = "x_data" +y_data_dir = "y_data" +splits_dir = "splits" From e74d203eb9f2e6242e774e54b888919fe2f8ac77 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 29 Feb 2024 13:41:23 -0800 Subject: [PATCH 077/254] add dropout --- PathDSP_default_model.txt | 3 ++- PathDSP_train_improve.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index a059aae..605b32c 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -26,7 +26,7 @@ val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" model_outdir = "./out_models/GDSCv1/split_4" model_file_name = "model" model_file_format = ".pt" -epochs=800 +epochs=500 batch_size = 12 val_batch = 12 loss = "mse" @@ -34,6 +34,7 @@ early_stop_metric = "mse" patience = 30 cuda_name = "cuda:2" learning_rate = 0.001 +dropout=0.1 [Infer] test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index e7cbf20..29cac69 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -53,7 +53,11 @@ "default": 0.0001, "help": "Learning rate for the optimizer." }, - + {"name": "dropout", + "type": float, + "default": 0.1, + "help": "Dropout rate for the optimizer." 
+ }, ] class RMSELoss(tch.nn.Module): @@ -282,6 +286,10 @@ def init_weights(m): # load model n_features = Xtrain_arr.shape[1] net = mynet.FNN(n_features) + ## specify dropout rate + for module in net.modules(): + if isinstance(module, tch.nn.Dropout): + module.p = params['dropout'] net.apply(init_weights) # ------------------------------------------------------ From aabd9aabf3ada6b9102abb99e2be128eb00c32f0 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 29 Feb 2024 14:51:08 -0800 Subject: [PATCH 078/254] update def --- PathDSP.def | 15 ++++++---- infer.sh | 4 +-- preprocess.sh | 82 +++++++++++++++++++++++++++++++-------------------- train.sh | 26 +--------------- 4 files changed, 61 insertions(+), 66 deletions(-) diff --git a/PathDSP.def b/PathDSP.def index 61e45c2..44fe114 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -11,9 +11,11 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime %environment PATH=$PATH:/usr/local/PathDSP - MODEL_DIR=/usr/local/PathDSP + IMPROVE_MODEL_DIR=/usr/local/PathDSP CANDLE_DATA_DIR=/candle_data_dir - + AUTHOR_DATA_DIR=/candle_data_dir + PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/:/usr/local/PathDSP/PathDSP/ + %post apt-get update -y apt-get install wget -y @@ -42,15 +44,16 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local + git clone https://github.com/JDACS4C-IMPROVE/IMPROVE.git git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git - git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git - export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/ cd PathDSP + # download conda /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ #/opt/conda/bin/conda activate PathDSP_env /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop - #cp *.sh /usr/local/bin - chmod a+x /usr/local/PathDSP/*.sh + cp *.sh /usr/local/bin + chmod a+x /usr/local/bin/*.sh + chmod a+x /usr/local/PathDSP/*.sh \ No newline at end of file diff --git a/infer.sh b/infer.sh index 19146ae..ef9eac3 100755 --- a/infer.sh +++ b/infer.sh @@ -1,7 +1,5 @@ #!/bin/bash -#!/bin/bash - ######################################################################### ### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. ######################################################################### @@ -12,7 +10,7 @@ # arg 3 CANDLE_CONFIG ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=infer.py +CANDLE_MODEL=PathDSP_infer_improve.py if [ $# -lt 2 ] ; then echo "Illegal number of parameters" diff --git a/preprocess.sh b/preprocess.sh index ab0de90..590905b 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -4,53 +4,71 @@ ### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. ######################################################################### +# arg 1 CANDLE_DATA_DIR +# arg 2 CANDLE_CONFIG -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG +### Path and Name to your CANDLEized model's main Python script### + +# e.g. CANDLE_MODEL=graphdrp_preprocess.py +CANDLE_MODEL_SCRIPT=PathDSP_preprocess_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +# Combine path and name and check if executable exists +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL_SCRIPT} +if [ ! 
-f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi -### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=preprocess_improve.py if [ $# -lt 2 ] ; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit + echo "Illegal number of parameters" + echo "CANDLE_DATA_DIR PARAMS are required" + exit -1 fi + + if [ $# -eq 2 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = $CMD" - -elif [ $# -ge 3 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - - # if original $3 is a file, set candle_config and passthrough $@ - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD" + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + CONFIG_FILE=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" + else + CMD="python ${CANDLE_MODEL} $@" + echo CMD=\"$CMD\" + fi + +elif [ $# -ge 3 ] ; then + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi -fi + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + fi +fi # Display runtime arguments -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" - +echo "running command ${CMD}" # Set up environmental variables and execute model echo "activating environment" #source /opt/conda/etc/profile.d/conda.sh diff --git a/train.sh b/train.sh index 165eac6..28f3d6e 100755 --- a/train.sh +++ b/train.sh @@ -5,7 +5,7 @@ # arg 3 CANDLE_CONFIG ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=train.py +CANDLE_MODEL=PathDSP_train_improve.py ### Set env if CANDLE_MODEL is not in same directory as this script IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} @@ -48,30 +48,6 @@ elif [ $# -ge 3 ] ; then fi fi -if [ -d ${CANDLE_DATA_DIR} ]; then - if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then - echo "using data from ${CANDLE_DATA_DIR}" - else - ./candle_glue.sh - echo "using original data placed in ${CANDLE_DATA_DIR}" - fi -fi - -export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} -FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" -echo $FULL_DATA_DIR - -if [ -d ${FULL_DATA_DIR} ]; then - if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then - echo "using data from ${FULL_DATA_DIR}" - else - ./candle_glue.sh - echo "using original data placed in ${FULL_DATA_DIR}" - fi -else - ./candle_glue.sh - echo "using original data placed in ${FULL_DATA_DIR}" -fi # Display runtime arguments echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" From e2273458a6fc2df6fbc589afd32497c43cebdbdc Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Sun, 3 Mar 2024 20:49:56 -0800 Subject: [PATCH 079/254] rebuild image --- PathDSP.def | 22 +++++------ PathDSP_default_model.txt | 2 +- README.md | 43 +++++++++++++++++++++ infer.sh | 80 ++++++++++++++++++++++----------------- train.sh | 71 ++++++++++++++++++---------------- 5 files changed, 138 insertions(+), 80 deletions(-) 
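For reference, the `preprocess.sh` wrapper reworked in the preceding patch supports two calling conventions: if its second argument names a file that exists inside `CANDLE_DATA_DIR`, that file is forwarded to `PathDSP_preprocess_improve.py` as `--config_file`; otherwise every argument after `CANDLE_DATA_DIR` is passed through unchanged. A minimal sketch of both modes (the config file name and output path below are illustrative, not part of this patch, and the config file must already sit inside the data directory):

```bash
# Config-file mode: $2 exists as a file under CANDLE_DATA_DIR, so the wrapper
# runs: python PathDSP_preprocess_improve.py --config_file my_model_config.txt
bash preprocess.sh /candle_data_dir my_model_config.txt

# Pass-through mode: $2 is not a file, so all remaining arguments are handed
# to PathDSP_preprocess_improve.py unchanged
bash preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/
```

The `train.sh` and `infer.sh` wrappers updated in this series point at `PathDSP_train_improve.py` and `PathDSP_infer_improve.py` in the same way.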
diff --git a/PathDSP.def b/PathDSP.def index 44fe114..b1c2154 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -10,18 +10,18 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime %environment - PATH=$PATH:/usr/local/PathDSP - IMPROVE_MODEL_DIR=/usr/local/PathDSP - CANDLE_DATA_DIR=/candle_data_dir - AUTHOR_DATA_DIR=/candle_data_dir - PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/:/usr/local/PathDSP/PathDSP/ + export PATH=$PATH:/usr/local/PathDSP + export IMPROVE_MODEL_DIR=/usr/local/PathDSP + export CANDLE_DATA_DIR=/candle_data_dir + export AUTHOR_DATA_DIR=/candle_data_dir + export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/:/usr/local/PathDSP/PathDSP/ %post apt-get update -y apt-get install wget -y apt-get install -y gnupg apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC apt-get install build-essential -y apt-get install git -y @@ -31,12 +31,12 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime # install gpu fix and clean up cd / chmod +x Singularity_gpu_fix.sh - ./Singularity_gpu_fix.sh + ./Singularity_gpu_fix.sh rm Singularity_gpu_fix.sh - # these three need to be compiled and linked to the cuda libs. - # at the moment, what works for me is to build these in a - # singularity shell in a sandbox with the --nv flag to singularity set. + # these three need to be compiled and linked to the cuda libs. + # at the moment, what works for me is to build these in a + # singularity shell in a sandbox with the --nv flag to singularity set. # create default internal candle_data_dir, map external candle_data_dir here @@ -45,7 +45,7 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local git clone https://github.com/JDACS4C-IMPROVE/IMPROVE.git - git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git + git clone -b develop https://github.com/Liuy12/PathDSP.git cd PathDSP # download conda diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 605b32c..42c71ed 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -32,7 +32,7 @@ val_batch = 12 loss = "mse" early_stop_metric = "mse" patience = 30 -cuda_name = "cuda:2" +cuda_name = "cuda:0" learning_rate = 0.001 dropout=0.1 diff --git a/README.md b/README.md index bac8978..15ec9d1 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,49 @@ python repo/PathDSP/PathDSP_infer_improve.py Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` +# Example usage with singularity container + +Download csa data benchmark data similar as mentioned above. Then download author data into the same directory as csa data. 
+ +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing using csa benchmarking data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/ +``` + +Train the model + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. + +Perform inference on the testing data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + # Docs from original authors (below) diff --git a/infer.sh b/infer.sh index ef9eac3..571a7d7 100755 --- a/infer.sh +++ b/infer.sh @@ -1,53 +1,63 @@ #!/bin/bash -######################################################################### -### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. -######################################################################### - - -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG +# arg 1 CANDLE_DATA_DIR +# arg 2 CANDLE_CONFIG ### Path to your CANDLEized model's main Python script### CANDLE_MODEL=PathDSP_infer_improve.py +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! 
-f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi + if [ $# -lt 2 ] ; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit + echo "Illegal number of parameters" + echo "CANDLE_DATA_DIR PARAMS are required" + exit -1 fi if [ $# -eq 2 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = $CMD" - -elif [ $# -ge 3 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - - # if original $3 is a file, set candle_config and passthrough $@ - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD $@" - - # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + CONFIG_FILE=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" + else + CMD="python ${CANDLE_MODEL} $@" + echo CMD=\"$CMD\" + fi + +elif [ $# -ge 3 ] ; then + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi fi # Display runtime arguments -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +#echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" diff --git a/train.sh b/train.sh index 28f3d6e..23ac094 100755 --- a/train.sh +++ b/train.sh @@ -1,8 +1,7 @@ #!/bin/bash - -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG + +# arg 1 CANDLE_DATA_DIR +# arg 2 CANDLE_CONFIG ### Path to your CANDLEized model's main Python script### CANDLE_MODEL=PathDSP_train_improve.py @@ -16,41 +15,47 @@ if [ ! 
-f ${CANDLE_MODEL} ] ; then exit 404 fi -if [ $# -lt 2 ]; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit +if [ $# -lt 2 ] ; then + echo "Illegal number of parameters" + echo "CANDLE_DATA_DIR PARAMS are required" + exit -1 fi if [ $# -eq 2 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = $CMD" - -elif [ $# -ge 3 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - - # if original $3 is a file, set candle_config and passthrough $@ - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD $@" - - # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi -fi + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + CONFIG_FILE=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" + else + CMD="python ${CANDLE_MODEL} $@" + echo CMD=\"$CMD\" + fi + +elif [ $# -ge 3 ] ; then + + CANDLE_DATA_DIR=$1 ; shift + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi # Display runtime arguments -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +#echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" From d53e43a499a6d3f34cd417fe278244fa06322dca Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Sun, 3 Mar 2024 20:52:19 -0800 Subject: [PATCH 080/254] update def --- PathDSP.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP.def b/PathDSP.def index b1c2154..b754001 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -45,7 +45,7 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local git clone https://github.com/JDACS4C-IMPROVE/IMPROVE.git - git clone -b develop https://github.com/Liuy12/PathDSP.git + git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git cd PathDSP # download conda From 889253017cee4d6463803c66d455e587b948ba7c Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Sun, 3 Mar 2024 22:54:42 -0600 Subject: [PATCH 081/254] prepare reformatted model for singularity (#10) * update preprocess script * update preprocess script * add improve_utils script * add nea scripts * update params * add gitignore * EXP processing * updated to integrate with prep_input * add definition file * update .gitignore * update filename for ssGSEA * add FNN_new * add train/infer * update params * add .yml * update params * update conda path * fix conda * update preprocess.sh * update preprocess.sh * update preprocess_new.py * update env * update preproce_new.py * update preproce_new.py * update files * update params * fix params * update preproce_new.py * update preprocess_new.py * update preprocess_new.py * update file * update file * update file * update script * add def * add script * update file * update FNN_new * update FNN * update params * fix param * fix bug * add time * update def * update yml * update train.sh * update train.sh * 
update train.py * update train * fix bug * update file * update file * use polars * update files * update preprocess * update infer.sh * process author data * fix args * add infer.sh * update doc * fix path * fix conda * use improve repo * use improve module * update readme * update csa * add dropout * update def * rebuild image * update def --------- Co-authored-by: willherbert27 --- PathDSP.def | 29 +++++++------ PathDSP_default_model.txt | 5 ++- PathDSP_train_improve.py | 10 ++++- README.md | 43 +++++++++++++++++++ infer.sh | 82 +++++++++++++++++++----------------- preprocess.sh | 82 ++++++++++++++++++++++-------------- train.sh | 87 +++++++++++++++------------------------ 7 files changed, 200 insertions(+), 138 deletions(-) diff --git a/PathDSP.def b/PathDSP.def index 61e45c2..b754001 100644 --- a/PathDSP.def +++ b/PathDSP.def @@ -10,16 +10,18 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime %environment - PATH=$PATH:/usr/local/PathDSP - MODEL_DIR=/usr/local/PathDSP - CANDLE_DATA_DIR=/candle_data_dir - + export PATH=$PATH:/usr/local/PathDSP + export IMPROVE_MODEL_DIR=/usr/local/PathDSP + export CANDLE_DATA_DIR=/candle_data_dir + export AUTHOR_DATA_DIR=/candle_data_dir + export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/:/usr/local/PathDSP/PathDSP/ + %post apt-get update -y apt-get install wget -y apt-get install -y gnupg apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC apt-get install build-essential -y apt-get install git -y @@ -29,12 +31,12 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime # install gpu fix and clean up cd / chmod +x Singularity_gpu_fix.sh - ./Singularity_gpu_fix.sh + ./Singularity_gpu_fix.sh rm Singularity_gpu_fix.sh - # these three need to be compiled and linked to the cuda libs. - # at the moment, what works for me is to build these in a - # singularity shell in a sandbox with the --nv flag to singularity set. + # these three need to be compiled and linked to the cuda libs. + # at the moment, what works for me is to build these in a + # singularity shell in a sandbox with the --nv flag to singularity set. 
# create default internal candle_data_dir, map external candle_data_dir here @@ -42,15 +44,16 @@ From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime #install python modules and model prerequites cd /usr/local + git clone https://github.com/JDACS4C-IMPROVE/IMPROVE.git git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git - git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git - export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/ cd PathDSP + # download conda /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ #/opt/conda/bin/conda activate PathDSP_env /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop - #cp *.sh /usr/local/bin - chmod a+x /usr/local/PathDSP/*.sh + cp *.sh /usr/local/bin + chmod a+x /usr/local/bin/*.sh + chmod a+x /usr/local/PathDSP/*.sh \ No newline at end of file diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index a059aae..42c71ed 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -26,14 +26,15 @@ val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" model_outdir = "./out_models/GDSCv1/split_4" model_file_name = "model" model_file_format = ".pt" -epochs=800 +epochs=500 batch_size = 12 val_batch = 12 loss = "mse" early_stop_metric = "mse" patience = 30 -cuda_name = "cuda:2" +cuda_name = "cuda:0" learning_rate = 0.001 +dropout=0.1 [Infer] test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index e7cbf20..29cac69 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -53,7 +53,11 @@ "default": 0.0001, "help": "Learning rate for the optimizer." }, - + {"name": "dropout", + "type": float, + "default": 0.1, + "help": "Dropout rate for the optimizer." + }, ] class RMSELoss(tch.nn.Module): @@ -282,6 +286,10 @@ def init_weights(m): # load model n_features = Xtrain_arr.shape[1] net = mynet.FNN(n_features) + ## specify dropout rate + for module in net.modules(): + if isinstance(module, tch.nn.Dropout): + module.p = params['dropout'] net.apply(init_weights) # ------------------------------------------------------ diff --git a/README.md b/README.md index bac8978..15ec9d1 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,49 @@ python repo/PathDSP/PathDSP_infer_improve.py Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` +# Example usage with singularity container + +Download csa data benchmark data similar as mentioned above. Then download author data into the same directory as csa data. 
+ +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing using csa benchmarking data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/ +``` + +Train the model + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. + +Perform inference on the testing data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + # Docs from original authors (below) diff --git a/infer.sh b/infer.sh index 19146ae..571a7d7 100755 --- a/infer.sh +++ b/infer.sh @@ -1,55 +1,63 @@ #!/bin/bash -#!/bin/bash - -######################################################################### -### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. -######################################################################### +# arg 1 CANDLE_DATA_DIR +# arg 2 CANDLE_CONFIG +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=PathDSP_infer_improve.py -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} -### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=infer.py +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! 
-f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi if [ $# -lt 2 ] ; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit + echo "Illegal number of parameters" + echo "CANDLE_DATA_DIR PARAMS are required" + exit -1 fi if [ $# -eq 2 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = $CMD" - -elif [ $# -ge 3 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - - # if original $3 is a file, set candle_config and passthrough $@ - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD $@" - - # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + CONFIG_FILE=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" + else + CMD="python ${CANDLE_MODEL} $@" + echo CMD=\"$CMD\" + fi + +elif [ $# -ge 3 ] ; then + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi fi # Display runtime arguments -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +#echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" diff --git a/preprocess.sh b/preprocess.sh index ab0de90..590905b 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -4,53 +4,71 @@ ### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. ######################################################################### +# arg 1 CANDLE_DATA_DIR +# arg 2 CANDLE_CONFIG -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG +### Path and Name to your CANDLEized model's main Python script### + +# e.g. CANDLE_MODEL=graphdrp_preprocess.py +CANDLE_MODEL_SCRIPT=PathDSP_preprocess_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +# Combine path and name and check if executable exists +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL_SCRIPT} +if [ ! 
-f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi -### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=preprocess_improve.py if [ $# -lt 2 ] ; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit + echo "Illegal number of parameters" + echo "CANDLE_DATA_DIR PARAMS are required" + exit -1 fi + + if [ $# -eq 2 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = $CMD" - -elif [ $# -ge 3 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - - # if original $3 is a file, set candle_config and passthrough $@ - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD" + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + CONFIG_FILE=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" + else + CMD="python ${CANDLE_MODEL} $@" + echo CMD=\"$CMD\" + fi + +elif [ $# -ge 3 ] ; then + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi -fi + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + fi +fi # Display runtime arguments -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" - +echo "running command ${CMD}" # Set up environmental variables and execute model echo "activating environment" #source /opt/conda/etc/profile.d/conda.sh diff --git a/train.sh b/train.sh index 165eac6..23ac094 100755 --- a/train.sh +++ b/train.sh @@ -1,11 +1,10 @@ #!/bin/bash - -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG + +# arg 1 CANDLE_DATA_DIR +# arg 2 CANDLE_CONFIG ### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=train.py +CANDLE_MODEL=PathDSP_train_improve.py ### Set env if CANDLE_MODEL is not in same directory as this script IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} @@ -16,65 +15,47 @@ if [ ! 
-f ${CANDLE_MODEL} ] ; then exit 404 fi -if [ $# -lt 2 ]; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit +if [ $# -lt 2 ] ; then + echo "Illegal number of parameters" + echo "CANDLE_DATA_DIR PARAMS are required" + exit -1 fi if [ $# -eq 2 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = $CMD" + + CANDLE_DATA_DIR=$1 ; shift + + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + CONFIG_FILE=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" + else + CMD="python ${CANDLE_MODEL} $@" + echo CMD=\"$CMD\" + fi -elif [ $# -ge 3 ] ; then - CUDA_VISIBLE_DEVICES=$1 ; shift - CANDLE_DATA_DIR=$1 ; shift - - # if original $3 is a file, set candle_config and passthrough $@ - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD $@" +elif [ $# -ge 3 ] ; then - # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi -fi + CANDLE_DATA_DIR=$1 ; shift -if [ -d ${CANDLE_DATA_DIR} ]; then - if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then - echo "using data from ${CANDLE_DATA_DIR}" - else - ./candle_glue.sh - echo "using original data placed in ${CANDLE_DATA_DIR}" - fi -fi + # if $2 is a file, then set candle_config + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" -export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} -FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" -echo $FULL_DATA_DIR + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" -if [ -d ${FULL_DATA_DIR} ]; then - if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then - echo "using data from ${FULL_DATA_DIR}" - else - ./candle_glue.sh - echo "using original data placed in ${FULL_DATA_DIR}" - fi -else - ./candle_glue.sh - echo "using original data placed in ${FULL_DATA_DIR}" + fi fi # Display runtime arguments -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +#echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" From 226a7fa8c904db6ca3d95e99fcae514876a27a38 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Thu, 21 Mar 2024 16:51:54 -0500 Subject: [PATCH 082/254] add cuda device (#15) --- README.md | 52 ++++++++++++++++++++--------------- download_author_data.sh | 37 +++++++++++++++++++++++++ infer.sh | 52 +++++++++++++++++------------------ train.sh | 60 ++++++++++++++++++++--------------------- 4 files changed, 124 insertions(+), 77 deletions(-) create mode 100644 download_author_data.sh diff --git a/README.md b/README.md index 15ec9d1..aca66fc 100644 --- a/README.md +++ b/README.md @@ -11,42 +11,35 @@ cd process_dir wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data ``` -Benchmarmakr data will be downladed under `process_dir/csa_data/` - -# Download author data - -``` -mkdir author_data -cd author_data -wget https://zenodo.org/record/6093818/files/MSigdb.zip -wget https://zenodo.org/record/6093818/files/raw_data.zip -wget https://zenodo.org/record/6093818/files/STRING.zip -unzip MSigdb.zip -unzip raw_data.zip -unzip STRING.zip -``` - 
-Author data will be downloaded under `process_dir/author_data/` +Benchmark data will be downloaded under `process_dir/csa_data/` # Example usage with Conda Download PathDSP and IMPROVE ``` -cd ../ mkdir repo cd repo git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git -cd PathDSP ``` +# Download author data + +``` +cd ../ +mkdir author_data +bash repo/PathDSP/download_author_data.sh author_data/ +``` + +Author data will be downloaded under `process_dir/author_data/` PathDSP will be installed at `process_dir/repo/PathDSP` IMPROVE will be installed at `process_dir/repo/IMPROVE` Create environment ``` +cd repo/PathDSP/ conda env create -f environment_082223.yml -n PathDSP_env ``` @@ -101,7 +94,24 @@ Final prediction on testing data is located at: `${infer_outdir}/test_y_data_pre # Example usage with singularity container -Download csa data benchmark data similar as mentioned above. Then download author data into the same directory as csa data. +# Download benchmark data + +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ + +``` +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +# Download author data + +Download model specific data under csa_data/ directory + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +bash PathDSP/download_author_data.sh csa_data/ +``` Setup Singularity @@ -127,7 +137,7 @@ singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif pr Train the model ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ ``` Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` @@ -136,7 +146,7 @@ Final trained model is located at: `${train_ml_data_dir}/model.pt`. 
Perform inference on the testing data ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ ``` Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` diff --git a/download_author_data.sh b/download_author_data.sh new file mode 100644 index 0000000..999fc4c --- /dev/null +++ b/download_author_data.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# arg 1: output directory to download model-specific data +# If run via container, it needs to be downloaded under the csa_data folder + +OUTPUT_DIR=$1 + +# Check if the data is already downloaded +if [ -f "$OUTPUT_DIR/.downloaded" ]; then + echo "Data present, skipping download" +# Download data if no other download is in progress +elif [ ! -f "$OUTPUT_DIR/.downloading_author_data" ]; then + touch "$OUTPUT_DIR/.downloading_author_data" + # Download files + # Unzip files + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/MSigdb.zip + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/raw_data.zip + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/STRING.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/MSigdb.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/raw_data.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/STRING.zip + touch "$OUTPUT_DIR/.downloaded" + rm "$OUTPUT_DIR/.downloading_author_data" +else + # Wait for other download to finish + iteration=0 + echo "Waiting for external download" + while [ -f "$OUTPUT_DIR/.downloading_author_data" ]; do + iteration=$((iteration + 1)) + if [ "$iteration" -gt 10 ]; then + # Download takes too long, exit and warn user + echo "Check output directory, download still in progress after $iteration minutes." + exit 1 + fi + sleep 60 + done +fi diff --git a/infer.sh b/infer.sh index 571a7d7..4be3ec5 100755 --- a/infer.sh +++ b/infer.sh @@ -1,7 +1,8 @@ #!/bin/bash -# arg 1 CANDLE_DATA_DIR -# arg 2 CANDLE_CONFIG +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG ### Path to your CANDLEized model's main Python script### CANDLE_MODEL=PathDSP_infer_improve.py @@ -15,39 +16,37 @@ if [ ! 
-f ${CANDLE_MODEL} ] ; then exit 404 fi -if [ $# -lt 2 ] ; then +if [ $# -lt 2 ]; then echo "Illegal number of parameters" - echo "CANDLE_DATA_DIR PARAMS are required" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" exit -1 fi -if [ $# -eq 2 ] ; then - - CANDLE_DATA_DIR=$1 ; shift - - # if $2 is a file, then set candle_config - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - CONFIG_FILE=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" - else - CMD="python ${CANDLE_MODEL} $@" - echo CMD=\"$CMD\" - fi - -elif [ $# -ge 3 ] ; then +if [ $# -eq 2 ]; then + CUDA_VISIBLE_DEVICES=$1 + shift + CANDLE_DATA_DIR=$1 + shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = ${CMD}" +elif [ $# -ge 3 ]; then + CUDA_VISIBLE_DEVICES=$1 + shift + CANDLE_DATA_DIR=$1 + shift - CANDLE_DATA_DIR=$1 ; shift - - # if $2 is a file, then set candle_config - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$1 is a file" - CANDLE_CONFIG=$1 ; shift + # if original $3 is a file, set candle_config and passthrough $@ + ### if [ -f $CANDLE_DATA_DIR/$1 ] ; then + if [ -f $1 ]; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 + shift CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD $@" + echo "CMD = $CMD" # else passthrough $@ else - echo "$1 is not a file" + echo "$1 is not a file" CMD="python ${CANDLE_MODEL} $@" echo "CMD = $CMD" @@ -58,6 +57,7 @@ fi # Display runtime arguments #echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" diff --git a/train.sh b/train.sh index 23ac094..325f9a4 100755 --- a/train.sh +++ b/train.sh @@ -1,53 +1,52 @@ #!/bin/bash -# arg 1 CANDLE_DATA_DIR -# arg 2 CANDLE_CONFIG +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG ### Path to your CANDLEized model's main Python script### CANDLE_MODEL=PathDSP_train_improve.py ### Set env if CANDLE_MODEL is not in same directory as this script -IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$(dirname -- "$0")} CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} -if [ ! -f ${CANDLE_MODEL} ] ; then - echo No such file ${CANDLE_MODEL} - exit 404 +if [ ! 
-f ${CANDLE_MODEL} ]; then + echo No such file ${CANDLE_MODEL} + exit 404 fi -if [ $# -lt 2 ] ; then +if [ $# -lt 2 ]; then echo "Illegal number of parameters" - echo "CANDLE_DATA_DIR PARAMS are required" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" exit -1 fi -if [ $# -eq 2 ] ; then - - CANDLE_DATA_DIR=$1 ; shift - - # if $2 is a file, then set candle_config - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - CONFIG_FILE=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" - else - CMD="python ${CANDLE_MODEL} $@" - echo CMD=\"$CMD\" - fi - -elif [ $# -ge 3 ] ; then +if [ $# -eq 2 ]; then + CUDA_VISIBLE_DEVICES=$1 + shift + CANDLE_DATA_DIR=$1 + shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = ${CMD}" +elif [ $# -ge 3 ]; then + CUDA_VISIBLE_DEVICES=$1 + shift + CANDLE_DATA_DIR=$1 + shift - CANDLE_DATA_DIR=$1 ; shift - - # if $2 is a file, then set candle_config - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$1 is a file" - CANDLE_CONFIG=$1 ; shift + # if original $3 is a file, set candle_config and passthrough $@ + ### if [ -f $CANDLE_DATA_DIR/$1 ] ; then + if [ -f $1 ]; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 + shift CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD $@" + echo "CMD = $CMD" # else passthrough $@ else - echo "$1 is not a file" + echo "$1 is not a file" CMD="python ${CANDLE_MODEL} $@" echo "CMD = $CMD" @@ -56,6 +55,7 @@ fi # Display runtime arguments #echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" From 638580c1675880de55210bfd738fc0b25b801ddd Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 29 Mar 2024 09:52:25 -0500 Subject: [PATCH 083/254] add cuda device (#16) From 70b27ab17330b7a1b198036166171c031d0da933 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Fri, 29 Mar 2024 12:46:22 -0700 Subject: [PATCH 084/254] add hpo script for dh --- .gitignore | 5 +- hpo_subprocess.py | 139 ++++++++++++++++++++++++++++++++++++++++++++ subprocess_train.sh | 42 +++++++++++++ 3 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 hpo_subprocess.py create mode 100755 subprocess_train.sh diff --git a/.gitignore b/.gitignore index f72ee49..1e75e02 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ .ipynb_checkpoints/ PathDSP/__pycache__/ __pycache__/ -EDA.ipynb +EDA.ipynib +ml_data/ +dh_hpo_improve/ +dh_hpo_logs/ diff --git a/hpo_subprocess.py b/hpo_subprocess.py new file mode 100644 index 0000000..50db6fd --- /dev/null +++ b/hpo_subprocess.py @@ -0,0 +1,139 @@ +""" +Before running this script, first need to preprocess the data. +This can be done by running preprocess_example.sh + +It is assumed that the csa benchmark data is downloaded via download_csa.sh +and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: +export IMPROVE_DATA_DIR="./csa_data/" +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +mpirun -np 2 python hpo_subprocess.py + +TODO: how to distribute HPO to mulitple GPUs? 
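+
+Flow of one evaluation (as implemented below): DeepHyper calls run(), which
+launches subprocess_train.sh with the train/val data dirs and a per-job
+output dir, then reads back
+    <model_outdir>/<job.id>/val_scores.json
+and returns the negative val_loss as the objective to maximize, so every
+sampled configuration needs a completed training run before a score exists.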
+""" +# import copy +import json +import subprocess +import pandas as pd +import os +import logging + +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +from mpi4py import MPI + + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Enable using multiple GPUs +# --------------------- +#comm = MPI.COMM_WORLD +#rank = comm.Get_rank() +# need to set -np to 3 or higher for mpirun +os.environ["CUDA_VISIBLE_DEVICES"] = "6,7" + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=64) +problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), + "learning_rate", default_value=0.001) +# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + +# --------------------- +# Some IMPROVE settings +# --------------------- +source = "GDSCv1" +split = 4 +train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +model_outdir = f"dh_hpo_improve/{source}/split_{split}" +log_dir = "dh_hpo_logs/" +subprocess_bashscript = "subprocess_train.sh" + + +@profile +def run(job, optuna_trial=None): + + # config = copy.deepcopy(job.parameters) + # params = { + # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, + # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, + # "verbose": False, + # } + # if len(config) > 0: + # remap_hyperparameters(config) + # params.update(config) + + model_outdir_job_id = model_outdir + f"/{job.id}" + + # val_scores = main_train_grapdrp([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir_job_id), + # ]) + subprocess_res = subprocess.run( + [ + "bash", subprocess_bashscript, + str(train_ml_data_dir), + str(val_ml_data_dir), + str(model_outdir_job_id) + ], + capture_output=True, text=True, check=True + ) + + # print(subprocess_res.stdout) + # print(subprocess_res.stderr) + + # Load val_scores and get val_loss + # f = open(model_outdir + "/val_scores.json") + f = open(model_outdir_job_id + "/val_scores.json") + val_scores = json.load(f) + objective = -val_scores["val_loss"] + # print("objective:", objective) + + # Checkpoint the model weights + with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + + search = CBO( + problem, + evaluator, + log_dir=log_dir, + verbose=1, + ) + + # max_evals = 2 + # max_evals = 4 + # max_evals = 10 + # max_evals = 20 + max_evals = 100 + # max_evals = 100 + results = search.search(max_evals=max_evals) + results = results.sort_values("m:val_loss", ascending=True) + results.to_csv(model_outdir + "/hpo_results.csv", index=False) + + print("Finished deephyper HPO.") diff --git a/subprocess_train.sh b/subprocess_train.sh new file mode 100755 index 0000000..bda0769 --- /dev/null +++ 
b/subprocess_train.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +CONDA_ENV=PathDSP_env +echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +eval "$(conda shell.bash hook)" +echo "Activated conda commands in shell script" +conda activate $CONDA_ENV +echo "Activated conda env $CONDA_ENV" + +train_ml_data_dir=$1 +val_ml_data_dir=$2 +model_outdir=$3 +echo "train_ml_data_dir: $train_ml_data_dir" +echo "val_ml_data_dir: $val_ml_data_dir" +echo "model_outdir: $model_outdir" + +# epochs=10 +epochs=20 +# epochs=50 + +# All train outputs are saved in params["model_outdir"] +#CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=5 +#CUDA_VISIBLE_DEVICES=6,7 +python PathDSP_train_improve.py \ + --train_ml_data_dir $train_ml_data_dir \ + --val_ml_data_dir $val_ml_data_dir \ + --model_outdir $model_outdir \ + --epochs $epochs + +conda deactivate +echo "Deactivated conda env $CONDA_ENV" From 5c552d54a2c1358972ee6337ed0d5dbe2741cde4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 1 Apr 2024 09:26:58 -0700 Subject: [PATCH 085/254] use multiple gpus --- hpo_subprocess.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/hpo_subprocess.py b/hpo_subprocess.py index 50db6fd..e5c31b4 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -7,7 +7,7 @@ export IMPROVE_DATA_DIR="./csa_data/" export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib -mpirun -np 2 python hpo_subprocess.py +mpirun -np 10 python hpo_subprocess.py TODO: how to distribute HPO to mulitple GPUs? 
""" @@ -17,13 +17,37 @@ import pandas as pd import os import logging - +import os +import mpi4py +from mpi4py import MPI from deephyper.evaluator import Evaluator, profile from deephyper.evaluator.callback import TqdmCallback from deephyper.problem import HpProblem from deephyper.search.hps import CBO from mpi4py import MPI +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() + +num_gpus_per_node = 2 +os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node + 6) + +# --------------------- +# Enable logging +# --------------------- logging.basicConfig( # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk @@ -32,14 +56,6 @@ force=True, ) -# --------------------- -# Enable using multiple GPUs -# --------------------- -#comm = MPI.COMM_WORLD -#rank = comm.Get_rank() -# need to set -np to 3 or higher for mpirun -os.environ["CUDA_VISIBLE_DEVICES"] = "6,7" - # --------------------- # Hyperparameters # --------------------- @@ -130,7 +146,7 @@ def run(job, optuna_trial=None): # max_evals = 4 # max_evals = 10 # max_evals = 20 - max_evals = 100 + max_evals = 20 # max_evals = 100 results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) From 8e11b6ebf5c12d451accfbd2b2438a96b33c29f7 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Thu, 11 Apr 2024 20:44:07 +0000 Subject: [PATCH 086/254] update hpo scripts for polaris --- README.md | 173 +++++--------------------------------------- hpo_subprocess.py | 4 +- install_polaris.sh | 45 ++++++++++++ subprocess_train.sh | 14 ++-- 4 files changed, 75 insertions(+), 161 deletions(-) create mode 100644 install_polaris.sh diff --git a/README.md b/README.md index aca66fc..3b10e78 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,34 @@ -# PathDSP -Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores +# Setup environment on Polaris for deephyper -# Download benchmark data - -Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ +Install conda environment for deephyper ``` -mkdir process_dir -cd process_dir -wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +git clone -b deephyper https://github.com/JDACS4C-IMPROVE/PathDSP.git +bash ./PathDSP/install_polaris.sh ``` -Benchmark data will be downloaded under `process_dir/csa_data/` - -# Example usage with Conda - -Download PathDSP and IMPROVE +Install conda environment for the curated model (PathDSP) ``` -mkdir repo -cd repo -git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +## install IMPROVE git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git +conda env create -f environment_082223.yml -p $PathDSP_env +conda activate $PathDSP_env +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop ``` -# Download author data +Download additional author data (PathDSP only) ``` -cd ../ mkdir author_data -bash repo/PathDSP/download_author_data.sh author_data/ -``` - -Author data will be downloaded under `process_dir/author_data/` -PathDSP will be installed at 
`process_dir/repo/PathDSP` -IMPROVE will be installed at `process_dir/repo/IMPROVE` - -Create environment - -``` -cd repo/PathDSP/ -conda env create -f environment_082223.yml -n PathDSP_env -``` - -Activate environment - -``` -conda activate PathDSP_env -``` - -Install CANDLE package - -``` -pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +bash ./PathDSP/download_author_data.sh author_data/ ``` -Define enviroment variabels +Define environment variables ``` +### need to request an interactive node first from polaris +### use debug queue for testing improve_lib="/path/to/IMPROVE/repo/" pathdsp_lib="/path/to/pathdsp/repo/" # notice the extra PathDSP folder after pathdsp_lib @@ -66,116 +37,10 @@ export IMPROVE_DATA_DIR="/path/to/csa_data/" export AUTHOR_DATA_DIR="/path/to/author_data/" ``` -Perform preprocessing step - -``` -# go two upper level -cd ../../ -python repo/PathDSP/PathDSP_preprocess_improve.py -``` - -Train the model - -``` -python repo/PathDSP/PathDSP_train_improve.py -``` - -Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` -Final trained model is located at: `${train_ml_data_dir}/model.pt`. Parameter definitions can be found at `process_dir/repo/PathDSP/PathDSP_default_model.txt` - -Perform inference on the testing data - -``` -python repo/PathDSP/PathDSP_infer_improve.py -``` - -Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` -Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` - -# Example usage with singularity container - -# Download benchmark data - -Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ - -``` -mkdir process_dir -cd process_dir -wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data -``` - -# Download author data - -Download model specific data under csa_data/ directory - -``` -git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git -bash PathDSP/download_author_data.sh csa_data/ -``` - -Setup Singularity - -``` -git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git -cd Singularity -./setup -source config/improve.env -``` - -Build Singularity from definition file - -``` -singularity build --fakeroot PathDSP.sif definitions/PathDSP.def -``` - -Perform preprocessing using csa benchmarking data - -``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/ -``` - -Train the model - -``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ -``` - -Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` -Final trained model is located at: `${train_ml_data_dir}/model.pt`. 
- -Perform inference on the testing data - -``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ -``` - -Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` -Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` - - -# Docs from original authors (below) - -# Requirments - -# Input format +Activate deephyper environment and perform HPO -|drug|cell|feature_1|....|feature_n|drug_response| -|----|----|--------|----|--------|----| -|5-FU|03|0|....|0.02|-2.3| -|5-FU|23|1|....|0.04|-3.4| - -Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing -# Usage: -```python -# run FNN -python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix - -Where input.txt should be in the input format shown above. -Example input file can be found at https://zenodo.org/record/7532963 ``` -# Data preprocessing -Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby - - -# Reference -Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file +bash ./activate-dhenv.sh +cd PathDSP +mpirun -np 10 python hpo_subprocess.py +``` \ No newline at end of file diff --git a/hpo_subprocess.py b/hpo_subprocess.py index e5c31b4..5947522 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -43,7 +43,7 @@ size = comm.Get_size() num_gpus_per_node = 2 -os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node + 6) +os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) # --------------------- # Enable logging @@ -146,7 +146,7 @@ def run(job, optuna_trial=None): # max_evals = 4 # max_evals = 10 # max_evals = 20 - max_evals = 20 + max_evals = 10 # max_evals = 100 results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) diff --git a/install_polaris.sh b/install_polaris.sh new file mode 100644 index 0000000..1fba33b --- /dev/null +++ b/install_polaris.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# From Romain Egele (this script is called install.sh) + +# Generic installation script for DeepHyper on ALCF's Polaris. +# This script is meant to be run on the login node of the machine. +# It will install DeepHyper and its dependencies in the current directory. +# A good practice is to create a `build` folder and launch the script from there, +# e.g. from the root of the DeepHyper repository: +# $ mkdir build && cd build && ../install/alcf/polaris.sh +# The script will also create a file named `activate-dhenv.sh` that will +# Setup the environment each time it is sourced `source activate-dhenv.sh`. 
+ +set -xe + +# Load modules available on the current system +module load PrgEnv-gnu/8.3.3 +module load conda/2023-10-04 + +# Copy the base conda environment +conda create -p dhenv python=3.9 pip -y +conda activate dhenv/ +pip install --upgrade pip + +# For mpi4py +module swap PrgEnv-nvhpc PrgEnv-gnu +module load nvhpc-mixed +git clone https://github.com/mpi4py/mpi4py.git +cd mpi4py/ +MPICC=CC python setup.py install +cd ../ + +# Install the DeepHyper's Python package +git clone -b develop git@github.com:deephyper/deephyper.git +pip install -e "deephyper/[hps,mpi]" + +# Create activation script +touch activate-dhenv.sh +echo "#!/bin/bash" >> activate-dhenv.sh + +# Append modules loading and conda activation +echo "" >> activate-dhenv.sh +echo "module load PrgEnv-gnu/8.3.3" >> activate-dhenv.sh +echo "module load conda/2023-10-04" >> activate-dhenv.sh +echo "conda activate $PWD/dhenv/" >> activate-dhenv.sh diff --git a/subprocess_train.sh b/subprocess_train.sh index bda0769..e9a0cee 100755 --- a/subprocess_train.sh +++ b/subprocess_train.sh @@ -10,11 +10,14 @@ # https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists # https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script # This doesn't work w/o eval "$(conda shell.bash hook)" -CONDA_ENV=PathDSP_env -echo "Allow conda commands in shell script by running 'conda shell.bash hook'" -eval "$(conda shell.bash hook)" +CONDA_ENV=$PathDSP_env +#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +#eval "$(conda shell.bash hook)" echo "Activated conda commands in shell script" -conda activate $CONDA_ENV +#conda activate $CONDA_ENV +#source activate $CONDA_ENV +#source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV +source activate $CONDA_ENV echo "Activated conda env $CONDA_ENV" train_ml_data_dir=$1 @@ -38,5 +41,6 @@ python PathDSP_train_improve.py \ --model_outdir $model_outdir \ --epochs $epochs -conda deactivate +#conda deactivate +source /soft/datascience/conda/2023-10-04/mconda3/bin/deactivate echo "Deactivated conda env $CONDA_ENV" From 2f45d28da48f5172be94340ddcbbd26a04f12db1 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Thu, 11 Apr 2024 20:52:09 +0000 Subject: [PATCH 087/254] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b10e78..cf52caf 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Install conda environment for deephyper ``` -git clone -b deephyper https://github.com/JDACS4C-IMPROVE/PathDSP.git +git clone -b deephyper https://github.com/Liuy12/PathDSP.git bash ./PathDSP/install_polaris.sh ``` From 5e7e0df3dd34f665216379ee05b85278b7d8f61a Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 12 Apr 2024 23:07:46 +0000 Subject: [PATCH 088/254] update readme --- README.md | 43 +++++++++++++++++++++++++++++++++++-------- install_polaris.sh | 2 +- subprocess_train.sh | 4 ++-- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index cf52caf..33b0991 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,19 @@ Install conda environment for the curated model (PathDSP) ``` ## install IMPROVE git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git -conda env create -f environment_082223.yml -p $PathDSP_env -conda activate $PathDSP_env +## define where to install PathDSP env +export PathDSP_env=./PathDSP_env/ +conda env create -f ./PathDSP/environment_082223.yml -p $PathDSP_env +conda activate 
${PathDSP_env} pip install git+https://github.com/ECP-CANDLE/candle_lib@develop ``` +Download csa benchmark data + +``` +wget --cut-dirs=7 -P ./ -nH -np -m ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + Download additional author data (PathDSP only) ``` @@ -27,20 +35,39 @@ bash ./PathDSP/download_author_data.sh author_data/ Define environment variables ``` -### need to request an interactive node first from polaris +### need to firstly request an interactive node first from polaris ### use debug queue for testing -improve_lib="/path/to/IMPROVE/repo/" -pathdsp_lib="/path/to/pathdsp/repo/" +### it might take a while for a node to become available +qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug +### NEED to cd into your working directory again once the job started +improve_lib="$PWD/IMPROVE/" +pathdsp_lib="$PWD/PathDSP/" # notice the extra PathDSP folder after pathdsp_lib export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ -export IMPROVE_DATA_DIR="/path/to/csa_data/" -export AUTHOR_DATA_DIR="/path/to/author_data/" +export IMPROVE_DATA_DIR="$PWD/csa_data/" +export AUTHOR_DATA_DIR="$PWD/author_data/" +export PathDSP_env="$PWD/PathDSP_env/" +``` + +Perform preprocessing + +``` +conda activate $PathDSP_env +## You can copy the processed files under my home dir +cp -r /home/yuanhangl_alcf/improve_project/repo/PathDSP/ml_data/ ./PathDSP/ +## Alternatively, run the preprocess script +## This script taks around 40 mins to complete +## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ ``` Activate deephyper environment and perform HPO ``` -bash ./activate-dhenv.sh +# the .sh script sometimes does not activate the environment somehow +# bash ./activate-dhenv.sh +module load PrgEnv-gnu/8.3.3 +module load conda/2023-10-04 +conda activate ./dhenv/ cd PathDSP mpirun -np 10 python hpo_subprocess.py ``` \ No newline at end of file diff --git a/install_polaris.sh b/install_polaris.sh index 1fba33b..eabb823 100644 --- a/install_polaris.sh +++ b/install_polaris.sh @@ -23,7 +23,7 @@ conda activate dhenv/ pip install --upgrade pip # For mpi4py -module swap PrgEnv-nvhpc PrgEnv-gnu +#module swap PrgEnv-nvhpc PrgEnv-gnu module load nvhpc-mixed git clone https://github.com/mpi4py/mpi4py.git cd mpi4py/ diff --git a/subprocess_train.sh b/subprocess_train.sh index e9a0cee..f71985b 100755 --- a/subprocess_train.sh +++ b/subprocess_train.sh @@ -16,8 +16,8 @@ CONDA_ENV=$PathDSP_env echo "Activated conda commands in shell script" #conda activate $CONDA_ENV #source activate $CONDA_ENV -#source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV -source activate $CONDA_ENV +source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV +#source activate $CONDA_ENV echo "Activated conda env $CONDA_ENV" train_ml_data_dir=$1 From 9a588d6454108bef772bee382585c172026cc504 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Mon, 15 Apr 2024 14:12:02 +0000 Subject: [PATCH 089/254] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 33b0991..19386b8 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Perform preprocessing ``` conda activate $PathDSP_env ## You can copy the processed files under my home dir -cp -r /home/yuanhangl_alcf/improve_project/repo/PathDSP/ml_data/ ./PathDSP/ +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ ./PathDSP/ ## 
Alternatively, run the preprocess script ## This script taks around 40 mins to complete ## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ @@ -70,4 +70,4 @@ module load conda/2023-10-04 conda activate ./dhenv/ cd PathDSP mpirun -np 10 python hpo_subprocess.py -``` \ No newline at end of file +``` From c55a3d53d29ed5aff24401a7caf3c165bfb504b0 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Tue, 23 Apr 2024 13:35:36 -0700 Subject: [PATCH 090/254] add dh singularity scripts --- .gitignore | 9 ++ PathDSP_infer_improve.py | 5 +- PathDSP_train_improve.py | 6 +- hpo_subprocess_singularity.py | 154 ++++++++++++++++++++++++++++++++ subprocess_train_singularity.sh | 53 +++++++++++ 5 files changed, 225 insertions(+), 2 deletions(-) create mode 100644 hpo_subprocess_singularity.py create mode 100755 subprocess_train_singularity.sh diff --git a/.gitignore b/.gitignore index 1e75e02..38b428b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,12 @@ ml_data/ dh_hpo_improve/ dh_hpo_logs/ +## gpu utilization +PathDSP_gpu_util_model.txt +gpu_log_strip.txt +gpu_logs.txt +out_models/ +train_gpu_util.sh + +## image +PathDSP.sif diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 1f9aeeb..5223892 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,7 +54,10 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + if 'CUDA_VISIBLE_DEVICES' in os.environ: + device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) start = datetime.now() diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 29cac69..ec649b7 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -248,7 +248,11 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + ## set device + if 'CUDA_VISIBLE_DEVICES' in os.environ: + device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) learning_rate = params['learning_rate'] epoch = params['epochs'] batch_size = params['batch_size'] diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py new file mode 100644 index 0000000..2fde459 --- /dev/null +++ b/hpo_subprocess_singularity.py @@ -0,0 +1,154 @@ +""" +Before running this script, first need to preprocess the data. +This can be done by running preprocess_example.sh + +It is assumed that the csa benchmark data is downloaded via download_csa.sh +and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: +export IMPROVE_DATA_DIR="./csa_data/" +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +mpirun -np 10 python hpo_subprocess.py + +TODO: how to distribute HPO to mulitple GPUs? 
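Each MPI rank below pins itself to one GPU by setting CUDA_VISIBLE_DEVICES to rank % num_gpus_per_node, so launching one rank per GPU with mpirun spreads the evaluations across devices.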
+""" +# import copy +import json +import subprocess +import pandas as pd +import os +import logging +import mpi4py +from mpi4py import MPI +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() + +num_gpus_per_node = 5 +os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=64) +problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), + "learning_rate", default_value=0.001) +# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + +# --------------------- +# Some IMPROVE settings +# --------------------- +source = "GDSCv1" +split = 4 +train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +model_outdir = f"dh_hpo_improve/{source}/split_{split}" +log_dir = "dh_hpo_logs/" +subprocess_bashscript = "subprocess_train_singularity.sh" + + +@profile +def run(job, optuna_trial=None): + + # config = copy.deepcopy(job.parameters) + # params = { + # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, + # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, + # "verbose": False, + # } + # if len(config) > 0: + # remap_hyperparameters(config) + # params.update(config) + + model_outdir_job_id = model_outdir + f"/{job.id}" + + # val_scores = main_train_grapdrp([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir_job_id), + # ]) + subprocess_res = subprocess.run( + [ + "bash", subprocess_bashscript, + str(train_ml_data_dir), + str(val_ml_data_dir), + str(model_outdir_job_id), + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ], + capture_output=True, text=True, check=True + ) + + # print(subprocess_res.stdout) + # print(subprocess_res.stderr) + + # Load val_scores and get val_loss + # f = open(model_outdir + "/val_scores.json") + f = open(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir_job_id, "val_scores.json")) + val_scores = json.load(f) + objective = -val_scores["val_loss"] + # print("objective:", objective) + + # Checkpoint the model weights + with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + + search = CBO( + problem, + evaluator, + log_dir=log_dir, + verbose=1, + ) + + # max_evals = 2 + # max_evals = 4 + # max_evals = 10 + # max_evals = 
20 + max_evals = 10 + # max_evals = 100 + results = search.search(max_evals=max_evals) + results = results.sort_values("m:val_loss", ascending=True) + results.to_csv(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir, "/hpo_results.csv"), index=False) + + print("Finished deephyper HPO.") diff --git a/subprocess_train_singularity.sh b/subprocess_train_singularity.sh new file mode 100755 index 0000000..3ee1db8 --- /dev/null +++ b/subprocess_train_singularity.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +#CONDA_ENV=$PathDSP_env +#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +#eval "$(conda shell.bash hook)" +#echo "Activated conda commands in shell script" +#conda activate $CONDA_ENV +#source activate $CONDA_ENV +#conda_path=${dirname `which conda`} +#source $conda_path/activate $CONDA_ENV +#source activate $CONDA_ENV +#echo "Activated conda env $CONDA_ENV" + +train_ml_data_dir=$1 +val_ml_data_dir=$2 +model_outdir=$3 +CUDA_VISIBLE_DEVICES=$4 +echo "train_ml_data_dir: $train_ml_data_dir" +echo "val_ml_data_dir: $val_ml_data_dir" +echo "model_outdir: $model_outdir" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" + +# epochs=10 +epochs=10 +# epochs=50 + +# All train outputs are saved in params["model_outdir"] +#CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=5 +#CUDA_VISIBLE_DEVICES=6,7 +# python PathDSP_train_improve.py \ +# --train_ml_data_dir $train_ml_data_dir \ +# --val_ml_data_dir $val_ml_data_dir \ +# --model_outdir $model_outdir \ +# --epochs $epochs + +#conda deactivate +#source $conda_path/deactivate +#echo "Deactivated conda env $CONDA_ENV" + +echo "train using singularity container" +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs + From 82773c7a44f3e3be2109fd0645bdc0ccb0175ae9 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Tue, 23 Apr 2024 13:41:41 -0700 Subject: [PATCH 091/254] fix gpu device --- .gitignore | 9 +++++++++ PathDSP_infer_improve.py | 5 ++++- PathDSP_train_improve.py | 6 +++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 1e75e02..38b428b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,12 @@ ml_data/ dh_hpo_improve/ dh_hpo_logs/ +## gpu utilization +PathDSP_gpu_util_model.txt +gpu_log_strip.txt +gpu_logs.txt +out_models/ +train_gpu_util.sh + +## image +PathDSP.sif diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 1f9aeeb..5223892 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,7 +54,10 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - device = 
myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + if 'CUDA_VISIBLE_DEVICES' in os.environ: + device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) start = datetime.now() diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 29cac69..ec649b7 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -248,7 +248,11 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + ## set device + if 'CUDA_VISIBLE_DEVICES' in os.environ: + device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) learning_rate = params['learning_rate'] epoch = params['epochs'] batch_size = params['batch_size'] From 86770cdb3165c843f8ab5e598fa55a9f252c873c Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Wed, 24 Apr 2024 14:50:23 -0700 Subject: [PATCH 092/254] pass hps to model --- PathDSP_train_improve.py | 6 ++++++ hpo_subprocess.py | 10 +++++++--- subprocess_train.sh | 21 ++++++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index ec649b7..483695e 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -33,6 +33,7 @@ import myDataloader as mydl import myUtility as myutil import polars as pl +import json file_path = os.path.dirname(os.path.realpath(__file__)) @@ -348,6 +349,11 @@ def main(args): required=None, ) val_scores = run(params) + # with open(params["model_outdir"] + '/params.json', 'w') as json_file: + # json.dump(params, json_file, indent=4) + df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) + df.to_csv(params["model_outdir"] + '/params.txt',sep="\t") + if __name__ == "__main__": diff --git a/hpo_subprocess.py b/hpo_subprocess.py index 5947522..ce283f4 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -93,7 +93,8 @@ def run(job, optuna_trial=None): # params.update(config) model_outdir_job_id = model_outdir + f"/{job.id}" - + learning_rate = job.parameters["learning_rate"] + batch_size = job.parameters["batch_size"] # val_scores = main_train_grapdrp([ # "--train_ml_data_dir", str(train_ml_data_dir), # "--val_ml_data_dir", str(val_ml_data_dir), @@ -104,7 +105,10 @@ def run(job, optuna_trial=None): "bash", subprocess_bashscript, str(train_ml_data_dir), str(val_ml_data_dir), - str(model_outdir_job_id) + str(model_outdir_job_id), + str(learning_rate), + str(batch_size), + str(os.environ["CUDA_VISIBLE_DEVICES"]) ], capture_output=True, text=True, check=True ) @@ -146,7 +150,7 @@ def run(job, optuna_trial=None): # max_evals = 4 # max_evals = 10 # max_evals = 20 - max_evals = 10 + max_evals = 2 # max_evals = 100 results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) diff --git a/subprocess_train.sh b/subprocess_train.sh index f71985b..7c23265 100755 --- a/subprocess_train.sh +++ b/subprocess_train.sh @@ -16,31 +16,42 @@ CONDA_ENV=$PathDSP_env echo "Activated conda commands in shell script" #conda activate $CONDA_ENV #source activate $CONDA_ENV -source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV +conda_path=$(dirname $(dirname $(which conda))) +source 
$conda_path/bin/activate $CONDA_ENV +#source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV #source activate $CONDA_ENV echo "Activated conda env $CONDA_ENV" train_ml_data_dir=$1 val_ml_data_dir=$2 model_outdir=$3 +learning_rate=$4 +batch_size=$5 +CUDA_VISIBLE_DEVICES=$6 + echo "train_ml_data_dir: $train_ml_data_dir" echo "val_ml_data_dir: $val_ml_data_dir" echo "model_outdir: $model_outdir" +echo "learning_rate: $learning_rate" +echo "batch_size: $batch_size" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # epochs=10 -epochs=20 +epochs=10 # epochs=50 # All train outputs are saved in params["model_outdir"] #CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ #CUDA_VISIBLE_DEVICES=5 #CUDA_VISIBLE_DEVICES=6,7 -python PathDSP_train_improve.py \ +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python PathDSP_train_improve.py \ --train_ml_data_dir $train_ml_data_dir \ --val_ml_data_dir $val_ml_data_dir \ --model_outdir $model_outdir \ - --epochs $epochs + --epochs $epochs \ + --learning_rate $learning_rate \ + --batch_size $batch_size #conda deactivate -source /soft/datascience/conda/2023-10-04/mconda3/bin/deactivate +source $conda_path/bin/deactivate echo "Deactivated conda env $CONDA_ENV" From ff1bb808d4fce5cce0b8336a6060931694c2f0e3 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 29 Apr 2024 09:19:15 -0700 Subject: [PATCH 093/254] set priority for cuda_visible_devices --- PathDSP_infer_improve.py | 5 +++-- PathDSP_train_improve.py | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 5223892..fe35219 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,8 +54,9 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - if 'CUDA_VISIBLE_DEVICES' in os.environ: - device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) else: device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 483695e..18adb7f 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -250,9 +250,10 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) ## set device - if 'CUDA_VISIBLE_DEVICES' in os.environ: - device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) - else: + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) + else: device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) learning_rate = params['learning_rate'] epoch = params['epochs'] From c4dcb63eb998a1a603bbdba8c12dcf53872516c9 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 29 Apr 2024 09:23:19 -0700 Subject: [PATCH 094/254] set priority for cuda_visible_device --- PathDSP_infer_improve.py | 5 +++-- PathDSP_train_improve.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 5223892..fe35219 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,8 +54,9 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - if 
'CUDA_VISIBLE_DEVICES' in os.environ: - device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) else: device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index ec649b7..18adb7f 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -33,6 +33,7 @@ import myDataloader as mydl import myUtility as myutil import polars as pl +import json file_path = os.path.dirname(os.path.realpath(__file__)) @@ -249,9 +250,10 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) ## set device - if 'CUDA_VISIBLE_DEVICES' in os.environ: - device = 'cuda:'+str(os.environ['CUDA_VISIBLE_DEVICES']) - else: + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) + else: device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) learning_rate = params['learning_rate'] epoch = params['epochs'] @@ -348,6 +350,11 @@ def main(args): required=None, ) val_scores = run(params) + # with open(params["model_outdir"] + '/params.json', 'w') as json_file: + # json.dump(params, json_file, indent=4) + df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) + df.to_csv(params["model_outdir"] + '/params.txt',sep="\t") + if __name__ == "__main__": From 2c47366af97ca27c7f92d9d867b5823c4b6e259b Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Mon, 29 Apr 2024 09:51:16 -0700 Subject: [PATCH 095/254] pass hps to singularity --- hpo_subprocess.py | 2 -- hpo_subprocess_singularity.py | 9 ++++++--- subprocess_train_singularity.sh | 9 +++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/hpo_subprocess.py b/hpo_subprocess.py index ce283f4..64dddc8 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -17,9 +17,7 @@ import pandas as pd import os import logging -import os import mpi4py -from mpi4py import MPI from deephyper.evaluator import Evaluator, profile from deephyper.evaluator.callback import TqdmCallback from deephyper.problem import HpProblem diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py index 2fde459..2ee809a 100644 --- a/hpo_subprocess_singularity.py +++ b/hpo_subprocess_singularity.py @@ -40,7 +40,7 @@ rank = comm.Get_rank() size = comm.Get_size() -num_gpus_per_node = 5 +num_gpus_per_node = 2 os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) # --------------------- @@ -91,7 +91,8 @@ def run(job, optuna_trial=None): # params.update(config) model_outdir_job_id = model_outdir + f"/{job.id}" - + learning_rate = job.parameters["learning_rate"] + batch_size = job.parameters["batch_size"] # val_scores = main_train_grapdrp([ # "--train_ml_data_dir", str(train_ml_data_dir), # "--val_ml_data_dir", str(val_ml_data_dir), @@ -103,6 +104,8 @@ def run(job, optuna_trial=None): str(train_ml_data_dir), str(val_ml_data_dir), str(model_outdir_job_id), + str(learning_rate), + str(batch_size), str(os.environ["CUDA_VISIBLE_DEVICES"]) ], capture_output=True, text=True, check=True @@ -145,7 +148,7 @@ def run(job, optuna_trial=None): # max_evals = 4 # max_evals = 10 # max_evals = 20 - max_evals = 10 + max_evals = 2 # max_evals = 100 results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", 
ascending=True) diff --git a/subprocess_train_singularity.sh b/subprocess_train_singularity.sh index 3ee1db8..dc7b1ac 100755 --- a/subprocess_train_singularity.sh +++ b/subprocess_train_singularity.sh @@ -24,10 +24,15 @@ train_ml_data_dir=$1 val_ml_data_dir=$2 model_outdir=$3 -CUDA_VISIBLE_DEVICES=$4 +learning_rate=$4 +batch_size=$5 +CUDA_VISIBLE_DEVICES=$6 + echo "train_ml_data_dir: $train_ml_data_dir" echo "val_ml_data_dir: $val_ml_data_dir" echo "model_outdir: $model_outdir" +echo "learning_rate: $learning_rate" +echo "batch_size: $batch_size" echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # epochs=10 @@ -49,5 +54,5 @@ epochs=10 #echo "Deactivated conda env $CONDA_ENV" echo "train using singularity container" -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs --learning_rate $learning_rate --batch_size $batch_size From 64774758c1c173ef9dab16a73fc0e5f32678fe54 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Wed, 1 May 2024 02:17:06 +0000 Subject: [PATCH 096/254] update install instruction --- install_polaris.sh | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/install_polaris.sh b/install_polaris.sh index eabb823..e0aa916 100644 --- a/install_polaris.sh +++ b/install_polaris.sh @@ -14,8 +14,10 @@ set -xe # Load modules available on the current system -module load PrgEnv-gnu/8.3.3 -module load conda/2023-10-04 +module load PrgEnv-gnu +# conda is not avilable +# need to install conda locally +#module load conda/2023-10-04 # Copy the base conda environment conda create -p dhenv python=3.9 pip -y @@ -25,10 +27,11 @@ pip install --upgrade pip # For mpi4py #module swap PrgEnv-nvhpc PrgEnv-gnu module load nvhpc-mixed -git clone https://github.com/mpi4py/mpi4py.git -cd mpi4py/ -MPICC=CC python setup.py install -cd ../ +# git clone https://github.com/mpi4py/mpi4py.git +# cd mpi4py/ +# MPICC=CC python setup.py install +# cd ../ +conda install mpi4py --yes # Install the DeepHyper's Python package git clone -b develop git@github.com:deephyper/deephyper.git @@ -40,6 +43,6 @@ echo "#!/bin/bash" >> activate-dhenv.sh # Append modules loading and conda activation echo "" >> activate-dhenv.sh -echo "module load PrgEnv-gnu/8.3.3" >> activate-dhenv.sh -echo "module load conda/2023-10-04" >> activate-dhenv.sh +echo "module load PrgEnv-gnu" >> activate-dhenv.sh +#echo "module load conda/2023-10-04" >> activate-dhenv.sh echo "conda activate $PWD/dhenv/" >> activate-dhenv.sh From 8ba3a094905163c00956b767d0a7bddb0d389bbd Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Wed, 1 May 2024 02:22:59 +0000 Subject: [PATCH 097/254] update hpo scripts CUDA_VISIBLE_DEVICES does not seem to work. 
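One plausible cause, assuming torch has already created its CUDA context before the variable is changed, is that CUDA_VISIBLE_DEVICES is only read when a process first initializes CUDA, so changing it afterwards in a running Python process has no effect. A minimal sketch of the rank-to-device mapping used instead (mpi4py and a 4-GPU node count are assumptions here):

```
from mpi4py import MPI

num_gpus_per_node = 4  # assumption: GPUs per Polaris node
rank = MPI.COMM_WORLD.Get_rank()
cuda_name = f"cuda:{rank % num_gpus_per_node}"  # handed to the train script as --cuda_name
```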
Pass cuda_name directly to the train script --- PathDSP_train_improve.py | 12 +++++++----- hpo_subprocess.py | 10 ++++++---- subprocess_train.sh | 12 ++++++++---- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 18adb7f..5226721 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -250,11 +250,13 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) ## set device - cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_env_visible is not None: - device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - else: - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + # if cuda_env_visible is not None: + # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) + # else: + # device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + print("Using device: " + device) learning_rate = params['learning_rate'] epoch = params['epochs'] batch_size = params['batch_size'] diff --git a/hpo_subprocess.py b/hpo_subprocess.py index 64dddc8..2f69d91 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -40,8 +40,9 @@ rank = comm.Get_rank() size = comm.Get_size() -num_gpus_per_node = 2 -os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +num_gpus_per_node = 3 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +cuda_name = "cuda:" + str(rank % num_gpus_per_node) # --------------------- # Enable logging @@ -106,7 +107,8 @@ def run(job, optuna_trial=None): str(model_outdir_job_id), str(learning_rate), str(batch_size), - str(os.environ["CUDA_VISIBLE_DEVICES"]) + str(cuda_name) + #str(os.environ["CUDA_VISIBLE_DEVICES"]) ], capture_output=True, text=True, check=True ) @@ -148,7 +150,7 @@ def run(job, optuna_trial=None): # max_evals = 4 # max_evals = 10 # max_evals = 20 - max_evals = 2 + max_evals = 10 # max_evals = 100 results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) diff --git a/subprocess_train.sh b/subprocess_train.sh index 7c23265..94a15d4 100755 --- a/subprocess_train.sh +++ b/subprocess_train.sh @@ -27,14 +27,16 @@ val_ml_data_dir=$2 model_outdir=$3 learning_rate=$4 batch_size=$5 -CUDA_VISIBLE_DEVICES=$6 +cuda_name=$6 +#CUDA_VISIBLE_DEVICES=$6 echo "train_ml_data_dir: $train_ml_data_dir" echo "val_ml_data_dir: $val_ml_data_dir" echo "model_outdir: $model_outdir" echo "learning_rate: $learning_rate" echo "batch_size: $batch_size" -echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +echo "cuda_name: $cuda_name" +#echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # epochs=10 epochs=10 @@ -44,13 +46,15 @@ epochs=10 #CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ #CUDA_VISIBLE_DEVICES=5 #CUDA_VISIBLE_DEVICES=6,7 -CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} +python PathDSP_train_improve.py \ --train_ml_data_dir $train_ml_data_dir \ --val_ml_data_dir $val_ml_data_dir \ --model_outdir $model_outdir \ --epochs $epochs \ --learning_rate $learning_rate \ - --batch_size $batch_size + --batch_size $batch_size \ + --cuda_name $cuda_name #conda deactivate source $conda_path/bin/deactivate From 39b97948c39c42e71432b68759fccb9cc4f0801d Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 2 May 2024 11:20:54 -0700 Subject: [PATCH 098/254] remove duplicate cuda_name --- 
PathDSP_default_model.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 42c71ed..d6fea26 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -40,5 +40,4 @@ dropout=0.1 test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" model_dir = "./out_models/GDSCv1/split_4" infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_4" -test_batch = 256 -cuda_name = "cuda:3" \ No newline at end of file +test_batch = 256 \ No newline at end of file From 6d7ed79e29c4f9fc95b80edc78a70564f72b13b4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 2 May 2024 11:22:23 -0700 Subject: [PATCH 099/254] fix issues with cuda_visible_devices CUDA_VISIBLE_DEVICES takes precedence over cuda_name --- PathDSP_infer_improve.py | 11 ++++++----- PathDSP_train_improve.py | 2 +- hpo_subprocess.py | 8 ++++---- subprocess_train.sh | 15 +++++++-------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index fe35219..839ddbe 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,11 +54,12 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_env_visible is not None: - device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - else: - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + # if cuda_env_visible is not None: + # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) + # else: + # device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) start = datetime.now() diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 5226721..f1295af 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -256,7 +256,7 @@ def run(params): # else: # device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) - print("Using device: " + device) + #print("Using device: " + device) learning_rate = params['learning_rate'] epoch = params['epochs'] batch_size = params['batch_size'] diff --git a/hpo_subprocess.py b/hpo_subprocess.py index 2f69d91..9bdbf8d 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -41,8 +41,8 @@ size = comm.Get_size() num_gpus_per_node = 3 -#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) -cuda_name = "cuda:" + str(rank % num_gpus_per_node) +os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +#cuda_name = "cuda:" + str(rank % num_gpus_per_node) # --------------------- # Enable logging @@ -107,8 +107,8 @@ def run(job, optuna_trial=None): str(model_outdir_job_id), str(learning_rate), str(batch_size), - str(cuda_name) - #str(os.environ["CUDA_VISIBLE_DEVICES"]) + #str(cuda_name) + str(os.environ["CUDA_VISIBLE_DEVICES"]) ], capture_output=True, text=True, check=True ) diff --git a/subprocess_train.sh b/subprocess_train.sh index 94a15d4..a2b1541 100755 --- a/subprocess_train.sh +++ b/subprocess_train.sh @@ -27,16 +27,16 @@ val_ml_data_dir=$2 model_outdir=$3 learning_rate=$4 batch_size=$5 -cuda_name=$6 -#CUDA_VISIBLE_DEVICES=$6 
+#cuda_name=$6 +CUDA_VISIBLE_DEVICES=$6 echo "train_ml_data_dir: $train_ml_data_dir" echo "val_ml_data_dir: $val_ml_data_dir" echo "model_outdir: $model_outdir" echo "learning_rate: $learning_rate" echo "batch_size: $batch_size" -echo "cuda_name: $cuda_name" -#echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +#echo "cuda_name: $cuda_name" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # epochs=10 epochs=10 @@ -46,15 +46,14 @@ epochs=10 #CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ #CUDA_VISIBLE_DEVICES=5 #CUDA_VISIBLE_DEVICES=6,7 -#CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} -python PathDSP_train_improve.py \ +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python PathDSP_train_improve.py \ --train_ml_data_dir $train_ml_data_dir \ --val_ml_data_dir $val_ml_data_dir \ --model_outdir $model_outdir \ --epochs $epochs \ --learning_rate $learning_rate \ - --batch_size $batch_size \ - --cuda_name $cuda_name + --batch_size $batch_size +# --cuda_name $cuda_name #conda deactivate source $conda_path/bin/deactivate From 563ffaef1e2e2b3a6149bb2e8b62d21ec00d6f3c Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 2 May 2024 11:32:08 -0700 Subject: [PATCH 100/254] fix issue with cuda_visible_devices --- PathDSP_default_model.txt | 3 +-- PathDSP_infer_improve.py | 11 ++++++----- PathDSP_train_improve.py | 12 +++++++----- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 42c71ed..d6fea26 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -40,5 +40,4 @@ dropout=0.1 test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" model_dir = "./out_models/GDSCv1/split_4" infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_4" -test_batch = 256 -cuda_name = "cuda:3" \ No newline at end of file +test_batch = 256 \ No newline at end of file diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index fe35219..839ddbe 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,11 +54,12 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_env_visible is not None: - device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - else: - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + # if cuda_env_visible is not None: + # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) + # else: + # device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) start = datetime.now() diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 18adb7f..f1295af 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -250,11 +250,13 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) ## set device - cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_env_visible is not None: - device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - else: - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + # if cuda_env_visible is not None: + # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) + # else: + # device = 
myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + #print("Using device: " + device) learning_rate = params['learning_rate'] epoch = params['epochs'] batch_size = params['batch_size'] From 333b98433725d620d3cfd3a9db93c6bbda2607d4 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 2 May 2024 15:09:35 -0700 Subject: [PATCH 101/254] fix cuda_visible_devices issue When one or multiple device numbers are passed via CUDA_VISIBLE_DEVICES, the values in python script are reindexed and start from 0 --- PathDSP_infer_improve.py | 11 +++++------ PathDSP_train_improve.py | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 839ddbe..fa308ed 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,12 +54,11 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - # if cuda_env_visible is not None: - # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - # else: - # device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) start = datetime.now() diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index f1295af..a51f238 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -250,12 +250,11 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) ## set device - # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - # if cuda_env_visible is not None: - # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - # else: - # device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) #print("Using device: " + device) learning_rate = params['learning_rate'] epoch = params['epochs'] From be69c3b8c7b20e3209f560e34639adb4f68b2f59 Mon Sep 17 00:00:00 2001 From: Liuy12 Date: Thu, 2 May 2024 15:10:42 -0700 Subject: [PATCH 102/254] fix cuda_visible_devices issue When one or multiple device numbers are passed via CUDA_VISIBLE_DEVICES, the values in python script are reindexed and start from 0 --- PathDSP_infer_improve.py | 11 +++++------ PathDSP_train_improve.py | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 839ddbe..fa308ed 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -54,12 +54,11 @@ def run(params): trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) - # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - # if cuda_env_visible is not None: - # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - # else: - # device = 
myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) start = datetime.now() diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index f1295af..a51f238 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -250,12 +250,11 @@ def run(params): # set parameters myutil.set_seed(params["seed_int"]) ## set device - # cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - # if cuda_env_visible is not None: - # device = 'cuda:'+str(os.getenv("CUDA_VISIBLE_DEVICES")) - # else: - # device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) #print("Using device: " + device) learning_rate = params['learning_rate'] epoch = params['epochs'] From b551ea91a072ca3e4a60ac7b0518217f3bd86037 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Tue, 7 May 2024 22:08:17 +0000 Subject: [PATCH 103/254] update readme --- README.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 19386b8..6385163 100644 --- a/README.md +++ b/README.md @@ -60,14 +60,28 @@ cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ ./PathDSP ## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ ``` -Activate deephyper environment and perform HPO +Activate deephyper environment ``` # the .sh script sometimes does not activate the environment somehow # bash ./activate-dhenv.sh -module load PrgEnv-gnu/8.3.3 -module load conda/2023-10-04 +module load PrgEnv-gnu +#module load conda/2023-10-04 conda activate ./dhenv/ +``` + +Perform HPO using conda + +``` cd PathDSP +## make sure mpirun is from the current conda environment mpirun -np 10 python hpo_subprocess.py ``` + +Alternatively, perform HPO using singularity container + +``` +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer +mpirun -np 10 python hpo_subprocess_singularity.py +``` \ No newline at end of file From e6808fd4ff54c32d7b1066b72adf882a5e749731 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Tue, 7 May 2024 22:09:32 +0000 Subject: [PATCH 104/254] update scripts for hpo update scripts for hpo using singularity container --- hpo_subprocess_singularity.py | 6 +++--- subprocess_train_singularity.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py index 2ee809a..1834954 100644 --- a/hpo_subprocess_singularity.py +++ b/hpo_subprocess_singularity.py @@ -40,7 +40,7 @@ rank = comm.Get_rank() size = comm.Get_size() -num_gpus_per_node = 2 +num_gpus_per_node = 3 os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) # --------------------- @@ -148,10 +148,10 @@ def run(job, optuna_trial=None): # max_evals = 4 # max_evals = 10 # max_evals = 20 - max_evals = 2 + max_evals = 10 # max_evals = 100 
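# Note on the results.to_csv path change just below: os.path.join discards every earlier
# component once a later component is absolute, e.g.
#   os.path.join("csa_data", "dh_hpo_improve", "/hpo_results.csv") -> "/hpo_results.csv"
#   os.path.join("csa_data", "dh_hpo_improve", "hpo_results.csv")  -> "csa_data/dh_hpo_improve/hpo_results.csv"
# so the leading slash would have written the results to the filesystem root instead of model_outdir.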
results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) - results.to_csv(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir, "/hpo_results.csv"), index=False) + results.to_csv(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir, "hpo_results.csv"), index=False) print("Finished deephyper HPO.") diff --git a/subprocess_train_singularity.sh b/subprocess_train_singularity.sh index dc7b1ac..7e3cf06 100755 --- a/subprocess_train_singularity.sh +++ b/subprocess_train_singularity.sh @@ -54,5 +54,5 @@ epochs=10 #echo "Deactivated conda env $CONDA_ENV" echo "train using singularity container" -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs --learning_rate $learning_rate --batch_size $batch_size +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir $PathDSP_sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs --learning_rate $learning_rate --batch_size $batch_size From 5403d89488353bafb3efea655ee4451d2bb0edc3 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Mon, 13 May 2024 19:45:29 +0000 Subject: [PATCH 105/254] update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 38b428b..5d7b436 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,6 @@ train_gpu_util.sh ## image PathDSP.sif + +## log files +dh_hpo_scale_test.* From fb9518637e811b3e2ae498a5a1ed31d924ca2f07 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Mon, 13 May 2024 19:50:39 +0000 Subject: [PATCH 106/254] update installer script --- install_polaris.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/install_polaris.sh b/install_polaris.sh index e0aa916..4970f35 100644 --- a/install_polaris.sh +++ b/install_polaris.sh @@ -45,4 +45,5 @@ echo "#!/bin/bash" >> activate-dhenv.sh echo "" >> activate-dhenv.sh echo "module load PrgEnv-gnu" >> activate-dhenv.sh #echo "module load conda/2023-10-04" >> activate-dhenv.sh -echo "conda activate $PWD/dhenv/" >> activate-dhenv.sh +conda_path=$(dirname $(dirname $(which conda))) +echo "source $conda_path/bin/activate $PWD/dhenv/" >> activate-dhenv.sh From 7a2d9fd99115acb118942893f41413dae78b000f Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Mon, 13 May 2024 19:54:31 +0000 Subject: [PATCH 107/254] update hpo subprocess scripts --- PathDSP_train_improve.py | 5 ++++- hpo_subprocess.py | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index a51f238..79c2ce3 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -14,7 +14,6 @@ #sys.path.append("/usr/local/PathDSP/PathDSP") #sys.path.append(os.getcwd() + "/PathDSP") #import FNN_new -import os import argparse import numpy as np import pandas as pd @@ -34,6 +33,7 @@ import myUtility as myutil import polars as pl import json +import socket file_path = os.path.dirname(os.path.realpath(__file__)) @@ -253,6 +253,7 @@ def run(params): cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") if cuda_env_visible is not None: device = 'cuda:0' + params["CUDA_VISIBLE_DEVICES"] = cuda_env_visible else: device 
= myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) #print("Using device: " + device) @@ -350,6 +351,8 @@ def main(args): additional_definitions=additional_definitions, required=None, ) + # get node name + params["node_name"] = socket.gethostname() val_scores = run(params) # with open(params["model_outdir"] + '/params.json', 'w') as json_file: # json.dump(params, json_file, indent=4) diff --git a/hpo_subprocess.py b/hpo_subprocess.py index 9bdbf8d..f24fc8e 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -7,9 +7,11 @@ export IMPROVE_DATA_DIR="./csa_data/" export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib -mpirun -np 10 python hpo_subprocess.py +It also assumes that your processed training data is at: "ml_data/{source}-{source}/split_{split}" +validation data is at: "ml_data/{source}-{source}/split_{split}" +model output files will be saved at "dh_hpo_improve/{source}/split_{split}" -TODO: how to distribute HPO to mulitple GPUs? +mpirun -np 10 python hpo_subprocess.py """ # import copy import json @@ -40,7 +42,7 @@ rank = comm.Get_rank() size = comm.Get_size() -num_gpus_per_node = 3 +num_gpus_per_node = 4 os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) #cuda_name = "cuda:" + str(rank % num_gpus_per_node) From 4373efa8e21561fc9c5d500dd5057f7a173f4634 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Mon, 13 May 2024 19:58:29 +0000 Subject: [PATCH 108/254] add hpo_scale scripts add scripts for scaling hpo across multiple nodes and gpus --- get_hosts_polaris.py | 14 ++++++++++ hpo_scale.sh | 51 +++++++++++++++++++++++++++++++++++++ set_affinity_gpu_polaris.sh | 6 +++++ 3 files changed, 71 insertions(+) create mode 100644 get_hosts_polaris.py create mode 100644 hpo_scale.sh create mode 100755 set_affinity_gpu_polaris.sh diff --git a/get_hosts_polaris.py b/get_hosts_polaris.py new file mode 100644 index 0000000..196008a --- /dev/null +++ b/get_hosts_polaris.py @@ -0,0 +1,14 @@ +import sys + +if __name__ == "__main__": + ranks_per_node = 4 + fname = sys.argv[1] + output = "" + with open(fname, "r") as f: + for i, line in enumerate(f): + line = line.strip("\n") + if i == 0: + output += f"{line}" + for _ in range(ranks_per_node): + output += f",{line}" + print(output) \ No newline at end of file diff --git a/hpo_scale.sh b/hpo_scale.sh new file mode 100644 index 0000000..e21ca2d --- /dev/null +++ b/hpo_scale.sh @@ -0,0 +1,51 @@ +#!/bin/bash +#PBS -l select=2:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:60:00 +#PBS -q debug +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +# Assume conda is installed +conda_path=$(dirname $(dirname $(which conda))) +# Assume dh_env is defined +source $conda_path/bin/activate $dh_env + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
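# For reference, with a hypothetical $PBS_NODEFILE listing two hosts, nodeA and nodeB,
# get_hosts_polaris.py above (used below to build RANKS_HOSTS) prints
#   nodeA,nodeA,nodeA,nodeA,nodeA,nodeB,nodeB,nodeB,nodeB
# i.e. NRANKS_PER_NODE entries per host plus one extra entry for the first host, which is
# why NTOTRANKS is computed as NNODES * NRANKS_PER_NODE + 1 (the extra rank presumably
# runs the DeepHyper search head while the remaining ranks evaluate configurations).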
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall \ ./set_affinity_gpu_polaris.sh python hpo_subprocess.py \ No newline at end of file diff --git a/set_affinity_gpu_polaris.sh b/set_affinity_gpu_polaris.sh new file mode 100755 index 0000000..f3d4915 --- /dev/null +++ b/set_affinity_gpu_polaris.sh @@ -0,0 +1,6 @@ +#!/bin/bash +num_gpus=4 +gpu=$((${PMI_LOCAL_RANK} % ${num_gpus})) +export CUDA_VISIBLE_DEVICES=$gpu +echo “RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}” +exec "$@" From 89119a9dd961470ebbf4c090eb4e4eb7cdf454a2 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Tue, 14 May 2024 16:22:59 +0000 Subject: [PATCH 109/254] update hpo_scale use source to load environment variables required for IMPROVE --- hpo_scale.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hpo_scale.sh b/hpo_scale.sh index e21ca2d..018b2e3 100644 --- a/hpo_scale.sh +++ b/hpo_scale.sh @@ -12,11 +12,13 @@ set -xe # Move to the directory where `qsub example-improve.sh` was run cd ${PBS_O_WORKDIR} +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + # Activate the current environement (module load, conda activate etc...) 
-module load PrgEnv-gnu # Assume conda is installed +module load PrgEnv-gnu conda_path=$(dirname $(dirname $(which conda))) -# Assume dh_env is defined source $conda_path/bin/activate $dh_env # Resource allocation for DeepHyper @@ -47,5 +49,6 @@ echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} # ensure that mpi is pointing to the one within deephyper conda environment # set_affinity_gpu_polaris.sh does not seem to work right now # but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, -mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ - --envall \ ./set_affinity_gpu_polaris.sh python hpo_subprocess.py \ No newline at end of file +${dh_env}/bin/mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall \ + ./set_affinity_gpu_polaris.sh python hpo_subprocess.py \ No newline at end of file From 284c0fbbb51bcd866454962685e7083e6c43e87d Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Tue, 14 May 2024 16:23:13 +0000 Subject: [PATCH 110/254] update readme --- README.md | 64 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 6385163..ffd7220 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# Setup environment on Polaris for deephyper +# Run HPO using deephyper on Polaris -Install conda environment for deephyper +## Install deephyper environment ``` git clone -b deephyper https://github.com/Liuy12/PathDSP.git bash ./PathDSP/install_polaris.sh ``` -Install conda environment for the curated model (PathDSP) +## Install conda environment for the curated model (PathDSP) ``` ## install IMPROVE @@ -19,69 +19,71 @@ conda activate ${PathDSP_env} pip install git+https://github.com/ECP-CANDLE/candle_lib@develop ``` -Download csa benchmark data +## Download csa benchmark data ``` wget --cut-dirs=7 -P ./ -nH -np -m ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data ``` -Download additional author data (PathDSP only) +## Download additional author data (PathDSP only) ``` mkdir author_data bash ./PathDSP/download_author_data.sh author_data/ ``` -Define environment variables +## Define environment variables ``` -### need to firstly request an interactive node first from polaris -### use debug queue for testing -### it might take a while for a node to become available -qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug +### if necessary, request an interactive node from polaris to testing purposes +### qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug ### NEED to cd into your working directory again once the job started improve_lib="$PWD/IMPROVE/" pathdsp_lib="$PWD/PathDSP/" # notice the extra PathDSP folder after pathdsp_lib -export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ -export IMPROVE_DATA_DIR="$PWD/csa_data/" -export AUTHOR_DATA_DIR="$PWD/author_data/" -export PathDSP_env="$PWD/PathDSP_env/" +echo "export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/" >> IMPROVE_env +# IMPROVE_DATA_DIR +echo "export IMPROVE_DATA_DIR=$PWD/csa_data/" >> IMPROVE_env +# AUTHOR_DATA_DIR required for PathDSP +echo "export AUTHOR_DATA_DIR=$PWD/author_data/" >> IMPROVE_env +# PathDSP_env: conda environment path for the model +echo "export PathDSP_env=$PWD/PathDSP_env/" >> IMPROVE_env +# dh_env: conda envoronment path for deephyper +echo "export dh_env=$PWD/dhenv/" >> IMPROVE_env +source $PWD/IMPROVE_env ``` -Perform preprocessing +## Perform preprocessing ``` conda activate 
$PathDSP_env -## You can copy the processed files under my home dir +## You can copy the processed files for PathDSP cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ ./PathDSP/ ## Alternatively, run the preprocess script ## This script taks around 40 mins to complete ## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ ``` -Activate deephyper environment - -``` -# the .sh script sometimes does not activate the environment somehow -# bash ./activate-dhenv.sh -module load PrgEnv-gnu -#module load conda/2023-10-04 -conda activate ./dhenv/ -``` - -Perform HPO using conda +## Perform HPO across two nodes based on conda ``` cd PathDSP -## make sure mpirun is from the current conda environment -mpirun -np 10 python hpo_subprocess.py +# supply environment variables to qsub +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh +## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py ``` -Alternatively, perform HPO using singularity container +## Alternatively, perform HPO using singularity container across two nodes ``` +## copy processed to IMPROVE_DATA_DIR +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_DATA_DIR +## specify singularity image file for PathDSP +echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env +## enable singularity on polaris module use /soft/spack/gcc/0.6.1/install/modulefiles/Core module load apptainer -mpirun -np 10 python hpo_subprocess_singularity.py +cd PathDSP +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity.sh +## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py ``` \ No newline at end of file From 18a467b358625afc557e022e3bd78a0c1aefd2e2 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Tue, 14 May 2024 16:23:39 +0000 Subject: [PATCH 111/254] add hpo scale script for singularity --- hpo_scale_singularity.sh | 58 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 hpo_scale_singularity.sh diff --git a/hpo_scale_singularity.sh b/hpo_scale_singularity.sh new file mode 100644 index 0000000..dfefae2 --- /dev/null +++ b/hpo_scale_singularity.sh @@ -0,0 +1,58 @@ +#!/bin/bash +#PBS -l select=2:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:60:00 +#PBS -q debug +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +# Assume conda is installed +conda_path=$(dirname $(dirname $(which conda))) +echo $dh_env +source $conda_path/bin/activate $dh_env + +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
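# For reference, the IMPROVE_env file sourced above is built by the echo commands in the
# README; with a hypothetical working directory /path/to/workdir it would contain roughly:
#   export PYTHONPATH=...:/path/to/workdir/IMPROVE/:/path/to/workdir/PathDSP//PathDSP/
#   export IMPROVE_DATA_DIR=/path/to/workdir/csa_data/
#   export AUTHOR_DATA_DIR=/path/to/workdir/author_data/
#   export PathDSP_env=/path/to/conda_env_for_PathDSP   (hypothetical env path)
#   export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif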
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +${dh_env}/bin/mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file From 5a1f43c7ed06e289c3861379bec136e08048138b Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 24 May 2024 19:33:15 +0000 Subject: [PATCH 112/254] update hpo scripts --- README.md | 16 +++------------- hpo_scale.sh | 8 ++++++-- hpo_scale_singularity.sh | 11 ++++++----- hpo_subprocess.py | 10 +++++++--- hpo_subprocess_singularity.py | 13 +++++++------ install_polaris.sh | 13 ++++++++----- 6 files changed, 37 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index ffd7220..a7e1a68 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,10 @@ # Run HPO using deephyper on Polaris -## Install deephyper environment - -``` -git clone -b deephyper https://github.com/Liuy12/PathDSP.git -bash ./PathDSP/install_polaris.sh -``` - ## Install conda environment for the curated model (PathDSP) ``` +## install PathDSP +git clone -b deephyper https://github.com/Liuy12/PathDSP.git ## install IMPROVE git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git ## define where to install PathDSP env @@ -47,9 +42,7 @@ echo "export IMPROVE_DATA_DIR=$PWD/csa_data/" >> IMPROVE_env # AUTHOR_DATA_DIR required for PathDSP echo "export AUTHOR_DATA_DIR=$PWD/author_data/" >> IMPROVE_env # PathDSP_env: conda environment path for the model -echo "export PathDSP_env=$PWD/PathDSP_env/" >> IMPROVE_env -# dh_env: conda envoronment path for deephyper -echo "export dh_env=$PWD/dhenv/" >> IMPROVE_env +echo "export PathDSP_env=$PathDSP_env" >> IMPROVE_env source $PWD/IMPROVE_env ``` @@ -80,9 +73,6 @@ qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_DATA_DIR ## specify singularity image file for PathDSP echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env -## enable singularity on polaris -module use /soft/spack/gcc/0.6.1/install/modulefiles/Core -module load apptainer cd PathDSP qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity.sh ## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py diff --git a/hpo_scale.sh b/hpo_scale.sh index 018b2e3..93d517f 100644 --- a/hpo_scale.sh +++ b/hpo_scale.sh @@ -18,8 +18,12 @@ source $IMPROVE_env # Activate the current environement (module load, conda activate etc...) 
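# For context on how this script receives its settings: the README change above
# submits it with `qsub -v`, and by default a PBS job does not inherit the
# submission shell's environment, so variables must be forwarded explicitly, e.g.
# (path is a placeholder, not taken from the repo):
#   qsub -v IMPROVE_env=/path/to/IMPROVE_env ./hpo_scale.sh
# Inside the job, the `source $IMPROVE_env` line above then restores
# IMPROVE_DATA_DIR, PYTHONPATH, PathDSP_env and the other exported variables.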
# Assume conda is installed module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment conda_path=$(dirname $(dirname $(which conda))) -source $conda_path/bin/activate $dh_env +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env # Resource allocation for DeepHyper export NDEPTH=16 @@ -49,6 +53,6 @@ echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} # ensure that mpi is pointing to the one within deephyper conda environment # set_affinity_gpu_polaris.sh does not seem to work right now # but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, -${dh_env}/bin/mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ --envall \ ./set_affinity_gpu_polaris.sh python hpo_subprocess.py \ No newline at end of file diff --git a/hpo_scale_singularity.sh b/hpo_scale_singularity.sh index dfefae2..65303d3 100644 --- a/hpo_scale_singularity.sh +++ b/hpo_scale_singularity.sh @@ -17,11 +17,12 @@ source $IMPROVE_env # Activate the current environement (module load, conda activate etc...) module load PrgEnv-gnu -# Assume conda is installed +module use /soft/modulefiles +module load conda +# activate base environment conda_path=$(dirname $(dirname $(which conda))) -echo $dh_env -source $conda_path/bin/activate $dh_env - +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env # load module to run singularity container module use /soft/spack/gcc/0.6.1/install/modulefiles/Core module load apptainer @@ -54,5 +55,5 @@ echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} # ensure that mpi is pointing to the one within deephyper conda environment # set_affinity_gpu_polaris.sh does not seem to work right now # but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, -${dh_env}/bin/mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_subprocess.py b/hpo_subprocess.py index f24fc8e..b925ec1 100644 --- a/hpo_subprocess.py +++ b/hpo_subprocess.py @@ -25,6 +25,7 @@ from deephyper.problem import HpProblem from deephyper.search.hps import CBO from mpi4py import MPI +import socket # --------------------- # Enable using multiple GPUs @@ -41,9 +42,12 @@ comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() +local_rank = os.environ["PMI_LOCAL_RANK"] -num_gpus_per_node = 4 -os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh +# uncomment the below commands if running via interactive node +#num_gpus_per_node = 4 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) #cuda_name = "cuda:" + str(rank % num_gpus_per_node) # --------------------- @@ -157,5 +161,5 @@ def run(job, optuna_trial=None): results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) results.to_csv(model_outdir + "/hpo_results.csv", index=False) - + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py index 1834954..8d53dfe 100644 --- a/hpo_subprocess_singularity.py +++ b/hpo_subprocess_singularity.py @@ -1,6 +1,5 @@ """ Before running this script, first need to preprocess the data. 
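Aside on GPU pinning (hedged): the set_affinity_gpu_polaris.sh wrapper that the
job scripts route mpiexec through is not included in this patch series; a
typical Polaris wrapper of that kind -- an assumption, not the repository's
actual file -- picks a GPU from the local MPI rank and then execs the wrapped
command:

    #!/bin/bash
    num_gpus=4
    gpu=$(( num_gpus - 1 - PMI_LOCAL_RANK % num_gpus ))
    export CUDA_VISIBLE_DEVICES=$gpu
    exec "$@"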
-This can be done by running preprocess_example.sh It is assumed that the csa benchmark data is downloaded via download_csa.sh and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: @@ -8,8 +7,6 @@ export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib mpirun -np 10 python hpo_subprocess.py - -TODO: how to distribute HPO to mulitple GPUs? """ # import copy import json @@ -23,6 +20,7 @@ from deephyper.evaluator.callback import TqdmCallback from deephyper.problem import HpProblem from deephyper.search.hps import CBO +import socket # --------------------- # Enable using multiple GPUs @@ -39,9 +37,12 @@ comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() +local_rank = os.environ["PMI_LOCAL_RANK"] -num_gpus_per_node = 3 -os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh +# uncomment the below commands if running via interactive node +#num_gpus_per_node = 3 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) # --------------------- # Enable logging @@ -153,5 +154,5 @@ def run(job, optuna_trial=None): results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) results.to_csv(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir, "hpo_results.csv"), index=False) - + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") diff --git a/install_polaris.sh b/install_polaris.sh index 4970f35..46695e8 100644 --- a/install_polaris.sh +++ b/install_polaris.sh @@ -1,5 +1,8 @@ #!/bin/bash +# install script is not needed as it's troublesome to install mpi4py libraray +# use conda base environment directly instead + # From Romain Egele (this script is called install.sh) # Generic installation script for DeepHyper on ALCF's Polaris. 
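# Concretely, "use conda base environment directly" corresponds to the module
# sequence the updated job scripts in this same patch series use:
#   module use /soft/modulefiles
#   module load conda
#   conda_path=$(dirname $(dirname $(which conda)))
#   source $conda_path/bin/activate base
# (the same lines appear in hpo_scale.sh and the hpo_scale_singularity*.sh scripts above)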
@@ -27,11 +30,11 @@ pip install --upgrade pip # For mpi4py #module swap PrgEnv-nvhpc PrgEnv-gnu module load nvhpc-mixed -# git clone https://github.com/mpi4py/mpi4py.git -# cd mpi4py/ -# MPICC=CC python setup.py install -# cd ../ -conda install mpi4py --yes +git clone https://github.com/mpi4py/mpi4py.git +cd mpi4py/ +MPICC=CC python setup.py install +cd ../ +#conda install mpi4py --yes # Install the DeepHyper's Python package git clone -b develop git@github.com:deephyper/deephyper.git From 95e9bc37a7eda26c5ea7c26c219935fc2be2510e Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 7 Jun 2024 14:56:29 +0000 Subject: [PATCH 113/254] update default params --- PathDSP_default_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index d6fea26..37f9a28 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -33,7 +33,7 @@ loss = "mse" early_stop_metric = "mse" patience = 30 cuda_name = "cuda:0" -learning_rate = 0.001 +learning_rate = 0.0004 dropout=0.1 [Infer] From f3e1631689ad648630e84a77510398bf25dfc698 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 7 Jun 2024 14:58:06 +0000 Subject: [PATCH 114/254] set epochs to 50 --- subprocess_train_singularity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subprocess_train_singularity.sh b/subprocess_train_singularity.sh index 7e3cf06..7bb8f05 100755 --- a/subprocess_train_singularity.sh +++ b/subprocess_train_singularity.sh @@ -36,7 +36,7 @@ echo "batch_size: $batch_size" echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # epochs=10 -epochs=10 +epochs=50 # epochs=50 # All train outputs are saved in params["model_outdir"] From cb3e34a11b632b370352af34a2e8b68b5de303a6 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 7 Jun 2024 14:58:38 +0000 Subject: [PATCH 115/254] update default params values --- hpo_subprocess_singularity.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py index 8d53dfe..b81d7d8 100644 --- a/hpo_subprocess_singularity.py +++ b/hpo_subprocess_singularity.py @@ -60,17 +60,17 @@ # --------------------- problem = HpProblem() -problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=64) +problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=8) problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), - "learning_rate", default_value=0.001) + "learning_rate", default_value=0.0004) # problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) # problem.add_hyperparameter([True, False], "early_stopping", default_value=False) # --------------------- # Some IMPROVE settings # --------------------- -source = "GDSCv1" -split = 4 +source = "gCSI" +split = 0 train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" model_outdir = f"dh_hpo_improve/{source}/split_{split}" @@ -149,7 +149,7 @@ def run(job, optuna_trial=None): # max_evals = 4 # max_evals = 10 # max_evals = 20 - max_evals = 10 + max_evals = 100 # max_evals = 100 results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) From e0aedfd08522048865c456d02b27655e9a138f67 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 7 Jun 2024 18:41:45 +0000 Subject: [PATCH 116/254] update scaling scripts provide three scripts for submitting jobs to queues: debug, debug_scaling, prod --- ...arity.sh => 
hpo_scale_singularity_debug.sh | 3 +- hpo_scale_singularity_debug_scaling.sh | 60 +++++++++++++++++++ hpo_scale_singularity_prod.sh | 60 +++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) rename hpo_scale_singularity.sh => hpo_scale_singularity_debug.sh (96%) create mode 100644 hpo_scale_singularity_debug_scaling.sh create mode 100644 hpo_scale_singularity_prod.sh diff --git a/hpo_scale_singularity.sh b/hpo_scale_singularity_debug.sh similarity index 96% rename from hpo_scale_singularity.sh rename to hpo_scale_singularity_debug.sh index 65303d3..f8aa1b6 100644 --- a/hpo_scale_singularity.sh +++ b/hpo_scale_singularity_debug.sh @@ -1,11 +1,12 @@ #!/bin/bash -#PBS -l select=2:system=polaris +#PBS -l select=2:ngpus=4:system=polaris #PBS -l place=scatter #PBS -l walltime=00:60:00 #PBS -q debug #PBS -A IMPROVE #PBS -l filesystems=home:eagle #PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu set -xe diff --git a/hpo_scale_singularity_debug_scaling.sh b/hpo_scale_singularity_debug_scaling.sh new file mode 100644 index 0000000..722822f --- /dev/null +++ b/hpo_scale_singularity_debug_scaling.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=4:ncpus=64:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:30:00 +#PBS -q debug-scaling +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
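# The "small python package which queries nvidia-smi" is not named here; judging
# from the GPUSTAT_LOG_DIR variable below it is presumably gpustat
# (pip install gpustat), in which case a minimal ad-hoc sampler could look like:
#   while true; do gpustat --json >> ${GPUSTAT_LOG_DIR}/$(hostname).jsonl; sleep 10; done &
# (a sketch under that assumption, not the actual profile_gpu_polaris.sh referenced above)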
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_scale_singularity_prod.sh b/hpo_scale_singularity_prod.sh new file mode 100644 index 0000000..99fd639 --- /dev/null +++ b/hpo_scale_singularity_prod.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=10:ncpus=64:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:10:00 +#PBS -q prod +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
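# Rough evaluation budget for this production layout, assuming every rank beyond
# the deephyper head acts as a worker (an assumption, as in the other scripts):
#   echo $(( 10 * 4 + 1 ))   # select=10 nodes x NRANKS_PER_NODE=4 -> 41 ranks, 40 workers
#   echo $(( 100 / 40 ))     # max_evals=100 from hpo_subprocess_singularity.py -> 2 full
#                            # rounds of trainings per worker, plus a partial third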
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file From b7e62c85c7ac22f599e10001f6b6a0110eb5de61 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 7 Jun 2024 21:20:14 +0000 Subject: [PATCH 117/254] update readme --- README.md | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a7e1a68..725ce02 100644 --- a/README.md +++ b/README.md @@ -57,16 +57,7 @@ cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ ./PathDSP ## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ ``` -## Perform HPO across two nodes based on conda - -``` -cd PathDSP -# supply environment variables to qsub -qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh -## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py -``` - -## Alternatively, perform HPO using singularity container across two nodes +## Perform HPO using singularity container across two nodes ``` ## copy processed to IMPROVE_DATA_DIR @@ -74,6 +65,19 @@ cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_ ## specify singularity image file for PathDSP echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env cd PathDSP -qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity.sh +## submit to debug queue +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity_debug.sh +## to submit to debug-scaling or prod queue +## use hpo_scale_singularity_debug_scaling.sh +## or hpo_scale_singularity_prod.sh ## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py +``` + +## Alternatively, perform HPO across two nodes based on conda + +``` +cd PathDSP +# supply environment variables to qsub +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh +## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py ``` \ No newline at end of file From 11b93abce2fb2ce0de738298994f6e845e554eb2 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Fri, 7 Jun 2024 21:20:50 +0000 Subject: [PATCH 118/254] add dropout as hp add dropout rate as one of the hyperparameters --- hpo_subprocess_singularity.py | 4 +++- subprocess_train_singularity.sh | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py index b81d7d8..f6ff46b 100644 --- a/hpo_subprocess_singularity.py +++ b/hpo_subprocess_singularity.py @@ -63,7 +63,7 @@ problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=8) problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), "learning_rate", default_value=0.0004) -# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.1) # 
problem.add_hyperparameter([True, False], "early_stopping", default_value=False) # --------------------- @@ -94,6 +94,7 @@ def run(job, optuna_trial=None): model_outdir_job_id = model_outdir + f"/{job.id}" learning_rate = job.parameters["learning_rate"] batch_size = job.parameters["batch_size"] + dropout = job.parameters["dropout"] # val_scores = main_train_grapdrp([ # "--train_ml_data_dir", str(train_ml_data_dir), # "--val_ml_data_dir", str(val_ml_data_dir), @@ -107,6 +108,7 @@ def run(job, optuna_trial=None): str(model_outdir_job_id), str(learning_rate), str(batch_size), + str(dropout), str(os.environ["CUDA_VISIBLE_DEVICES"]) ], capture_output=True, text=True, check=True diff --git a/subprocess_train_singularity.sh b/subprocess_train_singularity.sh index 7bb8f05..a700cb7 100755 --- a/subprocess_train_singularity.sh +++ b/subprocess_train_singularity.sh @@ -26,13 +26,15 @@ val_ml_data_dir=$2 model_outdir=$3 learning_rate=$4 batch_size=$5 -CUDA_VISIBLE_DEVICES=$6 +dropout=$6 +CUDA_VISIBLE_DEVICES=$7 echo "train_ml_data_dir: $train_ml_data_dir" echo "val_ml_data_dir: $val_ml_data_dir" echo "model_outdir: $model_outdir" echo "learning_rate: $learning_rate" echo "batch_size: $batch_size" +echo "dropout: $dropout" echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # epochs=10 @@ -54,5 +56,5 @@ epochs=50 #echo "Deactivated conda env $CONDA_ENV" echo "train using singularity container" -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir $PathDSP_sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs --learning_rate $learning_rate --batch_size $batch_size +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir $PathDSP_sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs --learning_rate $learning_rate --batch_size $batch_size --dropout $dropout From e0d3973a457dd2b2e8ee578a2c360fb14e9b4543 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Wed, 12 Jun 2024 20:14:13 +0000 Subject: [PATCH 119/254] disable random seed --- PathDSP_train_improve.py | 2 +- hpo_subprocess_singularity.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 79c2ce3..1ab6c98 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -248,7 +248,7 @@ def run(params): params = preprocess(params) # set parameters - myutil.set_seed(params["seed_int"]) + #myutil.set_seed(params["seed_int"]) ## set device cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") if cuda_env_visible is not None: diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py index f6ff46b..abf7a9a 100644 --- a/hpo_subprocess_singularity.py +++ b/hpo_subprocess_singularity.py @@ -69,7 +69,7 @@ # --------------------- # Some IMPROVE settings # --------------------- -source = "gCSI" +source = "GDSCv1" split = 0 train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" From 32a8a93bd4e01f782ed8e921a85fc010aa253423 Mon Sep 17 00:00:00 2001 From: Yuanhang Liu Date: Wed, 12 Jun 2024 20:29:52 +0000 Subject: [PATCH 120/254] add readme for deephyper --- README_deephyper.md | 83 +++++++++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 83 insertions(+) create mode 100644 README_deephyper.md diff --git a/README_deephyper.md b/README_deephyper.md new file mode 100644 index 0000000..725ce02 --- /dev/null +++ b/README_deephyper.md @@ -0,0 +1,83 @@ +# Run HPO using deephyper on Polaris + +## Install conda environment for the curated model (PathDSP) + +``` +## install PathDSP +git clone -b deephyper https://github.com/Liuy12/PathDSP.git +## install IMPROVE +git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git +## define where to install PathDSP env +export PathDSP_env=./PathDSP_env/ +conda env create -f ./PathDSP/environment_082223.yml -p $PathDSP_env +conda activate ${PathDSP_env} +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +## Download csa benchmark data + +``` +wget --cut-dirs=7 -P ./ -nH -np -m ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +## Download additional author data (PathDSP only) + +``` +mkdir author_data +bash ./PathDSP/download_author_data.sh author_data/ +``` + +## Define environment variables + +``` +### if necessary, request an interactive node from polaris to testing purposes +### qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug +### NEED to cd into your working directory again once the job started +improve_lib="$PWD/IMPROVE/" +pathdsp_lib="$PWD/PathDSP/" +# notice the extra PathDSP folder after pathdsp_lib +echo "export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/" >> IMPROVE_env +# IMPROVE_DATA_DIR +echo "export IMPROVE_DATA_DIR=$PWD/csa_data/" >> IMPROVE_env +# AUTHOR_DATA_DIR required for PathDSP +echo "export AUTHOR_DATA_DIR=$PWD/author_data/" >> IMPROVE_env +# PathDSP_env: conda environment path for the model +echo "export PathDSP_env=$PathDSP_env" >> IMPROVE_env +source $PWD/IMPROVE_env +``` + +## Perform preprocessing + +``` +conda activate $PathDSP_env +## You can copy the processed files for PathDSP +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ ./PathDSP/ +## Alternatively, run the preprocess script +## This script taks around 40 mins to complete +## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ +``` + +## Perform HPO using singularity container across two nodes + +``` +## copy processed to IMPROVE_DATA_DIR +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_DATA_DIR +## specify singularity image file for PathDSP +echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env +cd PathDSP +## submit to debug queue +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity_debug.sh +## to submit to debug-scaling or prod queue +## use hpo_scale_singularity_debug_scaling.sh +## or hpo_scale_singularity_prod.sh +## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py +``` + +## Alternatively, perform HPO across two nodes based on conda + +``` +cd PathDSP +# supply environment variables to qsub +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh +## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py +``` \ No newline at end of file From ae39d4ac1fa9d4dae596bbfb2cdce953a9066c87 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:11:42 -0400 Subject: [PATCH 121/254] preprocess - framework-api changes --- PathDSP_preprocess_improve.py | 27 ++++++++++++++------------- 1 file changed, 14 
insertions(+), 13 deletions(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 43afbd7..fdb6af3 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -5,10 +5,16 @@ import argparse import numpy as np import pandas as pd -import candle +#import candle from functools import reduce -from improve import drug_resp_pred as drp -from improve import framework as frm +#from improve import drug_resp_pred as drp +#from improve import framework as frm +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig #NCK +from improvelib.utils import str2bool #NCK +import improvelib.utils as frm #NCK +import improvelib.applications.drug_response_prediction.drug_utils as drugs #NCK +import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK +import improvelib.applications.drug_response_prediction.drp_utils as drp #NCK from pathlib import Path from rdkit import Chem from rdkit.Chem import AllChem @@ -153,7 +159,7 @@ def smile2bits(params): response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] response_df = pd.concat(response_df, ignore_index=True) - smile_df = drp.DrugsLoader(params) + smile_df = drugs.DrugsLoader(params) smile_df = smile_df.dfs['drug_SMILES.tsv'] smile_df = smile_df.reset_index() @@ -229,7 +235,7 @@ def run_netpea(params, dtype, multiply_expression): cpu_int = params["cpu_int"] response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] response_df = pd.concat(response_df, ignore_index=True) - omics_data = drp.OmicsLoader(params) + omics_data = omics.OmicsLoader(params) if dtype == "DGnet": drug_info = pd.read_csv(os.environ["IMPROVE_DATA_DIR"] + "/raw_data/x_data/drug_info.tsv", sep="\t") @@ -434,7 +440,7 @@ def run_ssgsea(params): # canc_col_name="improve_sample_id", # gene_system_identifier="Gene_Symbol", # ) - omics_data = drp.OmicsLoader(params) + omics_data = omics.OmicsLoader(params) expMat = omics_data.dfs['cancer_gene_expression.tsv'] expMat = expMat.set_index(params['canc_col_name']) @@ -502,13 +508,8 @@ def run(params): def main(args): - params = frm.initialize_parameters( - file_path, - default_model="PathDSP_default_model.txt", - #default_model="PathDSP_cs_model.txt", - additional_definitions=preprocess_params, - required=req_preprocess_args, - ) + #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args) + params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args) run(params) From c3dd54a7dbc34a8805df4b65f8495a8ed0319220 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:17:36 -0400 Subject: [PATCH 122/254] train - framework-api changes --- PathDSP_train_improve.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 1ab6c98..d97faa7 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -1,10 +1,11 @@ -import candle import os import sys import datetime # IMPROVE/CANDLE imports -from improve import framework as frm -from improve.metrics import compute_metrics +#from improve import framework as frm +#from improve.metrics import compute_metrics +from 
improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK +import improvelib.utils as frm #NCK #from model_utils.torch_utils import predicting #import json #from json import JSONEncoder @@ -344,13 +345,8 @@ def main(args): additional_definitions = model_preproc_params + \ model_train_params + \ app_train_params - params = frm.initialize_parameters( - file_path, - default_model="PathDSP_default_model.txt", - #default_model="PathDSP_cs_model.txt", - additional_definitions=additional_definitions, - required=None, - ) + #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) + params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) #NCK # get node name params["node_name"] = socket.gethostname() val_scores = run(params) From e6111903453eb350ab651ba0a2f4fd885c6ec20b Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:18:36 -0400 Subject: [PATCH 123/254] epochs = 3 for testing --- PathDSP_default_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 37f9a28..093c881 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -26,7 +26,7 @@ val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" model_outdir = "./out_models/GDSCv1/split_4" model_file_name = "model" model_file_format = ".pt" -epochs=500 +epochs=3 batch_size = 12 val_batch = 12 loss = "mse" From 1cbaf32f91a3597461ddae48486220978cdfc08d Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:56:52 -0400 Subject: [PATCH 124/254] forgot cfg = --- PathDSP_preprocess_improve.py | 1 + PathDSP_train_improve.py | 1 + 2 files changed, 2 insertions(+) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index fdb6af3..0b632b8 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -509,6 +509,7 @@ def run(params): def main(args): #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args) + cfg = DRPPreprocessConfig() #NCK params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args) run(params) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index d97faa7..fa8065c 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -346,6 +346,7 @@ def main(args): model_train_params + \ app_train_params #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) + cfg = DRPTrainConfig() #NCK params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) #NCK # get node name params["node_name"] = socket.gethostname() From 82ce3acbd6100f014fff95567ba390e3bbb3ac01 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:05:50 -0400 Subject: [PATCH 125/254] fix for output_dir --- PathDSP_preprocess_improve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 
0b632b8..2bcd961 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -491,6 +491,7 @@ def run_ssgsea(params): def run(params): params = frm.build_paths(params) + params["ml_data_outdir"] = params["output_dir"] #NCK frm.create_outdir(outdir=params["ml_data_outdir"]) params = preprocess(params) print("convert drug to bits.") From a54a57ea063d2c0b7882b7d7060b41c8e2c3a3ec Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:10:17 -0400 Subject: [PATCH 126/254] testing bug with splits --- PathDSP_params.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 12de5d5..e8879e2 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -2,9 +2,9 @@ model_name='PathDSP' [Preprocess] -train_split_file = "gCSI_split_0_train.txt" -val_split_file = "gCSI_split_0_val.txt" -test_split_file = "gCSI_split_0_test.txt" +train_split_file = 'gCSI_split_0_train.txt' +val_split_file = 'gCSI_split_0_val.txt' +test_split_file = 'gCSI_split_0_test.txt' ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] From 3dbe44297bdcd53b1f292501dbefc2f7e7c0327c Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:11:16 -0400 Subject: [PATCH 127/254] testing bug --- PathDSP_params.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index e8879e2..8d8b72b 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -2,9 +2,9 @@ model_name='PathDSP' [Preprocess] -train_split_file = 'gCSI_split_0_train.txt' -val_split_file = 'gCSI_split_0_val.txt' -test_split_file = 'gCSI_split_0_test.txt' +train_split_file = gCSI_split_0_train.txt +val_split_file = gCSI_split_0_val.txt +test_split_file = gCSI_split_0_test.txt ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] From cbec4aaa9eb6fdf2b8a7263c408edc31e4f29f90 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:31:28 -0400 Subject: [PATCH 128/254] extra raw_data --- PathDSP_preprocess_improve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 2bcd961..8fa64dd 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -238,7 +238,7 @@ def run_netpea(params, dtype, multiply_expression): omics_data = omics.OmicsLoader(params) if dtype == "DGnet": - drug_info = pd.read_csv(os.environ["IMPROVE_DATA_DIR"] + "/raw_data/x_data/drug_info.tsv", sep="\t") + drug_info = pd.read_csv(os.environ["IMPROVE_DATA_DIR"] + "/x_data/drug_info.tsv", sep="\t") drug_info["NAME"] = drug_info["NAME"].str.upper() target_info = pd.read_csv( params["author_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" From b34b0e607919645f4ba436b0690ffc71de666f7c Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:30:00 -0400 Subject: [PATCH 129/254] model_outdir = output_dir --- PathDSP_train_improve.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index fa8065c..c2b908b 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -242,6 +242,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): def run(params): + params["model_outdir"] = params["output_dir"] #NCK frm.create_outdir(outdir=params["model_outdir"]) modelpath = frm.build_model_path(params, model_dir=params["model_outdir"]) train_data_fname = frm.build_ml_data_name(params, stage="train") From db52a56453d7d9711ae380bb00030690b763e784 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:31:21 -0400 Subject: [PATCH 130/254] ml_data_outdir = input_dir --- PathDSP_train_improve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index c2b908b..3e89778 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -243,6 +243,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): def run(params): params["model_outdir"] = params["output_dir"] #NCK + params["ml_data_outdir"] = params["input_dir"] #NCK frm.create_outdir(outdir=params["model_outdir"]) modelpath = frm.build_model_path(params, model_dir=params["model_outdir"]) train_data_fname = frm.build_ml_data_name(params, stage="train") From 0ba3d03f2683eda4623d0247c477d8fd2b4fbc85 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:36:17 -0400 Subject: [PATCH 131/254] train_ml_data_dir to ml_data_dir, etc --- PathDSP_train_improve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 3e89778..8a6728e 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -270,8 +270,8 @@ def run(params): # [PathDSP] Prepare dataloaders # ------------------------------------------------------ print('loadinig data') - train_df = pl.read_csv(params["train_ml_data_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() - val_df = pl.read_csv(params["val_ml_data_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() + train_df = pl.read_csv(params["ml_data_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() + val_df = pl.read_csv(params["ml_data_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() Xtrain_arr = train_df.iloc[:, 0:-1].values Xvalid_arr = val_df.iloc[:, 0:-1].values ytrain_arr = train_df.iloc[:, -1].values From 278dc14f6da294a2a67f7e600159f162f1e055ca Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:37:51 -0400 Subject: [PATCH 132/254] ml_data_dir to input_dir --- PathDSP_train_improve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 8a6728e..81556aa 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -270,8 +270,8 @@ def run(params): # [PathDSP] Prepare dataloaders # ------------------------------------------------------ print('loadinig data') - train_df = pl.read_csv(params["ml_data_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() - val_df = pl.read_csv(params["ml_data_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() + train_df = pl.read_csv(params["input_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() + val_df = pl.read_csv(params["input_dir"] + "/" + val_data_fname, separator = 
"\t").to_pandas() Xtrain_arr = train_df.iloc[:, 0:-1].values Xvalid_arr = val_df.iloc[:, 0:-1].values ytrain_arr = train_df.iloc[:, -1].values From 7a9532349f2aca3d2d81473925c56510a67f3921 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:43:43 -0400 Subject: [PATCH 133/254] add data_format to config --- PathDSP_default_model.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 093c881..483da72 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -26,6 +26,7 @@ val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" model_outdir = "./out_models/GDSCv1/split_4" model_file_name = "model" model_file_format = ".pt" +data_format = ".txt" epochs=3 batch_size = 12 val_batch = 12 From 6ff3cc682765175412cfb3ee79b3bd85dbe51c54 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:57:52 -0400 Subject: [PATCH 134/254] quotes out of config --- PathDSP_default_model.txt | 54 +++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 483da72..fce2a9f 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -2,43 +2,43 @@ model_name='PathDSP' [Preprocess] -train_split_file = "GDSCv1_split_4_train.txt" -val_split_file = "GDSCv1_split_4_val.txt" -test_split_file = "GDSCv1_split_4_test.txt" -ml_data_outdir = "./ml_data/GDSCv1-GDSCv1/split_4" +train_split_file = GDSCv1_split_4_train.txt +val_split_file = GDSCv1_split_4_val.txt +test_split_file = GDSCv1_split_4_test.txt +ml_data_outdir = ./ml_data/GDSCv1-GDSCv1/split_4 x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] y_data_files = [["response.tsv"]] -data_format = ".txt" -drug_bits_file='drug_mbit_df.txt' -dgnet_file='DGnet.txt' -mutnet_file='MUTnet.txt' -cnvnet_file='CNVnet.txt' -exp_file='EXP.txt' -bit_int=128 -permutation_int=3 -seed_int=42 -cpu_int=20 +data_format = .txt +drug_bits_file = drug_mbit_df.txt +dgnet_file = DGnet.txt +mutnet_file = MUTnet.txt +cnvnet_file = CNVnet.txt +exp_file = EXP.txt +bit_int = 128 +permutation_int = 3 +seed_int = 42 +cpu_int = 20 [Train] -train_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" -val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" -model_outdir = "./out_models/GDSCv1/split_4" -model_file_name = "model" -model_file_format = ".pt" -data_format = ".txt" -epochs=3 +train_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 +val_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 +model_outdir = ./out_models/GDSCv1/split_4 +model_file_name = model +model_file_format = .pt +data_format = .txt +epochs = 3 batch_size = 12 val_batch = 12 -loss = "mse" -early_stop_metric = "mse" +loss = mse +early_stop_metric = mse patience = 30 -cuda_name = "cuda:0" +cuda_name = cuda:0 learning_rate = 0.0004 dropout=0.1 [Infer] -test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_4" -model_dir = "./out_models/GDSCv1/split_4" -infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_4" +test_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 +model_dir = ./out_models/GDSCv1/split_4 +infer_outdir = ./out_infer/GDSCv1-GDSCv1/split_4 test_batch = 256 \ No newline at end of file From c539f14ea467d3804d73bac5b949f1e04e8862be Mon Sep 17 00:00:00 2001 From: nkoussa 
<156325369+nkoussa@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:21:35 -0400 Subject: [PATCH 135/254] infer - framework-api changes --- PathDSP_infer_improve.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index fa308ed..43a32d7 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -1,4 +1,3 @@ -import candle import os import sys #import json @@ -20,9 +19,11 @@ import myDatasplit as mysplit import myUtility as myutil -from improve import framework as frm +#from improve import framework as frm # from improve.torch_utils import TestbedDataset -from improve.metrics import compute_metrics +#from improve.metrics import compute_metrics +from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK +import improvelib.utils as frm #NCK from PathDSP_train_improve import ( preprocess, @@ -41,6 +42,9 @@ model_infer_params = [] def run(params): + params["infer_outdir"] = params["output_dir"] + params["test_ml_data_dir"] = params["input_dir"] + params["model_dir"] = params["input_dir"] frm.create_outdir(outdir=params["infer_outdir"]) params = preprocess(params) test_data_fname = frm.build_ml_data_name(params, stage="test") @@ -75,17 +79,13 @@ def run(params): return test_scores def main(args): + cfg = DRPInferConfig() #NCK additional_definitions = model_preproc_params + \ model_train_params + \ model_infer_params + \ app_infer_params - params = frm.initialize_parameters( - file_path, - default_model="PathDSP_default_model.txt", - #default_model="PathDSP_cs_model.txt", - additional_definitions=additional_definitions, - required=None, - ) + #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) + params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) #NCK test_scores = run(params) print("\nFinished inference of PathDSP model.") From 69136fdfce4e360b941cb388f9e67077b8e509bb Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:23:47 -0400 Subject: [PATCH 136/254] another input dir --- PathDSP_infer_improve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 43a32d7..3ec3167 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -44,6 +44,7 @@ def run(params): params["infer_outdir"] = params["output_dir"] params["test_ml_data_dir"] = params["input_dir"] + params["ml_data_outdir"] = params["input_dir"] params["model_dir"] = params["input_dir"] frm.create_outdir(outdir=params["infer_outdir"]) params = preprocess(params) From 87305c4b556b1bfe7040b32659ab03b28325f8ea Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:24:53 -0400 Subject: [PATCH 137/254] update config --- PathDSP_default_model.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index fce2a9f..63d460b 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -41,4 +41,6 @@ dropout=0.1 test_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 model_dir = ./out_models/GDSCv1/split_4 infer_outdir = ./out_infer/GDSCv1-GDSCv1/split_4 -test_batch = 256 \ No newline at end of file +test_batch = 256 +model_file_name = model +model_file_format = .pt \ 
No newline at end of file From 2e50fd0943f3553812bf31f603e921a5b52c0c28 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:38:11 -0400 Subject: [PATCH 138/254] update config --- PathDSP_default_model.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index 63d460b..bc2c698 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -43,4 +43,5 @@ model_dir = ./out_models/GDSCv1/split_4 infer_outdir = ./out_infer/GDSCv1-GDSCv1/split_4 test_batch = 256 model_file_name = model -model_file_format = .pt \ No newline at end of file +model_file_format = .pt +data_format = .txt \ No newline at end of file From c0bb2a4c20881aa5541106b4ecabd385cc2defc6 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 29 Aug 2024 15:10:08 -0400 Subject: [PATCH 139/254] updates for I/O and params --- PathDSP_default_model.txt | 3 +- PathDSP_infer_improve.py | 55 ++++-------- PathDSP_parameter_definitions.py | 64 ++++++++++++++ PathDSP_preprocess_improve.py | 142 +++++-------------------------- PathDSP_train_improve.py | 81 ++++-------------- 5 files changed, 122 insertions(+), 223 deletions(-) create mode 100644 PathDSP_parameter_definitions.py diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt index bc2c698..c76f6c7 100644 --- a/PathDSP_default_model.txt +++ b/PathDSP_default_model.txt @@ -5,7 +5,7 @@ model_name='PathDSP' train_split_file = GDSCv1_split_4_train.txt val_split_file = GDSCv1_split_4_val.txt test_split_file = GDSCv1_split_4_test.txt -ml_data_outdir = ./ml_data/GDSCv1-GDSCv1/split_4 +# ml_data_outdir = ./ml_data/GDSCv1-GDSCv1/split_4 x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] y_data_files = [["response.tsv"]] @@ -19,6 +19,7 @@ bit_int = 128 permutation_int = 3 seed_int = 42 cpu_int = 20 +input_supp_data_dir = ../author_data [Train] train_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 3ec3167..5dbce68 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -1,61 +1,47 @@ import os import sys -#import json -#from json import JSONEncoder -from PathDSP_preprocess_improve import mkdir, preprocess -from PathDSP_train_improve import predicting import numpy as np import pandas as pd from datetime import datetime import torch as tch import torch.utils.data as tchud import polars as pl -import sklearn.metrics as skmts -#sys.path.append("/usr/local/PathDSP/PathDSP") -#sys.path.append("/usr/local/PathDSP/PathDSP") -#sys.path.append(os.getcwd() + "/PathDSP") import myModel as mynet import myDataloader as mydl -import myDatasplit as mysplit import myUtility as myutil -#from improve import framework as frm -# from improve.torch_utils import TestbedDataset -#from improve.metrics import compute_metrics -from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK -import improvelib.utils as frm #NCK - +from PathDSP_preprocess_improve import mkdir, preprocess from PathDSP_train_improve import ( + predicting, preprocess, cal_time, metrics_list, - model_preproc_params, - model_train_params, ) +from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK +import improvelib.utils as frm #NCK file_path = 
os.path.dirname(os.path.realpath(__file__)) -# [Req] App-specific params -app_infer_params = [] - -# [PathDSP] Model-specific params (Model: PathDSP) -model_infer_params = [] def run(params): - params["infer_outdir"] = params["output_dir"] - params["test_ml_data_dir"] = params["input_dir"] - params["ml_data_outdir"] = params["input_dir"] - params["model_dir"] = params["input_dir"] - frm.create_outdir(outdir=params["infer_outdir"]) + if "input_data_dir" in params: + data_dir = params["input_data_dir"] + else: + data_dir = params["input_dir"] + if "input_model_dir" in params: + model_dir = params["input_model_dir"] + else: + model_dir = params["input_dir"] + frm.create_outdir(outdir=params["output_dir"]) params = preprocess(params) test_data_fname = frm.build_ml_data_name(params, stage="test") - test_df = pl.read_csv(params["test_ml_data_dir"] + "/" + test_data_fname, separator = "\t").to_pandas() + test_df = pl.read_csv(data_dir + "/" + test_data_fname, separator = "\t").to_pandas() Xtest_arr = test_df.iloc[:, 0:-1].values ytest_arr = test_df.iloc[:, -1].values Xtest_arr = np.array(Xtest_arr).astype('float32') ytest_arr = np.array(ytest_arr).astype('float32') trained_net = mynet.FNN(Xtest_arr.shape[1]) - modelpath = frm.build_model_path(params, model_dir=params["model_dir"]) + modelpath = frm.build_model_path(params, model_dir=model_dir) trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) @@ -70,23 +56,18 @@ def run(params): test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) frm.store_predictions_df( params, y_true=test_true, y_pred=test_pred, stage="test", - outdir=params["infer_outdir"] + outdir=params["output_dir"] ) test_scores = frm.compute_performace_scores( params, y_true=test_true, y_pred=test_pred, stage="test", - outdir=params["infer_outdir"], metrics=metrics_list + outdir=params["output_dir"], metrics=metrics_list ) print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) return test_scores def main(args): cfg = DRPInferConfig() #NCK - additional_definitions = model_preproc_params + \ - model_train_params + \ - model_infer_params + \ - app_infer_params - #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) - params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) #NCK + params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=None, required=None) #NCK test_scores = run(params) print("\nFinished inference of PathDSP model.") diff --git a/PathDSP_parameter_definitions.py b/PathDSP_parameter_definitions.py new file mode 100644 index 0000000..ad61721 --- /dev/null +++ b/PathDSP_parameter_definitions.py @@ -0,0 +1,64 @@ +pathdsp_preprocess_params = [ + {"name": "bit_int", + "type": int, + "default": 128, + "help": "Number of bits for morgan fingerprints.", + }, + {"name": "permutation_int", + "type": int, + "default": 3, + "help": "Number of permutation for calculating enrichment scores.", + }, + {"name": "seed_int", + "type": int, + "default": 42, + "help": "Random seed for random walk algorithm.", + }, + {"name": "cpu_int", + "type": int, + "default": 20, + "help": "Number of cpus to use when calculating pathway enrichment scores.", + }, + {"name": "drug_bits_file", + "type": str, + "default": "drug_mbit_df.txt", + "help": "File name 
to save the drug bits file.", + }, + {"name": "dgnet_file", + "type": str, + "default": "DGnet.txt", + "help": "File name to save the drug target net file.", + }, + {"name": "mutnet_file", + "type": str, + "default": "MUTnet.txt", + "help": "File name to save the mutation net file.", + }, + {"name": "cnvnet_file", + "type": str, + "default": "CNVnet.txt", + "help": "File name to save the CNV net file.", + }, + {"name": "exp_file", + "type": str, + "default": "EXPnet.txt", + "help": "File name to save the EXP net file.", + }, +] + +pathdsp_train_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1."}, + {"name": "learning_rate", + "type": float, + "default": 0.0001, + "help": "Learning rate for the optimizer." + }, + {"name": "dropout", + "type": float, + "default": 0.1, + "help": "Dropout rate for the optimizer." + }, +] \ No newline at end of file diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 8fa64dd..9bf898e 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -1,20 +1,9 @@ import sys import os -import numpy as np import polars as pl -import argparse import numpy as np import pandas as pd -#import candle from functools import reduce -#from improve import drug_resp_pred as drp -#from improve import framework as frm -from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig #NCK -from improvelib.utils import str2bool #NCK -import improvelib.utils as frm #NCK -import improvelib.applications.drug_response_prediction.drug_utils as drugs #NCK -import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK -import improvelib.applications.drug_response_prediction.drp_utils as drp #NCK from pathlib import Path from rdkit import Chem from rdkit.Chem import AllChem @@ -24,99 +13,21 @@ import gseapy as gp import sklearn.model_selection as skms from sklearn.preprocessing import StandardScaler +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig #NCK +from improvelib.utils import str2bool #NCK +import improvelib.utils as frm #NCK +import improvelib.applications.drug_response_prediction.drug_utils as drugs #NCK +import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK +import improvelib.applications.drug_response_prediction.drp_utils as drp #NCK +from PathDSP_parameter_definitions import pathdsp_preprocess_params file_path = Path(__file__).resolve().parent -app_preproc_params = [ - # These arg should be specified in the [modelname]_default_model.txt: - # y_data_files, x_data_canc_files, x_data_drug_files - {"name": "y_data_files", # default - "type": str, - "help": "List of files that contain the y (prediction variable) data. \ - Example: [['response.tsv']]", - }, - {"name": "x_data_canc_files", # [Req] - "type": str, - "help": "List of feature files including gene_system_identifer. Examples: \n\ - 1) [['cancer_gene_expression.tsv', ['Gene_Symbol']]] \n\ - 2) [['cancer_copy_number.tsv', ['Ensembl', 'Entrez']]].", - }, - {"name": "x_data_drug_files", # [Req] - "type": str, - "help": "List of feature files. 
Examples: \n\ - 1) [['drug_SMILES.tsv']] \n\ - 2) [['drug_SMILES.tsv'], ['drug_ecfp4_nbits512.tsv']]", - }, - {"name": "canc_col_name", - "default": "improve_sample_id", # default - "type": str, - "help": "Column name in the y (response) data file that contains the cancer sample ids.", - }, - {"name": "drug_col_name", # default - "default": "improve_chem_id", - "type": str, - "help": "Column name in the y (response) data file that contains the drug ids.", - }, - -] - -# [PathDSP] Model-specific params -model_preproc_params = [ - {"name": "bit_int", - "type": int, - "default": 128, - "help": "Number of bits for morgan fingerprints.", - }, - {"name": "permutation_int", - "type": int, - "default": 3, - "help": "Number of permutation for calculating enrichment scores.", - }, - {"name": "seed_int", - "type": int, - "default": 42, - "help": "Random seed for random walk algorithm.", - }, - {"name": "cpu_int", - "type": int, - "default": 20, - "help": "Number of cpus to use when calculating pathway enrichment scores.", - }, - {"name": "drug_bits_file", - "type": str, - "default": "drug_mbit_df.txt", - "help": "File name to save the drug bits file.", - }, - {"name": "dgnet_file", - "type": str, - "default": "DGnet.txt", - "help": "File name to save the drug target net file.", - }, - {"name": "mutnet_file", - "type": str, - "default": "MUTnet.txt", - "help": "File name to save the mutation net file.", - }, - {"name": "cnvnet_file", - "type": str, - "default": "CNVnet.txt", - "help": "File name to save the CNV net file.", - }, - {"name": "exp_file", - "type": str, - "default": "EXPnet.txt", - "help": "File name to save the EXP net file.", - }, -] - -preprocess_params = app_preproc_params + model_preproc_params -req_preprocess_args = [ll["name"] for ll in preprocess_params] - +req_preprocess_args = [ll["name"] for ll in pathdsp_preprocess_params] def mkdir(directory): directories = directory.split("/") - folder = "" for d in directories: folder += d + "/" @@ -126,7 +37,6 @@ def mkdir(directory): def preprocess(params): - params["author_data_dir"] = os.getenv("AUTHOR_DATA_DIR") for i in [ "drug_bits_file", "dgnet_file", @@ -134,8 +44,7 @@ def preprocess(params): "cnvnet_file", "exp_file", ]: - params[i] = params["ml_data_outdir"] + "/" + params[i] - + params[i] = params["output_dir"] + "/" + params[i] return params @@ -158,9 +67,7 @@ def smile2bits(params): start = datetime.now() response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] response_df = pd.concat(response_df, ignore_index=True) - smile_df = drugs.DrugsLoader(params) - smile_df = smile_df.dfs['drug_SMILES.tsv'] smile_df = smile_df.reset_index() smile_df.columns = ["drug", "smile"] @@ -212,10 +119,8 @@ def times_expression(rwr, exp): if len(gene_list) == 0: print("ERROR! 
no overlapping genes") sys.exit(1) - # multiply with gene expression for overlapping cell, gene rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] - # concat with other gene out_gene_list = list(set(rwr.columns) - set(gene_list)) out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) @@ -225,9 +130,9 @@ def times_expression(rwr, exp): def run_netpea(params, dtype, multiply_expression): # timer start_time = datetime.now() - ppi_path = params["author_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" + ppi_path = params["input_supp_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" pathway_path = ( - params["author_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" ) log_transform = False permutation_int = params["permutation_int"] @@ -236,12 +141,11 @@ def run_netpea(params, dtype, multiply_expression): response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] response_df = pd.concat(response_df, ignore_index=True) omics_data = omics.OmicsLoader(params) - if dtype == "DGnet": - drug_info = pd.read_csv(os.environ["IMPROVE_DATA_DIR"] + "/x_data/drug_info.tsv", sep="\t") + drug_info = pd.read_csv(params["input_dir"] + "/x_data/drug_info.tsv", sep="\t") drug_info["NAME"] = drug_info["NAME"].str.upper() target_info = pd.read_csv( - params["author_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" + params["input_supp_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" ) target_info = target_info.rename(columns={"drug": "NAME"}) combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( @@ -250,7 +154,7 @@ def run_netpea(params, dtype, multiply_expression): combined_df = combined_df.loc[ combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), ] - restart_path = params["ml_data_outdir"] + "/drug_target.txt" + restart_path = params["output_dir"] + "/drug_target.txt" combined_df.iloc[:, -2:].to_csv( restart_path, sep="\t", header=True, index=False ) @@ -264,7 +168,7 @@ def run_netpea(params, dtype, multiply_expression): mutation_data = mutation_data.loc[ mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), ] - restart_path = params["ml_data_outdir"] + "/mutation_data.txt" + restart_path = params["output_dir"] + "/mutation_data.txt" mutation_data.iloc[:, 0:2].to_csv( restart_path, sep="\t", header=True, index=False ) @@ -278,7 +182,7 @@ def run_netpea(params, dtype, multiply_expression): cnv_data = cnv_data.loc[ cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), ] - restart_path = params["ml_data_outdir"] + "/cnv_data.txt" + restart_path = params["output_dir"] + "/cnv_data.txt" cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) outpath = params["cnvnet_file"] # perform Random Walk @@ -350,7 +254,6 @@ def prep_input(params): columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} ) # Extract relevant IDs - common_drug_ids = reduce( np.intersect1d, (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), @@ -398,7 +301,6 @@ def prep_input(params): drug_data = drug_mbit_df.join(DGnet) sample_data = CNVnet.join([MUTnet, EXP]) ## export train,val,test set - # for i in ['train', 'test', 'val']: for i in ["train", "test", "val"]: response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) response_df = 
response_df.dfs['response.tsv'] @@ -427,7 +329,7 @@ def prep_input(params): comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() pl.from_pandas(comb_data_mtx).write_csv( - params["ml_data_outdir"] + "/" + frm.build_ml_data_name(params, i) + params["output_dir"] + "/" + frm.build_ml_data_name(params, i) , separator="\t", has_header=True ) @@ -452,10 +354,10 @@ def run_ssgsea(params): expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] gct = expMat.T # gene (rows) cell lines (columns) pathway_path = ( - params["author_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" ) gmt = pathway_path - tmp_str = params["ml_data_outdir"] + "/tmpdir_ssgsea/" + tmp_str = params["output_dir"] + "/tmpdir_ssgsea/" if not os.path.isdir(tmp_str): os.mkdir(tmp_str) @@ -491,8 +393,7 @@ def run_ssgsea(params): def run(params): params = frm.build_paths(params) - params["ml_data_outdir"] = params["output_dir"] #NCK - frm.create_outdir(outdir=params["ml_data_outdir"]) + frm.create_outdir(outdir=params["output_dir"]) params = preprocess(params) print("convert drug to bits.") smile2bits(params) @@ -509,9 +410,8 @@ def run(params): def main(args): - #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args) cfg = DRPPreprocessConfig() #NCK - params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=preprocess_params, required=req_preprocess_args) + params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=pathdsp_preprocess_params, required=req_preprocess_args) run(params) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 81556aa..b1233ad 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -1,66 +1,27 @@ import os import sys -import datetime -# IMPROVE/CANDLE imports -#from improve import framework as frm -#from improve.metrics import compute_metrics -from improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK -import improvelib.utils as frm #NCK -#from model_utils.torch_utils import predicting -#import json -#from json import JSONEncoder -from PathDSP_preprocess_improve import cal_time, preprocess, model_preproc_params, app_preproc_params, preprocess_params - -#sys.path.append("/usr/local/PathDSP/PathDSP") -#sys.path.append("/usr/local/PathDSP/PathDSP") -#sys.path.append(os.getcwd() + "/PathDSP") -#import FNN_new -import argparse import numpy as np import pandas as pd -import scipy.stats as scistat from datetime import datetime - -import sklearn.preprocessing as skpre -import sklearn.model_selection as skms -import sklearn.metrics as skmts -import sklearn.utils as skut - +import socket import torch as tch import torch.utils.data as tchud - import myModel as mynet import myDataloader as mydl import myUtility as myutil import polars as pl -import json -import socket + +from improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK +import improvelib.utils as frm #NCK + +from PathDSP_preprocess_improve import cal_time, preprocess +from PathDSP_parameter_definitions import pathdsp_train_params file_path = os.path.dirname(os.path.realpath(__file__)) # [Req] List of metrics names to be compute performance scores metrics_list = 
["mse", "rmse", "pcc", "scc", "r2"] -# Currently, there are no app-specific args for the train script. -app_train_params = [] - -# [PathDSP] Model-specific params (Model: PathDSP) -model_train_params = [ - {"name": "cuda_name", # TODO. frm. How should we control this? - "action": "store", - "type": str, - "help": "Cuda device (e.g.: cuda:0, cuda:1."}, - {"name": "learning_rate", - "type": float, - "default": 0.0001, - "help": "Learning rate for the optimizer." - }, - {"name": "dropout", - "type": float, - "default": 0.1, - "help": "Dropout rate for the optimizer." - }, -] class RMSELoss(tch.nn.Module): def __init__(self): @@ -72,8 +33,6 @@ def forward(self,x,y): loss = tch.sqrt(criterion(x, y) + eps) return loss - - def predicting(model, device, data_loader): """ Method to make predictions/inference. This is used in *train.py and *infer.py @@ -176,7 +135,7 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): trainloss_list = [] # metrics: MSE, size equals to EPOCH validloss_list = [] # metrics: MSE, size equals to EPOCH validr2_list = [] # metrics: r2, size equals to EPOCH - early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["model_outdir"] + "/checkpoint.pt") # initialize the early_stopping + early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["output_dir"] + "/checkpoint.pt") # initialize the early_stopping # repeat the training for EPOCH times start_total = datetime.now() for epoch in range(epochs): @@ -236,16 +195,14 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) # load the last checkpoint with the best model - net.load_state_dict(tch.load(params["model_outdir"] + '/checkpoint.pt')) + net.load_state_dict(tch.load(params["output_dir"] + '/checkpoint.pt')) return net, trainloss_list, validloss_list, validr2_list def run(params): - params["model_outdir"] = params["output_dir"] #NCK - params["ml_data_outdir"] = params["input_dir"] #NCK - frm.create_outdir(outdir=params["model_outdir"]) - modelpath = frm.build_model_path(params, model_dir=params["model_outdir"]) + frm.create_outdir(outdir=params["output_dir"]) + modelpath = frm.build_model_path(params, model_dir=params["output_dir"]) train_data_fname = frm.build_ml_data_name(params, stage="train") val_data_fname = frm.build_ml_data_name(params, stage="val") params = preprocess(params) @@ -313,7 +270,7 @@ def init_weights(m): 'train loss':train_loss_list, 'valid loss': valid_loss_list, 'valid r2': valid_r2_list}) - loss_df.to_csv(params['model_outdir'] + '/Val_Loss_orig.txt', header=True, index=False, sep="\t") + loss_df.to_csv(params['output_dir'] + '/Val_Loss_orig.txt', header=True, index=False, sep="\t") # make train/valid loss plots best_model = trained_net @@ -329,34 +286,30 @@ def init_weights(m): # import ipdb; ipdb.set_trace() frm.store_predictions_df( params, y_true=val_true, y_pred=val_pred, stage="val", - outdir=params["model_outdir"] + outdir=params["output_dir"] ) # ----------------------------- # [Req] Compute performance scores # ----------------------------- # import ipdb; ipdb.set_trace() - val_scores = frm.compute_performace_scores( + val_scores = frm.compute_performance_scores( params, y_true=val_true, y_pred=val_pred, stage="val", - outdir=params["model_outdir"], metrics=metrics_list + outdir=params["output_dir"], metrics=metrics_list ) return val_scores def main(args): - 
additional_definitions = model_preproc_params + \ - model_train_params + \ - app_train_params - #params = frm.initialize_parameters(file_path, default_model="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) cfg = DRPTrainConfig() #NCK - params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=additional_definitions, required=None) #NCK + params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=pathdsp_train_params, required=None) #NCK # get node name params["node_name"] = socket.gethostname() val_scores = run(params) # with open(params["model_outdir"] + '/params.json', 'w') as json_file: # json.dump(params, json_file, indent=4) df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) - df.to_csv(params["model_outdir"] + '/params.txt',sep="\t") + df.to_csv(params["output_dir"] + '/params.txt',sep="\t") From ce314e1212786e094c2fd4f2052503ef817b1edb Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:12:24 -0400 Subject: [PATCH 140/254] readme and setup_improve --- PathDSP/setup_improve.sh | 56 +++++++++ README.md | 245 ++++++++++++++++++--------------------- README_old2.md | 181 +++++++++++++++++++++++++++++ 3 files changed, 352 insertions(+), 130 deletions(-) create mode 100644 PathDSP/setup_improve.sh create mode 100644 README_old2.md diff --git a/PathDSP/setup_improve.sh b/PathDSP/setup_improve.sh new file mode 100644 index 0000000..834903b --- /dev/null +++ b/PathDSP/setup_improve.sh @@ -0,0 +1,56 @@ +#!/bin/bash --login +# Navigate to the dir with the cloned model repo +# Run it like this: source ./setup_improve.sh + +# set -e + +# Get current dir and model dir +model_path=$PWD +echo "Model path: $model_path" +model_name=$(echo "$model_path" | awk -F '/' '{print $NF}') +echo "Model name: $model_name" + +# Download data (if needed) +data_dir="csa_data" +if [ ! -d $PWD/$data_dir/ ]; then + echo "Download CSA data" + source download_csa.sh +else + echo "CSA data folder already exists" +fi + +# Download author data (if needed) - PathDSP specific +author_dir="author_data" +if [ ! 
-d $PWD/$author_dir/ ]; then + echo "Download author data" + mkdir author_data + source download_author_data.sh author_data/ +else + echo "Author data folder already exists" +fi + +# Env var IMPROVE_DATA_DIR +export IMPROVE_DATA_DIR="./$data_dir/" + +# Env var AUTHOR_DATA_DIR - PathDSP specific +export AUTHOR_DATA_DIR="./$author_dir/" + +# Clone IMPROVE lib (if needed) +pushd ../ +improve_lib_path=$PWD/IMPROVE +improve_branch="v0.0.3-beta" +if [ -d $improve_lib_path ]; then + echo "IMPROVE repo exists in ${improve_lib_path}" +else + # git clone https://github.com/JDACS4C-IMPROVE/IMPROVE + git clone -b $improve_branch https://github.com/JDACS4C-IMPROVE/IMPROVE +fi +pushd $model_name + +# Env var PYTHOPATH +export PYTHONPATH=$PYTHONPATH:$improve_lib_path + +echo +echo "IMPROVE_DATA_DIR: $IMPROVE_DATA_DIR" +echo "AUTHOR_DATA_DIR: $AUTHOR_DATA_DIR" +echo "PYTHONPATH: $PYTHONPATH" \ No newline at end of file diff --git a/README.md b/README.md index aca66fc..5f7bc5c 100644 --- a/README.md +++ b/README.md @@ -1,181 +1,166 @@ # PathDSP -Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores -# Download benchmark data +This repository demonstrates how to use the [IMPROVE library v0.0.3-beta](https://github.com/JDACS4C-IMPROVE/IMPROVE/tree/v0.0.3-beta) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). -Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ +This version, tagged as `v0.0.3-beta`, is the final release before transitioning to `v0.1.0-alpha`, which introduces a new API. Version `v0.0.3-beta` and all previous releases have served as the foundation for developing essential components of the IMPROVE software stack. Subsequent releases build on this legacy with an updated API, designed to encourage broader adoption of IMPROVE and its curated models by the research community. -``` -mkdir process_dir -cd process_dir -wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data -``` +A more detailed tutorial can be found [here](https://jdacs4c-improve.github.io/docs/v0.0.3-beta/content/ModelContributorGuide.html). -Benchmark data will be downloaded under `process_dir/csa_data/` -# Example usage with Conda +## Dependencies +Installation instuctions are detailed below in [Step-by-step instructions](#step-by-step-instructions). 
-Download PathDSP and IMPROVE +Conda `yml` file [environment_082223.yml](./environment_082223.yml) -``` -mkdir repo -cd repo -git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git -git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git -``` +ML framework: ++ [Torch](https://pytorch.org/) -- deep learning framework for building the prediction model -# Download author data +IMPROVE dependencies: ++ [IMPROVE v0.0.3-beta](https://github.com/JDACS4C-IMPROVE/IMPROVE/tree/v0.0.3-beta) ++ [candle_lib](https://github.com/ECP-CANDLE/candle_lib) - IMPROVE dependency (enables various hyperparameter optimization on HPC machines) -``` -cd ../ -mkdir author_data -bash repo/PathDSP/download_author_data.sh author_data/ -``` -Author data will be downloaded under `process_dir/author_data/` -PathDSP will be installed at `process_dir/repo/PathDSP` -IMPROVE will be installed at `process_dir/repo/IMPROVE` -Create environment +## Dataset +Benchmark data for cross-study analysis (CSA) can be downloaded from this [site](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). +The data tree is shown below: ``` -cd repo/PathDSP/ -conda env create -f environment_082223.yml -n PathDSP_env +csa_data/raw_data/ +├── splits +│   ├── CCLE_all.txt +│   ├── CCLE_split_0_test.txt +│   ├── CCLE_split_0_train.txt +│   ├── CCLE_split_0_val.txt +│   ├── CCLE_split_1_test.txt +│   ├── CCLE_split_1_train.txt +│   ├── CCLE_split_1_val.txt +│   ├── ... +│   ├── GDSCv2_split_9_test.txt +│   ├── GDSCv2_split_9_train.txt +│   └── GDSCv2_split_9_val.txt +├── x_data +│   ├── cancer_copy_number.tsv +│   ├── cancer_discretized_copy_number.tsv +│   ├── cancer_DNA_methylation.tsv +│   ├── cancer_gene_expression.tsv +│   ├── cancer_miRNA_expression.tsv +│   ├── cancer_mutation_count.tsv +│   ├── cancer_mutation_long_format.tsv +│   ├── cancer_mutation.parquet +│   ├── cancer_RPPA.tsv +│   ├── drug_ecfp4_nbits512.tsv +│   ├── drug_info.tsv +│   ├── drug_mordred_descriptor.tsv +│   └── drug_SMILES.tsv +└── y_data + └── response.tsv ``` -Activate environment -``` -conda activate PathDSP_env -``` +## Model scripts and parameter file ++ `PathDSP_preprocess_improve.py` - takes benchmark data files and transforms into files for training and inference ++ `PathDSP_train_improve.py` - trains the PathDSP model ++ `PathDSP_infer_improve.py` - runs inference with the trained PathDSP model ++ `PathDSP_default_model.txt` - default parameter file -Install CANDLE package -``` -pip install git+https://github.com/ECP-CANDLE/candle_lib@develop -``` -Define enviroment variabels +# Step-by-step instructions +### 1. Clone the model repository ``` -improve_lib="/path/to/IMPROVE/repo/" -pathdsp_lib="/path/to/pathdsp/repo/" -# notice the extra PathDSP folder after pathdsp_lib -export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ -export IMPROVE_DATA_DIR="/path/to/csa_data/" -export AUTHOR_DATA_DIR="/path/to/author_data/" +git clone https://github.com/JDACS4C-IMPROVE/PathDSP +cd PathDSP +git checkout v0.0.3-beta ``` -Perform preprocessing step +### 2. Set computational environment +Create conda env using `yml` ``` -# go two upper level -cd ../../ -python repo/PathDSP/PathDSP_preprocess_improve.py +conda env create -f environment_082223.yml -n PathDSP_env ``` -Train the model -``` -python repo/PathDSP/PathDSP_train_improve.py +### 3. Run `setup_improve.sh`. 
+```bash +source setup_improve.sh ``` -Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` -Final trained model is located at: `${train_ml_data_dir}/model.pt`. Parameter definitions can be found at `process_dir/repo/PathDSP/PathDSP_default_model.txt` +This will: +1. Download cross-study analysis (CSA) benchmark data into `./csa_data/`. +2. Clone IMPROVE repo (checkout tag `v0.0.3-beta`) outside the PathDSP model repo +3. Set up env variables: `IMPROVE_DATA_DIR` (to `./csa_data/`) and `PYTHONPATH` (adds IMPROVE repo). +4. Download the model-specific supplemental data (aka author data) and set up the env variable `AUTHOR_DATA_DIR`. -Perform inference on the testing data +### 4. Preprocess CSA benchmark data (_raw data_) to construct model input data (_ML data_) +```bash +python PathDSP_preprocess_improve.py ``` -python repo/PathDSP/PathDSP_infer_improve.py -``` - -Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` -Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` - -# Example usage with singularity container -# Download benchmark data +Preprocesses the CSA data and creates train, validation (val), and test datasets. -Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ +Generates: +* three model input data files: `train_data.pt`, `val_data.pt`, `test_data.pt` +* three tabular data files, each containing the drug response values (i.e. AUC) and corresponding metadata: `train_y_data.csv`, `val_y_data.csv`, `test_y_data.csv` ``` -mkdir process_dir -cd process_dir -wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +ml_data +└── GDSCv1-CCLE + └── split_0 + ├── tmpdir_ssgsea + ├── EXP.txt + ├── cnv_data.txt + ├── CNVnet.txt + ├── DGnet.txt + ├── MUTnet.txt + ├── drug_mbit_df.txt + ├── drug_target.txt + ├── mutation_data.txt + ├── test_data.txt + ├── train_data.txt + ├── val_data.txt + └── x_data_gene_expression_scaler.gz ``` -# Download author data -Download model specific data under csa_data/ directory - -``` -git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git -bash PathDSP/download_author_data.sh csa_data/ +### 5. Train PathDSP model +```bash +python PathDSP_train_improve.py ``` -Setup Singularity +Trains PathDSP using the model input data: `train_data.pt` (training), `val_data.pt` (for early stopping). +Generates: +* trained model: `model.pt` +* predictions on val data (tabular data): `val_y_data_predicted.csv` +* prediction performance scores on val data: `val_scores.json` ``` -git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git -cd Singularity -./setup -source config/improve.env +out_models +└── gCSI + └── split_0 + ├── model.pt + ├── checkpoint.pt + ├── Val_Loss_orig.txt + ├── val_scores.json + └── val_y_data_predicted.csv ``` -Build Singularity from definition file -``` -singularity build --fakeroot PathDSP.sif definitions/PathDSP.def -``` +### 6. Run inference on test data with the trained model +```python PathDSP_infer_improve.py``` -Perform preprocessing using csa benchmarking data +Evaluates the performance on a test dataset with the trained model. 
+Generates: +* predictions on test data (tabular data): `test_y_data_predicted.csv` +* prediction performance scores on test data: `test_scores.json` ``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/ -``` - -Train the model - -``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ -``` - -Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` -Final trained model is located at: `${train_ml_data_dir}/model.pt`. - -Perform inference on the testing data - -``` -singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ -``` - -Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` -Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` - - -# Docs from original authors (below) - -# Requirments - -# Input format - -|drug|cell|feature_1|....|feature_n|drug_response| -|----|----|--------|----|--------|----| -|5-FU|03|0|....|0.02|-2.3| -|5-FU|23|1|....|0.04|-3.4| - -Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing -# Usage: -```python -# run FNN -python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix - -Where input.txt should be in the input format shown above. -Example input file can be found at https://zenodo.org/record/7532963 -``` -# Data preprocessing -Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby - - -# Reference -Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. 
https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file +out_infer +└── gCSI-gCSI + └── split_0 + ├── test_scores.json + └── test_y_data_predicted.csv +``` \ No newline at end of file diff --git a/README_old2.md b/README_old2.md new file mode 100644 index 0000000..aca66fc --- /dev/null +++ b/README_old2.md @@ -0,0 +1,181 @@ +# PathDSP +Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores + +# Download benchmark data + +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ + +``` +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +Benchmark data will be downloaded under `process_dir/csa_data/` + +# Example usage with Conda + +Download PathDSP and IMPROVE + +``` +mkdir repo +cd repo +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git +``` + +# Download author data + +``` +cd ../ +mkdir author_data +bash repo/PathDSP/download_author_data.sh author_data/ +``` + +Author data will be downloaded under `process_dir/author_data/` +PathDSP will be installed at `process_dir/repo/PathDSP` +IMPROVE will be installed at `process_dir/repo/IMPROVE` + +Create environment + +``` +cd repo/PathDSP/ +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Install CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Define enviroment variabels + +``` +improve_lib="/path/to/IMPROVE/repo/" +pathdsp_lib="/path/to/pathdsp/repo/" +# notice the extra PathDSP folder after pathdsp_lib +export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ +export IMPROVE_DATA_DIR="/path/to/csa_data/" +export AUTHOR_DATA_DIR="/path/to/author_data/" +``` + +Perform preprocessing step + +``` +# go two upper level +cd ../../ +python repo/PathDSP/PathDSP_preprocess_improve.py +``` + +Train the model + +``` +python repo/PathDSP/PathDSP_train_improve.py +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. 
Parameter definitions can be found at `process_dir/repo/PathDSP/PathDSP_default_model.txt` + +Perform inference on the testing data + +``` +python repo/PathDSP/PathDSP_infer_improve.py +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + +# Example usage with singularity container + +# Download benchmark data + +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ + +``` +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +# Download author data + +Download model specific data under csa_data/ directory + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +bash PathDSP/download_author_data.sh csa_data/ +``` + +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing using csa benchmarking data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/ +``` + +Train the model + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. + +Perform inference on the testing data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + + +# Docs from original authors (below) + +# Requirments + +# Input format + +|drug|cell|feature_1|....|feature_n|drug_response| +|----|----|--------|----|--------|----| +|5-FU|03|0|....|0.02|-2.3| +|5-FU|23|1|....|0.04|-3.4| + +Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing +# Usage: +```python +# run FNN +python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix + +Where input.txt should be in the input format shown above. 
+Example input file can be found at https://zenodo.org/record/7532963 +``` +# Data preprocessing +Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby + + +# Reference +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file From 3f1b445673285b6770e1cbf2e97d2150ed0da9c5 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:16:07 -0400 Subject: [PATCH 141/254] move setup_improve --- PathDSP/setup_improve.sh => setup_improve.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename PathDSP/setup_improve.sh => setup_improve.sh (100%) diff --git a/PathDSP/setup_improve.sh b/setup_improve.sh similarity index 100% rename from PathDSP/setup_improve.sh rename to setup_improve.sh From 324b9bc8a8a117192d2970a2941f8889d83554e7 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:21:22 -0400 Subject: [PATCH 142/254] download_csa --- README.md | 1 + download_csa.sh | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 download_csa.sh diff --git a/README.md b/README.md index 5f7bc5c..7cabfa0 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ git checkout v0.0.3-beta Create conda env using `yml` ``` conda env create -f environment_082223.yml -n PathDSP_env +conda activate PathDSP_env ``` diff --git a/download_csa.sh b/download_csa.sh new file mode 100644 index 0000000..7bfc04c --- /dev/null +++ b/download_csa.sh @@ -0,0 +1,2 @@ +# wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +wget --cut-dirs=8 -P ./ -nH -np -m https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/ \ No newline at end of file From 7d71aaf640c15f365cfa3034bcedeea613f46295 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:29:48 -0400 Subject: [PATCH 143/254] last readme updates --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7cabfa0..6ca0cc5 100644 --- a/README.md +++ b/README.md @@ -104,12 +104,11 @@ python PathDSP_preprocess_improve.py Preprocesses the CSA data and creates train, validation (val), and test datasets. Generates: -* three model input data files: `train_data.pt`, `val_data.pt`, `test_data.pt` -* three tabular data files, each containing the drug response values (i.e. AUC) and corresponding metadata: `train_y_data.csv`, `val_y_data.csv`, `test_y_data.csv` +* three model input data files: `train_data.txt`, `val_data.txt`, `test_data.txt` ``` ml_data -└── GDSCv1-CCLE +└── gCSI └── split_0 ├── tmpdir_ssgsea ├── EXP.txt @@ -132,7 +131,7 @@ ml_data python PathDSP_train_improve.py ``` -Trains PathDSP using the model input data: `train_data.pt` (training), `val_data.pt` (for early stopping). 
+Trains PathDSP using the model input data: `train_data.txt` (training), `val_data.txt` (for early stopping). Generates: * trained model: `model.pt` From 0980b9582bc1f2bb1355dd02f296566e4c8f1d4d Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:12:44 -0400 Subject: [PATCH 144/254] imports for model specific stuff --- PathDSP_train_improve.py | 6 +++--- {PathDSP => model_utils}/FNN.py | 0 {PathDSP => model_utils}/FNN_new.py | 0 {PathDSP => model_utils}/infer.sh | 0 {PathDSP => model_utils}/leaveOneGroupOut_FNN.py | 0 {PathDSP => model_utils}/load_pretrained.py | 0 {PathDSP => model_utils}/myDataloader.py | 0 {PathDSP => model_utils}/myDatasplit.py | 0 {PathDSP => model_utils}/myFit.py | 0 {PathDSP => model_utils}/myMetrics.py | 0 {PathDSP => model_utils}/myModel.py | 0 {PathDSP => model_utils}/myUtility.py | 0 {PathDSP => model_utils}/nestedCV.py | 0 13 files changed, 3 insertions(+), 3 deletions(-) rename {PathDSP => model_utils}/FNN.py (100%) rename {PathDSP => model_utils}/FNN_new.py (100%) rename {PathDSP => model_utils}/infer.sh (100%) rename {PathDSP => model_utils}/leaveOneGroupOut_FNN.py (100%) rename {PathDSP => model_utils}/load_pretrained.py (100%) rename {PathDSP => model_utils}/myDataloader.py (100%) rename {PathDSP => model_utils}/myDatasplit.py (100%) rename {PathDSP => model_utils}/myFit.py (100%) rename {PathDSP => model_utils}/myMetrics.py (100%) rename {PathDSP => model_utils}/myModel.py (100%) rename {PathDSP => model_utils}/myUtility.py (100%) rename {PathDSP => model_utils}/nestedCV.py (100%) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index b1233ad..9f27a44 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -6,9 +6,9 @@ import socket import torch as tch import torch.utils.data as tchud -import myModel as mynet -import myDataloader as mydl -import myUtility as myutil +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil import polars as pl from improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK diff --git a/PathDSP/FNN.py b/model_utils/FNN.py similarity index 100% rename from PathDSP/FNN.py rename to model_utils/FNN.py diff --git a/PathDSP/FNN_new.py b/model_utils/FNN_new.py similarity index 100% rename from PathDSP/FNN_new.py rename to model_utils/FNN_new.py diff --git a/PathDSP/infer.sh b/model_utils/infer.sh similarity index 100% rename from PathDSP/infer.sh rename to model_utils/infer.sh diff --git a/PathDSP/leaveOneGroupOut_FNN.py b/model_utils/leaveOneGroupOut_FNN.py similarity index 100% rename from PathDSP/leaveOneGroupOut_FNN.py rename to model_utils/leaveOneGroupOut_FNN.py diff --git a/PathDSP/load_pretrained.py b/model_utils/load_pretrained.py similarity index 100% rename from PathDSP/load_pretrained.py rename to model_utils/load_pretrained.py diff --git a/PathDSP/myDataloader.py b/model_utils/myDataloader.py similarity index 100% rename from PathDSP/myDataloader.py rename to model_utils/myDataloader.py diff --git a/PathDSP/myDatasplit.py b/model_utils/myDatasplit.py similarity index 100% rename from PathDSP/myDatasplit.py rename to model_utils/myDatasplit.py diff --git a/PathDSP/myFit.py b/model_utils/myFit.py similarity index 100% rename from PathDSP/myFit.py rename to model_utils/myFit.py diff --git a/PathDSP/myMetrics.py b/model_utils/myMetrics.py similarity index 100% rename from PathDSP/myMetrics.py rename to model_utils/myMetrics.py diff --git 
a/PathDSP/myModel.py b/model_utils/myModel.py similarity index 100% rename from PathDSP/myModel.py rename to model_utils/myModel.py diff --git a/PathDSP/myUtility.py b/model_utils/myUtility.py similarity index 100% rename from PathDSP/myUtility.py rename to model_utils/myUtility.py diff --git a/PathDSP/nestedCV.py b/model_utils/nestedCV.py similarity index 100% rename from PathDSP/nestedCV.py rename to model_utils/nestedCV.py From af879066ed7c6aef1bfa804e47e93ea65d531664 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:17:23 -0400 Subject: [PATCH 145/254] another import --- model_utils/myDataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model_utils/myDataloader.py b/model_utils/myDataloader.py index e9a4dbf..77eb482 100644 --- a/model_utils/myDataloader.py +++ b/model_utils/myDataloader.py @@ -9,7 +9,7 @@ import torch.utils.data as tchud import sklearn.model_selection as skms import sklearn.preprocessing as skpre -import myDatasplit as mysplit +import model_utils.myDatasplit as mysplit class NumpyDataset(tchud.Dataset): """ From fbac751d101d9f13629e91b2a3552f738fe31fd7 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:43:39 -0400 Subject: [PATCH 146/254] updating params --- PathDSP_parameter_definitions.py | 8 +--- PathDSP_params.txt | 73 ++++++++++++++++---------------- 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/PathDSP_parameter_definitions.py b/PathDSP_parameter_definitions.py index ad61721..c7e3db6 100644 --- a/PathDSP_parameter_definitions.py +++ b/PathDSP_parameter_definitions.py @@ -50,12 +50,8 @@ {"name": "cuda_name", # TODO. frm. How should we control this? "action": "store", "type": str, - "help": "Cuda device (e.g.: cuda:0, cuda:1."}, - {"name": "learning_rate", - "type": float, - "default": 0.0001, - "help": "Learning rate for the optimizer." - }, + "help": "Cuda device (e.g.: cuda:0, cuda:1." 
+ }, {"name": "dropout", "type": float, "default": 0.1, diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 8d8b72b..05d660c 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,51 +1,50 @@ -[Global_Params] -model_name='PathDSP' - [Preprocess] train_split_file = gCSI_split_0_train.txt val_split_file = gCSI_split_0_val.txt test_split_file = gCSI_split_0_test.txt -ml_data_outdir = "./ml_data/gCSI-gCSI/split_0" x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] y_data_files = [["response.tsv"]] - -data_url='https://zenodo.org/record/6093818/files/' -improve_data_url='https://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/raw_data/' -original_data_url='https://zenodo.org/record/7532963/files/' -original_data='input.zip' -gene_set = 'MSigdb.zip' -ppi_data = 'STRING.zip' -drug_target = 'raw_data.zip' -raw_data_dir = "raw_data" -train_data = 'PathDSP_train.txt' -test_data = 'PathDSP_test.txt' -val_data = 'PathDSP_val.txt' +y_col_name = auc +bit_int = 128 +permutation_int = 3 +seed_int = 42 +cpu_int = 20 drug_bits_file='drug_mbit_df.txt' dgnet_file='DGnet.txt' mutnet_file='MUTnet.txt' cnvnet_file='CNVnet.txt' exp_file='EXP.txt' -#output='Result/' -bit_int=128 -permutation_int=3 -y_col_name = 'auc' -metric='auc' -data_type='CTRPv2' -split=0 - - -#Model parameter -seed_int=42 -cpu_int=20 -#cv_int=1 -gpu_int=0 + +[Train] +epochs = 800 learning_rate = 0.001 batch_size = 12 -eps=0.00001 -drug_hiddens='100,50,6' -final_hiddens=6 -epochs=800 -optimizer = 'adam' -loss = 'mse' -improve_analysis='no' +loss = mse +y_col_name = auc + +[Infer] +y_col_name = auc + + + +#gpu_int=0 +#gene_set = 'MSigdb.zip' +#ppi_data = 'STRING.zip' +#drug_target = 'raw_data.zip' +#raw_data_dir = "raw_data" +#train_data = 'PathDSP_train.txt' +#test_data = 'PathDSP_test.txt' +#val_data = 'PathDSP_val.txt' +#y_col_name = 'auc' +#metric='auc' +#data_type='CTRPv2' +#split=0 +#eps=0.00001 +#drug_hiddens='100,50,6' +#final_hiddens=6 +#optimizer = 'adam' +#improve_analysis='no' + + + From 22e7feef76cec1a8fcd2e84f8b8d2e38b131a810 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:14:15 -0400 Subject: [PATCH 147/254] take out preprocess in train --- PathDSP_params.txt | 10 +++++----- PathDSP_train_improve.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 05d660c..8630e84 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -10,11 +10,11 @@ bit_int = 128 permutation_int = 3 seed_int = 42 cpu_int = 20 -drug_bits_file='drug_mbit_df.txt' -dgnet_file='DGnet.txt' -mutnet_file='MUTnet.txt' -cnvnet_file='CNVnet.txt' -exp_file='EXP.txt' +drug_bits_file = drug_mbit_df.txt +dgnet_file = DGnet.txt +mutnet_file = MUTnet.txt +cnvnet_file = CNVnet.txt +exp_file = EXP.txt [Train] epochs = 800 diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 9f27a44..9ee7354 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -205,7 +205,7 @@ def run(params): modelpath = frm.build_model_path(params, model_dir=params["output_dir"]) train_data_fname = frm.build_ml_data_name(params, stage="train") val_data_fname = frm.build_ml_data_name(params, stage="val") - params = preprocess(params) + #params = preprocess(params) # set parameters #myutil.set_seed(params["seed_int"]) 
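For reference, a minimal sketch of how the split-out `PathDSP_parameter_definitions.py` lists and the sectioned `PathDSP_params.txt` are consumed together — it mirrors the `initialize_parameters` calls adopted later in this series and assumes the improvelib `v0.0.3-beta` `DRPTrainConfig` API; the printed keys are illustrative only:

```python
# Minimal sketch, assuming the improvelib v0.0.3-beta API used throughout this patch series.
# It shows how the [Train] section of PathDSP_params.txt and the model-specific
# pathdsp_train_params definitions are merged into a single params dict.
import os

from improvelib.applications.drug_response_prediction.config import DRPTrainConfig
from PathDSP_parameter_definitions import pathdsp_train_params

file_path = os.path.dirname(os.path.realpath(__file__))

cfg = DRPTrainConfig()
params = cfg.initialize_parameters(
    file_path,
    default_config="PathDSP_params.txt",          # [Preprocess]/[Train]/[Infer] sections
    additional_definitions=pathdsp_train_params,  # model-specific args (cuda_name, dropout)
)

# Values set in the config file (e.g. learning_rate, batch_size) and keys declared in the
# definitions list (e.g. dropout, cuda_name) all land in the same params dict that the
# train and infer scripts read.
print(params["learning_rate"], params["dropout"], params["cuda_name"])
```
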
From 74fdb9e7f7031e41e14dfa6ecd28d20d12384588 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:29:42 -0400 Subject: [PATCH 148/254] imports and take out preprocess --- PathDSP_infer_improve.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 5dbce68..a9221dc 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -6,9 +6,9 @@ import torch as tch import torch.utils.data as tchud import polars as pl -import myModel as mynet -import myDataloader as mydl -import myUtility as myutil +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil from PathDSP_preprocess_improve import mkdir, preprocess from PathDSP_train_improve import ( @@ -33,7 +33,7 @@ def run(params): else: model_dir = params["input_dir"] frm.create_outdir(outdir=params["output_dir"]) - params = preprocess(params) + #params = preprocess(params) test_data_fname = frm.build_ml_data_name(params, stage="test") test_df = pl.read_csv(data_dir + "/" + test_data_fname, separator = "\t").to_pandas() Xtest_arr = test_df.iloc[:, 0:-1].values @@ -58,7 +58,7 @@ def run(params): params, y_true=test_true, y_pred=test_pred, stage="test", outdir=params["output_dir"] ) - test_scores = frm.compute_performace_scores( + test_scores = frm.compute_performance_scores( params, y_true=test_true, y_pred=test_pred, stage="test", outdir=params["output_dir"], metrics=metrics_list ) From da48c2cb9de5b49a38a6425058b15269ee4ae88e Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:11:00 -0400 Subject: [PATCH 149/254] Update README.md --- README.md | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ca0cc5..178ad46 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,40 @@ # PathDSP + +This is development for v0.1.0-alpha. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This repository demonstrates how to use the [IMPROVE library v0.0.3-beta](https://github.com/JDACS4C-IMPROVE/IMPROVE/tree/v0.0.3-beta) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). 
@@ -163,4 +199,4 @@ out_infer └── split_0 ├── test_scores.json └── test_y_data_predicted.csv -``` \ No newline at end of file +``` From b3c75275d6a79e0e670b1f406e1d07865e29ec5c Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:47:46 -0400 Subject: [PATCH 150/254] updating with latest improvelib changes --- PathDSP_default_model.txt | 48 ----------------------------------- PathDSP_infer_improve.py | 44 +++++++++++++++++--------------- PathDSP_params.txt | 21 +++++++++++---- PathDSP_preprocess_improve.py | 10 +++++--- PathDSP_train_improve.py | 30 ++++++++++++---------- 5 files changed, 62 insertions(+), 91 deletions(-) delete mode 100644 PathDSP_default_model.txt diff --git a/PathDSP_default_model.txt b/PathDSP_default_model.txt deleted file mode 100644 index c76f6c7..0000000 --- a/PathDSP_default_model.txt +++ /dev/null @@ -1,48 +0,0 @@ -[Global_Params] -model_name='PathDSP' - -[Preprocess] -train_split_file = GDSCv1_split_4_train.txt -val_split_file = GDSCv1_split_4_val.txt -test_split_file = GDSCv1_split_4_test.txt -# ml_data_outdir = ./ml_data/GDSCv1-GDSCv1/split_4 -x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] -x_data_drug_files = [["drug_SMILES.tsv"]] -y_data_files = [["response.tsv"]] -data_format = .txt -drug_bits_file = drug_mbit_df.txt -dgnet_file = DGnet.txt -mutnet_file = MUTnet.txt -cnvnet_file = CNVnet.txt -exp_file = EXP.txt -bit_int = 128 -permutation_int = 3 -seed_int = 42 -cpu_int = 20 -input_supp_data_dir = ../author_data - -[Train] -train_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 -val_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 -model_outdir = ./out_models/GDSCv1/split_4 -model_file_name = model -model_file_format = .pt -data_format = .txt -epochs = 3 -batch_size = 12 -val_batch = 12 -loss = mse -early_stop_metric = mse -patience = 30 -cuda_name = cuda:0 -learning_rate = 0.0004 -dropout=0.1 - -[Infer] -test_ml_data_dir = ./ml_data/GDSCv1-GDSCv1/split_4 -model_dir = ./out_models/GDSCv1/split_4 -infer_outdir = ./out_infer/GDSCv1-GDSCv1/split_4 -test_batch = 256 -model_file_name = model -model_file_format = .pt -data_format = .txt \ No newline at end of file diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index a9221dc..560e799 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -13,9 +13,7 @@ from PathDSP_preprocess_improve import mkdir, preprocess from PathDSP_train_improve import ( predicting, - preprocess, cal_time, - metrics_list, ) from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK import improvelib.utils as frm #NCK @@ -23,25 +21,17 @@ file_path = os.path.dirname(os.path.realpath(__file__)) -def run(params): - if "input_data_dir" in params: - data_dir = params["input_data_dir"] - else: - data_dir = params["input_dir"] - if "input_model_dir" in params: - model_dir = params["input_model_dir"] - else: - model_dir = params["input_dir"] +def run(params): frm.create_outdir(outdir=params["output_dir"]) #params = preprocess(params) - test_data_fname = frm.build_ml_data_name(params, stage="test") - test_df = pl.read_csv(data_dir + "/" + test_data_fname, separator = "\t").to_pandas() + test_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="test") + test_df = pl.read_csv(params["input_data_dir"] + "/" + test_data_fname, separator = "\t").to_pandas() Xtest_arr = test_df.iloc[:, 
0:-1].values ytest_arr = test_df.iloc[:, -1].values Xtest_arr = np.array(Xtest_arr).astype('float32') ytest_arr = np.array(ytest_arr).astype('float32') trained_net = mynet.FNN(Xtest_arr.shape[1]) - modelpath = frm.build_model_path(params, model_dir=model_dir) + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["input_model_dir"]) trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() myutil.set_seed(params["seed_int"]) @@ -54,20 +44,32 @@ def run(params): test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) start = datetime.now() test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + frm.store_predictions_df( - params, y_true=test_true, y_pred=test_pred, stage="test", + y_true=test_true, + y_pred=test_pred, + stage="test", + y_col_name=params["y_col_name"], outdir=params["output_dir"] ) - test_scores = frm.compute_performance_scores( - params, y_true=test_true, y_pred=test_pred, stage="test", - outdir=params["output_dir"], metrics=metrics_list - ) + if params["calc_infer_scores"]: + test_scores = frm.compute_performance_scores( + y_true=test_true, + y_pred=test_pred, + stage="test", + metric_type=params["metric_type"], + outdir=params["output_dir"] + ) + print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) return test_scores def main(args): - cfg = DRPInferConfig() #NCK - params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=None, required=None) #NCK + cfg = DRPInferConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=None) test_scores = run(params) print("\nFinished inference of PathDSP model.") diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 8630e84..397bc6c 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,10 +1,12 @@ [Preprocess] +data_format = .txt +input_supp_data_dir = ../author_data train_split_file = gCSI_split_0_train.txt val_split_file = gCSI_split_0_val.txt test_split_file = gCSI_split_0_test.txt +y_data_files = [["response.tsv"]] x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] -y_data_files = [["response.tsv"]] y_col_name = auc bit_int = 128 permutation_int = 3 @@ -17,16 +19,25 @@ cnvnet_file = CNVnet.txt exp_file = EXP.txt [Train] -epochs = 800 -learning_rate = 0.001 +data_format = .txt +model_file_name = model +model_file_format = .pt +epochs = 3 +learning_rate = 0.0004 batch_size = 12 +val_batch = 12 loss = mse +patience = 30 y_col_name = auc +cuda_name = cuda:0 +dropout = 0.1 [Infer] y_col_name = auc - - +infer_batch = 256 +model_file_name = model +model_file_format = .pt +data_format = .txt #gpu_int=0 #gene_set = 'MSigdb.zip' diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 9bf898e..0b024da 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -329,7 +329,7 @@ def prep_input(params): comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() pl.from_pandas(comb_data_mtx).write_csv( - params["output_dir"] + "/" + frm.build_ml_data_name(params, i) + params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], 
stage=i) , separator="\t", has_header=True ) @@ -392,7 +392,6 @@ def run_ssgsea(params): df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") def run(params): - params = frm.build_paths(params) frm.create_outdir(outdir=params["output_dir"]) params = preprocess(params) print("convert drug to bits.") @@ -410,8 +409,11 @@ def run(params): def main(args): - cfg = DRPPreprocessConfig() #NCK - params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=pathdsp_preprocess_params, required=req_preprocess_args) + cfg = DRPPreprocessConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_preprocess_params) run(params) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 9ee7354..36375a5 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -19,9 +19,6 @@ file_path = os.path.dirname(os.path.realpath(__file__)) -# [Req] List of metrics names to be compute performance scores -metrics_list = ["mse", "rmse", "pcc", "scc", "r2"] - class RMSELoss(tch.nn.Module): def __init__(self): @@ -202,9 +199,9 @@ def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): def run(params): frm.create_outdir(outdir=params["output_dir"]) - modelpath = frm.build_model_path(params, model_dir=params["output_dir"]) - train_data_fname = frm.build_ml_data_name(params, stage="train") - val_data_fname = frm.build_ml_data_name(params, stage="val") + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["output_dir"]) + train_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="train") + val_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="val") #params = preprocess(params) # set parameters @@ -285,7 +282,10 @@ def init_weights(m): # ----------------------------- # import ipdb; ipdb.set_trace() frm.store_predictions_df( - params, y_true=val_true, y_pred=val_pred, stage="val", + y_true=val_true, + y_pred=val_pred, + stage="val", + y_col_name=params["y_col_name"], outdir=params["output_dir"] ) @@ -294,20 +294,24 @@ def init_weights(m): # ----------------------------- # import ipdb; ipdb.set_trace() val_scores = frm.compute_performance_scores( - params, y_true=val_true, y_pred=val_pred, stage="val", - outdir=params["output_dir"], metrics=metrics_list + y_true=val_true, + y_pred=val_pred, + stage="val", + metric_type=params["metric_type"], + outdir=params["output_dir"] ) return val_scores def main(args): - cfg = DRPTrainConfig() #NCK - params = cfg.initialize_parameters(file_path, default_config="PathDSP_default_model.txt", additional_definitions=pathdsp_train_params, required=None) #NCK + cfg = DRPTrainConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_train_params) # get node name params["node_name"] = socket.gethostname() val_scores = run(params) - # with open(params["model_outdir"] + '/params.json', 'w') as json_file: - # json.dump(params, json_file, indent=4) df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) df.to_csv(params["output_dir"] + '/params.txt',sep="\t") From f9a84bf36b4f9fdcfa5b0f6ce0cc44f4f852d59f Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 6 Sep 2024 08:53:46 -0400 Subject: [PATCH 151/254] typo in output_dir --- 
PathDSP_infer_improve.py | 4 ++-- PathDSP_train_improve.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 560e799..bcc0555 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -50,7 +50,7 @@ def run(params): y_pred=test_pred, stage="test", y_col_name=params["y_col_name"], - outdir=params["output_dir"] + output_dir=params["output_dir"] ) if params["calc_infer_scores"]: test_scores = frm.compute_performance_scores( @@ -58,7 +58,7 @@ def run(params): y_pred=test_pred, stage="test", metric_type=params["metric_type"], - outdir=params["output_dir"] + output_dir=params["output_dir"] ) print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 36375a5..f299b1e 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -286,7 +286,7 @@ def init_weights(m): y_pred=val_pred, stage="val", y_col_name=params["y_col_name"], - outdir=params["output_dir"] + output_dir=params["output_dir"] ) # ----------------------------- @@ -298,7 +298,7 @@ def init_weights(m): y_pred=val_pred, stage="val", metric_type=params["metric_type"], - outdir=params["output_dir"] + output_dir=params["output_dir"] ) return val_scores From e10b4601f7cf0b2f18ed92ff0a8b29524fdc30ec Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:00:36 -0400 Subject: [PATCH 152/254] params in infer --- PathDSP_infer_improve.py | 4 ++-- PathDSP_parameter_definitions.py | 8 ++++++++ PathDSP_params.txt | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index bcc0555..3bfbae5 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -34,14 +34,14 @@ def run(params): modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["input_model_dir"]) trained_net.load_state_dict(tch.load(modelpath)) trained_net.eval() - myutil.set_seed(params["seed_int"]) + #myutil.set_seed(params["seed_int"]) cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") if cuda_env_visible is not None: device = 'cuda:0' else: device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) - test_dl = tchud.DataLoader(test_dataset, batch_size=params['test_batch'], shuffle=False) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['infer_batch'], shuffle=False) start = datetime.now() test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) diff --git a/PathDSP_parameter_definitions.py b/PathDSP_parameter_definitions.py index c7e3db6..a9680db 100644 --- a/PathDSP_parameter_definitions.py +++ b/PathDSP_parameter_definitions.py @@ -57,4 +57,12 @@ "default": 0.1, "help": "Dropout rate for the optimizer." }, +] + +pathdsp_infer_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1." 
+ }, ] \ No newline at end of file diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 397bc6c..eb3c902 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -38,6 +38,7 @@ infer_batch = 256 model_file_name = model model_file_format = .pt data_format = .txt +cuda_name = cuda:0 #gpu_int=0 #gene_set = 'MSigdb.zip' From c441551b3f67bc0459871f8e4049dd0570b8be54 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 6 Sep 2024 10:18:55 -0400 Subject: [PATCH 153/254] added csa workflow files --- csa_params.ini | 18 +++ csa_params_def.py | 51 ++++++++ hyperparameters_default.json | 67 ++++++++++ hyperparameters_hpo.json | 101 ++++++++++++++ workflow_csa.py | 246 +++++++++++++++++++++++++++++++++++ 5 files changed, 483 insertions(+) create mode 100644 csa_params.ini create mode 100644 csa_params_def.py create mode 100644 hyperparameters_default.json create mode 100644 hyperparameters_hpo.json create mode 100644 workflow_csa.py diff --git a/csa_params.ini b/csa_params.ini new file mode 100644 index 0000000..c04df57 --- /dev/null +++ b/csa_params.ini @@ -0,0 +1,18 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +output_dir=./improve_output +y_col_name = auc +use_singularity = False +hyperparameters_file = ./hyperparameters_default.json +source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +split = ["0", "1", "2"] +model_name = "PathDSP" +only_cross_study = False +epochs = 100 + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/csa_params_def.py b/csa_params_def.py new file mode 100644 index 0000000..b5c87ab --- /dev/null +++ b/csa_params_def.py @@ -0,0 +1,51 @@ + +additional_definitions = [ + {"name": "source_datasets", + "nargs" : "+", + "type": str, + "default": ['CCLE'], + "help": "source_datasets for cross study analysis" + }, + {"name": "target_datasets", + "nargs" : "+", + "type": str, + "default": ["CCLE", "gCSI"], + "help": "target_datasets for cross study analysis" + }, + {"name": "split", + "nargs" : "+", + "type": str, + "default": ['0'], + "help": "Split of the source datasets for CSA" + }, + {"name": "only_cross_study", + "type": bool, + "default": False, + "help": "If only cross study analysis is needed" + }, + {"name": "model_name", + "type": str, + "default": 'graphdrp', ## Change the default to LGBM?? + "help": "Name of the deep learning model" + }, + {"name": "hyperparameters_file", + "type": str, + "default": 'hyperparameters_default.json', + "help": "json file containing optimized hyperparameters per dataset" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + {"name": "use_singularity", + "type": bool, + "default": True, + "help": "Do you want to use singularity image for running the model?" 
+ }, + {"name": "singularity_image", + "type": str, + "default": '', + "help": "Singularity image file of the model" + } + ] \ No newline at end of file diff --git a/hyperparameters_default.json b/hyperparameters_default.json new file mode 100644 index 0000000..44778fb --- /dev/null +++ b/hyperparameters_default.json @@ -0,0 +1,67 @@ +{ + "graphdrp": { + "CCLE": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "CTRPv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "gCSI": { + "batch_size": 256, + "learning_rate": 0.0001 + } + }, + "HiDRA": { + "CCLE": {}, + "CTRPv2": {}, + "GDSCv1": {}, + "GDSCv2": {}, + "gCSI": {} + }, + "IGTD": { + "CCLE": {}, + "CTRPv2": {}, + "GDSCv1": {}, + "GDSCv2": {}, + "gCSI": {} + }, + "Paccmann_MCA": { + "CCLE": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "CTRPv2": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "GDSCv2": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.0001 + } + }, + "PathDSP": { + "CCLE": {}, + "CTRPv2": {}, + "GDSCv1": {}, + "GDSCv2": {}, + "gCSI": {} + } +} \ No newline at end of file diff --git a/hyperparameters_hpo.json b/hyperparameters_hpo.json new file mode 100644 index 0000000..f798500 --- /dev/null +++ b/hyperparameters_hpo.json @@ -0,0 +1,101 @@ +{ + "graphdrp": { + "CCLE": { + "batch_size": 64, + "learning_rate": 0.0012233565752066463 + }, + "CTRPv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 1024, + "learning_rate": 0.06408519376992045 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.0005245278010578158 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.00015145175483629418 + } + }, + + "HiDRA": { + "CCLE": { + "batch_size": 16, + "learning_rate": 0.0011825165353494742 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 64, + "learning_rate": 0.0033642852862458077 + }, + "GDSCv2": {}, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.00685103425830459 + } + }, + "IGTD": { + "CCLE": { + "batch_size": 16, + "learning_rate": 0.00044116909746320135 + }, + "CTRPv2": { + "batch_size": 512, + "learning_rate": 0.0005219211138261751 + }, + "GDSCv1": { + "batch_size": 32, + "learning_rate": 0.0006637139133709175 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.0005058417379141607 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.0008434053110179821 + } + }, + "Paccmann_MCA": { + "CCLE": { + "batch_size": 512, + "learning_rate": 0.00019306380321264862 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 128, + "learning_rate": 2.18431820219191e-05 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.00010468988102446662 + }, + "gCSI": { + "batch_size": 256, + "learning_rate": 7.092892399034623e-05 + } + }, + "PathDSP": { + "CCLE": { + "batch_size": 64, + "learning_rate": 9.043143432931976e-05 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 128, + "learning_rate": 7.565669627288258e-05 + }, + "GDSCv2": { + "batch_size": 16, + "learning_rate": 6.916681700876223e-05 + }, + "gCSI": { + "batch_size": 128, + "learning_rate": 0.00024319069525915603 + } + } +} \ No newline at end of file diff --git a/workflow_csa.py b/workflow_csa.py new file mode 100644 index 0000000..197d29d --- /dev/null +++ b/workflow_csa.py @@ -0,0 +1,246 @@ +import parsl +from parsl import python_app 
+import subprocess +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import LocalProvider +from time import time +from typing import Sequence, Tuple, Union +from pathlib import Path +import logging +import sys +import json + +import csa_params_def as CSA +import improvelib.utils as frm +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + + +##### CONFIG FOR LAMBDA ###### +#available_accelerators: Union[int, Sequence[str]] = 8 +worker_port_range: Tuple[int, int] = (10000, 20000) +retries: int = 1 + +config_lambda = Config( + retries=retries, + executors=[ + HighThroughputExecutor( + address='127.0.0.1', + label="htex", + cpu_affinity="block", + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? + worker_debug=True, + available_accelerators=8, ## CHANGE THIS AS REQUIRED BY THE MACHINE + worker_port_range=worker_port_range, + provider=LocalProvider( + init_blocks=1, + max_blocks=1, + ), + ) + ], + strategy='simple', +) + +parsl.clear() +parsl.load(config_lambda) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +logger = logging.getLogger(f'Start workflow') + +############################################################################## +################################ PARSL APPS ################################## +############################################################################## + +@python_app +def preprocess(inputs=[]): # + import warnings + import subprocess + import improvelib.utils as frm + def build_split_fname(source_data_name, split, phase): + """ Build split file name. If file does not exist continue """ + if split=='all': + return f"{source_data_name}_{split}.txt" + return f"{source_data_name}_split_{split}_{phase}.txt" + params=inputs[0] + source_data_name=inputs[1] + split=inputs[2] + + split_nums=params['split'] + # Get the split file paths + if len(split_nums) == 0: + # Get all splits + split_files = list((params['splits_path']).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + else: + split_files = [] + for s in split_nums: + split_files.extend(list((params['splits_path']).glob(f"{source_data_name}_split_{s}_*.txt"))) + files_joined = [str(s) for s in split_files] + + print(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. Otherwise, continue to the next split. 
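        # For orientation, the naming convention this availability check relies on
        # (example values; build_split_fname is defined at the top of this app):
        #   build_split_fname("gCSI", "0", "train")  -> "gCSI_split_0_train.txt"
        #   build_split_fname("gCSI", "all", "test") -> "gCSI_all.txt"
        # A split is usable only when its train, val, and test files all exist under splits_path.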
+ for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + for target_data_name in params['target_datasets']: + ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/ \ + f"split_{split}" + if ml_data_dir.exists() is True: + continue + if params['only_cross_study'] and (source_data_name == target_data_name): + continue # only cross-study + print(f"\nSource data: {source_data_name}") + print(f"Target data: {target_data_name}") + + params['ml_data_outdir'] = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/f"split_{split}" + frm.create_outdir(outdir=params["ml_data_outdir"]) + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # Preprocess data + print("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + print(f"train_split_file: {train_split_file}") + print(f"val_split_file: {val_split_file}") + print(f"test_split_file: {test_split_file}") + print(f"ml_data_outdir: {params['ml_data_outdir']}") + if params['use_singularity']: + raise Exception('Functionality using singularity is work in progress. Please use the Python version to call preprocess by setting use_singularity=False') + + else: + preprocess_run = ["python", + params['preprocess_python_script'], + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], + "--output_dir", str(ml_data_dir), + "--y_col_name", str(params['y_col_name']) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + return {'source_data_name':source_data_name, 'split':split} + + +@python_app +def train(params, hp_model, source_data_name, split): + import subprocess + hp = hp_model[source_data_name] + if hp.__len__()==0: + raise Exception(str('Hyperparameters are not defined for ' + source_data_name)) + + model_dir = params['model_outdir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{params['target_datasets'][0]}"/ \ + f"split_{split}" + if model_dir.exists() is False: + print("\nTrain") + print(f"ml_data_dir: {ml_data_dir}") + print(f"model_dir: {model_dir}") + if params['use_singularity']: + raise Exception('Functionality using singularity is work in progress. 
Please use the Python version to call train by setting use_singularity=False') + else: + train_run = ["python", + params['train_python_script'], + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(params['epochs']), # DL-specific + "--y_col_name", str(params['y_col_name']), + "--learning_rate", str(hp['learning_rate']), + "--batch_size", str(hp['batch_size']) + ] + result = subprocess.run(train_run, capture_output=True, + text=True, check=True) + return {'source_data_name':source_data_name, 'split':split} + +@python_app +def infer(params, source_data_name, target_data_name, split): # + import subprocess + model_dir = params['model_outdir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/ \ + f"split_{split}" + infer_dir = params['infer_dir']/f"{source_data_name}-{target_data_name}"/f"split_{split}" + if params['use_singularity']: + raise Exception('Functionality using singularity is work in progress. Please use the Python version to call infer by setting use_singularity=False') + + else: + print("\nInfer") + infer_run = ["python", params['infer_python_script'], + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--y_col_name", str(params['y_col_name']) + ] + result = subprocess.run(infer_run, capture_output=True, + text=True, check=True) + return True + +############################### +####### CSA PARAMETERS ######## +############################### + +additional_definitions = CSA.additional_definitions +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_params.ini", + default_model=None, + additional_cli_section=None, + additional_definitions=additional_definitions, + required=None +) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +y_col_name = params['y_col_name'] +logger = logging.getLogger(f"{params['model_name']}") +params = frm.build_paths(params) # paths to raw data + +#Output directories for preprocess, train and infer +params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' +params['model_outdir'] = Path(params['output_dir']) / 'models' +params['infer_dir'] = Path(params['output_dir']) / 'infer' + +#Model scripts +params['preprocess_python_script'] = f"{params['model_name']}_preprocess_improve.py" +params['train_python_script'] = f"{params['model_name']}_train_improve.py" +params['infer_python_script'] = f"{params['model_name']}_infer_improve.py" + +#Read Hyperparameters file +with open(params['hyperparameters_file']) as f: + hp = json.load(f) +hp_model = hp[params['model_name']] + +########################################################################## +##################### START PARSL PARALLEL EXECUTION ##################### +########################################################################## + +##Preprocess execution with Parsl +preprocess_futures=[] +for source_data_name in params['source_datasets']: + for split in params['split']: + preprocess_futures.append(preprocess(inputs=[params, source_data_name, split])) + +##Train execution with Parsl +train_futures=[] +for future_p in preprocess_futures: + train_futures.append(train(params, hp_model, future_p.result()['source_data_name'], future_p.result()['split'])) + +##Infer execution with Parsl +infer_futures =[] +for future_t in train_futures: + for target_data_name in 
params['target_datasets']: + infer_futures.append(infer(params, future_t.result()['source_data_name'], target_data_name, future_t.result()['split'])) + +for future_i in infer_futures: + print(future_i.result()) From 8c4cf48aa2c62c81436e651fb02f7c7d0b55b56d Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 6 Sep 2024 10:55:11 -0400 Subject: [PATCH 154/254] took out default_model --- workflow_csa.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/workflow_csa.py b/workflow_csa.py index 197d29d..b9d6b91 100644 --- a/workflow_csa.py +++ b/workflow_csa.py @@ -194,10 +194,7 @@ def infer(params, source_data_name, target_data_name, split): # params = cfg.initialize_parameters( pathToModelDir=filepath, default_config="csa_params.ini", - default_model=None, - additional_cli_section=None, - additional_definitions=additional_definitions, - required=None + additional_definitions=additional_definitions ) logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) From 08fb9135e3da95ce71c6f1c1774cbda8bdbaeab9 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 6 Sep 2024 10:56:09 -0400 Subject: [PATCH 155/254] rm " from model_name --- csa_params.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csa_params.ini b/csa_params.ini index c04df57..29db875 100644 --- a/csa_params.ini +++ b/csa_params.ini @@ -7,7 +7,7 @@ hyperparameters_file = ./hyperparameters_default.json source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] split = ["0", "1", "2"] -model_name = "PathDSP" +model_name = PathDSP only_cross_study = False epochs = 100 From 05ee3ecb8ce1095539e65c23d5baa4292b0213fb Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:31:20 -0400 Subject: [PATCH 156/254] readme, setup, etc --- ...onment_082223.yml => PathDSP_env_conda.yml | 0 PathDSP_infer_improve.py | 3 +- PathDSP_preprocess_improve.py | 2 +- PathDSP_train_improve.py | 2 +- README.md | 115 ++++++------------ ...eter_definitions.py => model_params_def.py | 0 setup_improve.sh | 2 +- 7 files changed, 42 insertions(+), 82 deletions(-) rename environment_082223.yml => PathDSP_env_conda.yml (100%) rename PathDSP_parameter_definitions.py => model_params_def.py (100%) diff --git a/environment_082223.yml b/PathDSP_env_conda.yml similarity index 100% rename from environment_082223.yml rename to PathDSP_env_conda.yml diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 3bfbae5..72b0a68 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -17,6 +17,7 @@ ) from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK import improvelib.utils as frm #NCK +from model_params_def import pathdsp_infer_params file_path = os.path.dirname(os.path.realpath(__file__)) @@ -69,7 +70,7 @@ def main(args): params = cfg.initialize_parameters( file_path, default_config="PathDSP_params.txt", - additional_definitions=None) + additional_definitions=pathdsp_infer_params) test_scores = run(params) print("\nFinished inference of PathDSP model.") diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 0b024da..0e91051 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -20,7 +20,7 @@ import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK import 
improvelib.applications.drug_response_prediction.drp_utils as drp #NCK -from PathDSP_parameter_definitions import pathdsp_preprocess_params +from model_params_def import pathdsp_preprocess_params file_path = Path(__file__).resolve().parent diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index f299b1e..1a1cf6c 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -15,7 +15,7 @@ import improvelib.utils as frm #NCK from PathDSP_preprocess_improve import cal_time, preprocess -from PathDSP_parameter_definitions import pathdsp_train_params +from model_params_def import pathdsp_train_params file_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/README.md b/README.md index 178ad46..74faf51 100644 --- a/README.md +++ b/README.md @@ -1,59 +1,21 @@ # PathDSP +# GraphDRP -This is development for v0.1.0-alpha. +This repository demonstrates how to use the [IMPROVE library v0.1.0-alpha](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This repository demonstrates how to use the [IMPROVE library v0.0.3-beta](https://github.com/JDACS4C-IMPROVE/IMPROVE/tree/v0.0.3-beta) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). - -This version, tagged as `v0.0.3-beta`, is the final release before transitioning to `v0.1.0-alpha`, which introduces a new API. Version `v0.0.3-beta` and all previous releases have served as the foundation for developing essential components of the IMPROVE software stack. Subsequent releases build on this legacy with an updated API, designed to encourage broader adoption of IMPROVE and its curated models by the research community. - -A more detailed tutorial can be found [here](https://jdacs4c-improve.github.io/docs/v0.0.3-beta/content/ModelContributorGuide.html). +This version, tagged as `v0.1.0-alpha`, introduces a new API which is designed to encourage broader adoption of IMPROVE and its curated models by the research community. ## Dependencies Installation instuctions are detailed below in [Step-by-step instructions](#step-by-step-instructions). 
-Conda `yml` file [environment_082223.yml](./environment_082223.yml) +Conda `yml` file [PathDSP_env_conda](./PathDSP_env_conda.yml) ML framework: + [Torch](https://pytorch.org/) -- deep learning framework for building the prediction model IMPROVE dependencies: -+ [IMPROVE v0.0.3-beta](https://github.com/JDACS4C-IMPROVE/IMPROVE/tree/v0.0.3-beta) -+ [candle_lib](https://github.com/ECP-CANDLE/candle_lib) - IMPROVE dependency (enables various hyperparameter optimization on HPC machines) ++ [IMPROVE v0.1.0-alpha](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) @@ -98,7 +60,8 @@ csa_data/raw_data/ + `PathDSP_preprocess_improve.py` - takes benchmark data files and transforms into files for training and inference + `PathDSP_train_improve.py` - trains the PathDSP model + `PathDSP_infer_improve.py` - runs inference with the trained PathDSP model -+ `PathDSP_default_model.txt` - default parameter file ++ `model_params_def.py` - definitions of parameters that are specific to the model ++ `PathDSP_params.txt` - default parameter file @@ -108,14 +71,14 @@ csa_data/raw_data/ ``` git clone https://github.com/JDACS4C-IMPROVE/PathDSP cd PathDSP -git checkout v0.0.3-beta +git checkout develop ``` ### 2. Set computational environment Create conda env using `yml` ``` -conda env create -f environment_082223.yml -n PathDSP_env +conda env create -f PathDSP_env_conda.yml -n PathDSP_env conda activate PathDSP_env ``` @@ -134,7 +97,7 @@ This will: ### 4. Preprocess CSA benchmark data (_raw data_) to construct model input data (_ML data_) ```bash -python PathDSP_preprocess_improve.py +python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir exp_result ``` Preprocesses the CSA data and creates train, validation (val), and test datasets. @@ -143,28 +106,26 @@ Generates: * three model input data files: `train_data.txt`, `val_data.txt`, `test_data.txt` ``` -ml_data -└── gCSI - └── split_0 - ├── tmpdir_ssgsea - ├── EXP.txt - ├── cnv_data.txt - ├── CNVnet.txt - ├── DGnet.txt - ├── MUTnet.txt - ├── drug_mbit_df.txt - ├── drug_target.txt - ├── mutation_data.txt - ├── test_data.txt - ├── train_data.txt - ├── val_data.txt - └── x_data_gene_expression_scaler.gz +exp_result +├── tmpdir_ssgsea +├── EXP.txt +├── cnv_data.txt +├── CNVnet.txt +├── DGnet.txt +├── MUTnet.txt +├── drug_mbit_df.txt +├── drug_target.txt +├── mutation_data.txt +├── test_data.txt +├── train_data.txt +├── val_data.txt +└── x_data_gene_expression_scaler.gz ``` ### 5. Train PathDSP model ```bash -python PathDSP_train_improve.py +python PathDSP_train_improve.py --input_dir exp_result --output_dir exp_result ``` Trains PathDSP using the model input data: `train_data.txt` (training), `val_data.txt` (for early stopping). @@ -174,19 +135,19 @@ Generates: * predictions on val data (tabular data): `val_y_data_predicted.csv` * prediction performance scores on val data: `val_scores.json` ``` -out_models -└── gCSI - └── split_0 - ├── model.pt - ├── checkpoint.pt - ├── Val_Loss_orig.txt - ├── val_scores.json - └── val_y_data_predicted.csv +exp_result +├── model.pt +├── checkpoint.pt +├── Val_Loss_orig.txt +├── val_scores.json +└── val_y_data_predicted.csv ``` ### 6. Run inference on test data with the trained model -```python PathDSP_infer_improve.py``` +```bash +python PathDSP_infer_improve.py --input_data_dir exp_result --input_model_dir exp_result --output_dir exp_result --calc_infer_score True +``` Evaluates the performance on a test dataset with the trained model. 
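The three steps can also be chained against a single working directory. A minimal sketch, assuming the `PathDSP_env` environment is active (the `WORKDIR` variable is illustrative; the flag spelling `--calc_infer_scores` follows `params["calc_infer_scores"]` in `PathDSP_infer_improve.py`):

```bash
WORKDIR=exp_result
python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir $WORKDIR
python PathDSP_train_improve.py --input_dir $WORKDIR --output_dir $WORKDIR
python PathDSP_infer_improve.py --input_data_dir $WORKDIR --input_model_dir $WORKDIR \
    --output_dir $WORKDIR --calc_infer_scores true
```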
@@ -194,9 +155,7 @@ Generates: * predictions on test data (tabular data): `test_y_data_predicted.csv` * prediction performance scores on test data: `test_scores.json` ``` -out_infer -└── gCSI-gCSI - └── split_0 - ├── test_scores.json - └── test_y_data_predicted.csv +exp_result +├── test_scores.json +└── test_y_data_predicted.csv ``` diff --git a/PathDSP_parameter_definitions.py b/model_params_def.py similarity index 100% rename from PathDSP_parameter_definitions.py rename to model_params_def.py diff --git a/setup_improve.sh b/setup_improve.sh index 834903b..fd911a0 100644 --- a/setup_improve.sh +++ b/setup_improve.sh @@ -38,7 +38,7 @@ export AUTHOR_DATA_DIR="./$author_dir/" # Clone IMPROVE lib (if needed) pushd ../ improve_lib_path=$PWD/IMPROVE -improve_branch="v0.0.3-beta" +improve_branch="develop" if [ -d $improve_lib_path ]; then echo "IMPROVE repo exists in ${improve_lib_path}" else From f4145548bfa61ae2f97778f2b9c8013cccdac160 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:04:36 -0400 Subject: [PATCH 157/254] adjusting csa parameters for test --- csa_params.ini | 4 ++-- workflow_csa.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csa_params.ini b/csa_params.ini index 29db875..cea4cbd 100644 --- a/csa_params.ini +++ b/csa_params.ini @@ -4,8 +4,8 @@ output_dir=./improve_output y_col_name = auc use_singularity = False hyperparameters_file = ./hyperparameters_default.json -source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] -target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +source_datasets = ["gCSI", "CCLE"] +target_datasets = ["gCSI", "CCLE"] split = ["0", "1", "2"] model_name = PathDSP only_cross_study = False diff --git a/workflow_csa.py b/workflow_csa.py index b9d6b91..3ba736f 100644 --- a/workflow_csa.py +++ b/workflow_csa.py @@ -30,7 +30,7 @@ cpu_affinity="block", #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? 
worker_debug=True, - available_accelerators=8, ## CHANGE THIS AS REQUIRED BY THE MACHINE + available_accelerators=4, ## CHANGE THIS AS REQUIRED BY THE MACHINE worker_port_range=worker_port_range, provider=LocalProvider( init_blocks=1, From 65b0391caaa9ff8b1173602064b7cfab5ded6459 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:45:22 -0400 Subject: [PATCH 158/254] hyperparam defaults added --- hyperparameters_default.json | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/hyperparameters_default.json b/hyperparameters_default.json index 44778fb..a0ca25d 100644 --- a/hyperparameters_default.json +++ b/hyperparameters_default.json @@ -58,10 +58,25 @@ } }, "PathDSP": { - "CCLE": {}, - "CTRPv2": {}, - "GDSCv1": {}, - "GDSCv2": {}, - "gCSI": {} + "CCLE": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "CTRPv2": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "GDSCv1": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "GDSCv2": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "gCSI": { + "batch_size": 12, + "learning_rate": 0.0004 + } } } \ No newline at end of file From c363fd1aa20b1daeb9ccbbef1b4c84a821907d76 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:03:55 -0400 Subject: [PATCH 159/254] updated csa_workflow --- csa_params.ini | 1 + csa_params_def.py | 6 +++++ workflow_csa.py | 60 +++++++++++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/csa_params.ini b/csa_params.ini index cea4cbd..5463502 100644 --- a/csa_params.ini +++ b/csa_params.ini @@ -10,6 +10,7 @@ split = ["0", "1", "2"] model_name = PathDSP only_cross_study = False epochs = 100 +available_accelerators=["0","1"] [Preprocess] diff --git a/csa_params_def.py b/csa_params_def.py index b5c87ab..9e8930d 100644 --- a/csa_params_def.py +++ b/csa_params_def.py @@ -38,6 +38,12 @@ "default": 10, "help": "Number of epochs" }, + {"name": "available_accelerators", + "nargs" : "+", + "type": str, + "default": ["0", "1"], + "help": "GPU IDs to assign jobs" + }, {"name": "use_singularity", "type": bool, "default": True, diff --git a/workflow_csa.py b/workflow_csa.py index 3ba736f..c010c63 100644 --- a/workflow_csa.py +++ b/workflow_csa.py @@ -15,6 +15,15 @@ import improvelib.utils as frm from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig +# Initialize parameters for CSA +additional_definitions = CSA.additional_definitions +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_params.ini", + additional_definitions=additional_definitions +) ##### CONFIG FOR LAMBDA ###### #available_accelerators: Union[int, Sequence[str]] = 8 @@ -30,7 +39,7 @@ cpu_affinity="block", #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? worker_debug=True, - available_accelerators=4, ## CHANGE THIS AS REQUIRED BY THE MACHINE + available_accelerators=params['available_accelerators'], worker_port_range=worker_port_range, provider=LocalProvider( init_blocks=1, @@ -115,8 +124,17 @@ def build_split_fname(source_data_name, split, phase): print(f"test_split_file: {test_split_file}") print(f"ml_data_outdir: {params['ml_data_outdir']}") if params['use_singularity']: - raise Exception('Functionality using singularity is work in progress. 
Please use the Python version to call preprocess by setting use_singularity=False') - + preprocess_run = ["singularity", "exec", "--nv", + params['singularity_image'], "preprocess.sh", + str("--train_split_file " + str(train_split_file)), + str("--val_split_file " + str(val_split_file)), + str("--test_split_file " + str(test_split_file)), + str("--input_dir " + params['input_dir']), + str("--output_dir " + str(ml_data_dir)), + str("--y_col_name " + str(params['y_col_name'])) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) else: preprocess_run = ["python", params['preprocess_python_script'], @@ -147,10 +165,20 @@ def train(params, hp_model, source_data_name, split): print(f"ml_data_dir: {ml_data_dir}") print(f"model_dir: {model_dir}") if params['use_singularity']: - raise Exception('Functionality using singularity is work in progress. Please use the Python version to call train by setting use_singularity=False') + train_run = ["singularity", "exec", "--nv", + params['singularity_image'], "train.sh", + str("--input_dir " + str(ml_data_dir)), + str("--output_dir " + str(model_dir)), + str("--epochs " + str(params['epochs'])), + str("--y_col_name " + str(params['y_col_name'])), + str("--learning_rate " + str(hp['learning_rate'])), + str("--batch_size " + str(hp['batch_size'])) + ] + result = subprocess.run(train_run, capture_output=True, + text=True, check=True) else: train_run = ["python", - params['train_python_script'], + params['train_python_script'], "--input_dir", str(ml_data_dir), "--output_dir", str(model_dir), "--epochs", str(params['epochs']), # DL-specific @@ -170,8 +198,15 @@ def infer(params, source_data_name, target_data_name, split): # f"split_{split}" infer_dir = params['infer_dir']/f"{source_data_name}-{target_data_name}"/f"split_{split}" if params['use_singularity']: - raise Exception('Functionality using singularity is work in progress. 
Please use the Python version to call infer by setting use_singularity=False') - + infer_run = ["singularity", "exec", "--nv", + params['singularity_image'], "infer.sh", + str("--input_data_dir " + str(ml_data_dir)), + str("--input_model_dir " + str(model_dir)), + str("--output_dir " + str(infer_dir)), + str("--y_col_name " + str(params['y_col_name'])) + ] + result = subprocess.run(infer_run, capture_output=True, + text=True, check=True) else: print("\nInfer") infer_run = ["python", params['infer_python_script'], @@ -188,15 +223,6 @@ def infer(params, source_data_name, target_data_name, split): # ####### CSA PARAMETERS ######## ############################### -additional_definitions = CSA.additional_definitions -filepath = Path(__file__).resolve().parent -cfg = DRPPreprocessConfig() -params = cfg.initialize_parameters( - pathToModelDir=filepath, - default_config="csa_params.ini", - additional_definitions=additional_definitions -) - logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) fdir = Path(__file__).resolve().parent y_col_name = params['y_col_name'] @@ -240,4 +266,4 @@ def infer(params, source_data_name, target_data_name, split): # infer_futures.append(infer(params, future_t.result()['source_data_name'], target_data_name, future_t.result()['split'])) for future_i in infer_futures: - print(future_i.result()) + print(future_i.result()) \ No newline at end of file From 487312b4c89b5bbefb0e42e738799d34e2493ea5 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:30:39 -0400 Subject: [PATCH 160/254] fixed readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 74faf51..a19376d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ # PathDSP -# GraphDRP This repository demonstrates how to use the [IMPROVE library v0.1.0-alpha](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). 
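For reference, the CSA workflow added in the patches above indexes the hyperparameter JSON files first by model name and then by source dataset, and its train app raises when the selected entry is empty. The sketch below is a standalone illustration of that lookup with an explicit fallback to the `PathDSP_params.txt` defaults instead of raising; the helper name and the fallback behavior are illustrative, not part of the workflow:

```python
import json

def lookup_hyperparameters(hp_file, model_name, source_data_name, fallback=None):
    """Mirror workflow_csa.py's lookup hp[model_name][source_data_name],
    but fall back to default values when no tuned entry exists."""
    fallback = fallback or {"batch_size": 12, "learning_rate": 0.0004}
    with open(hp_file) as f:
        hp = json.load(f)
    per_dataset = hp.get(model_name, {}).get(source_data_name, {})
    # Empty dicts (e.g. CTRPv2 under PathDSP in hyperparameters_hpo.json) mean "not tuned".
    return per_dataset if per_dataset else fallback

# Example:
# lookup_hyperparameters("hyperparameters_hpo.json", "PathDSP", "gCSI")
# -> {"batch_size": 128, "learning_rate": 0.00024319069525915603}
```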
From 6f3e19dbeefa19be1a826dfc12466606e8fa8bcf Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:49:39 -0400 Subject: [PATCH 161/254] fix author_data in params --- PathDSP_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index eb3c902..6a1356c 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,6 +1,6 @@ [Preprocess] data_format = .txt -input_supp_data_dir = ../author_data +input_supp_data_dir = ./author_data train_split_file = gCSI_split_0_train.txt val_split_file = gCSI_split_0_val.txt test_split_file = gCSI_split_0_test.txt From 280192f58f0a6d9cc4ae75286f1ac4ee93f7362b Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:50:29 -0400 Subject: [PATCH 162/254] don't return test scores --- PathDSP_infer_improve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 72b0a68..ad91d27 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -63,7 +63,7 @@ def run(params): ) print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) - return test_scores + return True def main(args): cfg = DRPInferConfig() @@ -71,7 +71,7 @@ def main(args): file_path, default_config="PathDSP_params.txt", additional_definitions=pathdsp_infer_params) - test_scores = run(params) + if_ran = run(params) print("\nFinished inference of PathDSP model.") From 1266c91925cb7cc304b302689ba90aee720b5b91 Mon Sep 17 00:00:00 2001 From: Sara Jones Date: Wed, 11 Sep 2024 12:09:34 -0700 Subject: [PATCH 163/254] add and change files for csa parsl workflow --- csa_params.ini | 9 ++- csa_params_def.py | 5 ++ execute_in_conda.sh | 11 +++ workflow_csa.py | 114 +++----------------------- workflow_preprocess.py | 179 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 211 insertions(+), 107 deletions(-) create mode 100644 execute_in_conda.sh create mode 100644 workflow_preprocess.py diff --git a/csa_params.ini b/csa_params.ini index 5463502..2fad301 100644 --- a/csa_params.ini +++ b/csa_params.ini @@ -4,13 +4,14 @@ output_dir=./improve_output y_col_name = auc use_singularity = False hyperparameters_file = ./hyperparameters_default.json -source_datasets = ["gCSI", "CCLE"] +source_datasets = ["gCSI"] target_datasets = ["gCSI", "CCLE"] -split = ["0", "1", "2"] +split = ["0", "1"] model_name = PathDSP only_cross_study = False -epochs = 100 -available_accelerators=["0","1"] +epochs = 3 +available_accelerators=["4","5"] +model_environment = PathDSP_env [Preprocess] diff --git a/csa_params_def.py b/csa_params_def.py index 9e8930d..75daeb5 100644 --- a/csa_params_def.py +++ b/csa_params_def.py @@ -28,6 +28,11 @@ "default": 'graphdrp', ## Change the default to LGBM?? "help": "Name of the deep learning model" }, + {"name": "model_environment", + "type": str, + "default": '', ## Change the default to LGBM?? 
+ "help": "Name of your model conda environment" + }, {"name": "hyperparameters_file", "type": str, "default": 'hyperparameters_default.json', diff --git a/execute_in_conda.sh b/execute_in_conda.sh new file mode 100644 index 0000000..303f81b --- /dev/null +++ b/execute_in_conda.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +model_env=$1 ; shift +script=$1 ; shift +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $model_env +CMD="python ${script} $@" + +echo "running command ${CMD}" + +$CMD diff --git a/workflow_csa.py b/workflow_csa.py index c010c63..edc9dd7 100644 --- a/workflow_csa.py +++ b/workflow_csa.py @@ -61,95 +61,6 @@ ################################ PARSL APPS ################################## ############################################################################## -@python_app -def preprocess(inputs=[]): # - import warnings - import subprocess - import improvelib.utils as frm - def build_split_fname(source_data_name, split, phase): - """ Build split file name. If file does not exist continue """ - if split=='all': - return f"{source_data_name}_{split}.txt" - return f"{source_data_name}_split_{split}_{phase}.txt" - params=inputs[0] - source_data_name=inputs[1] - split=inputs[2] - - split_nums=params['split'] - # Get the split file paths - if len(split_nums) == 0: - # Get all splits - split_files = list((params['splits_path']).glob(f"{source_data_name}_split_*.txt")) - split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] - split_nums = sorted(set(split_nums)) - else: - split_files = [] - for s in split_nums: - split_files.extend(list((params['splits_path']).glob(f"{source_data_name}_split_{s}_*.txt"))) - files_joined = [str(s) for s in split_files] - - print(f"Split id {split} out of {len(split_nums)} splits.") - # Check that train, val, and test are available. Otherwise, continue to the next split. 
- for phase in ["train", "val", "test"]: - fname = build_split_fname(source_data_name, split, phase) - if fname not in "\t".join(files_joined): - warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") - continue - - for target_data_name in params['target_datasets']: - ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/ \ - f"split_{split}" - if ml_data_dir.exists() is True: - continue - if params['only_cross_study'] and (source_data_name == target_data_name): - continue # only cross-study - print(f"\nSource data: {source_data_name}") - print(f"Target data: {target_data_name}") - - params['ml_data_outdir'] = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/f"split_{split}" - frm.create_outdir(outdir=params["ml_data_outdir"]) - if source_data_name == target_data_name: - # If source and target are the same, then infer on the test split - test_split_file = f"{source_data_name}_split_{split}_test.txt" - else: - # If source and target are different, then infer on the entire target dataset - test_split_file = f"{target_data_name}_all.txt" - - # Preprocess data - print("\nPreprocessing") - train_split_file = f"{source_data_name}_split_{split}_train.txt" - val_split_file = f"{source_data_name}_split_{split}_val.txt" - print(f"train_split_file: {train_split_file}") - print(f"val_split_file: {val_split_file}") - print(f"test_split_file: {test_split_file}") - print(f"ml_data_outdir: {params['ml_data_outdir']}") - if params['use_singularity']: - preprocess_run = ["singularity", "exec", "--nv", - params['singularity_image'], "preprocess.sh", - str("--train_split_file " + str(train_split_file)), - str("--val_split_file " + str(val_split_file)), - str("--test_split_file " + str(test_split_file)), - str("--input_dir " + params['input_dir']), - str("--output_dir " + str(ml_data_dir)), - str("--y_col_name " + str(params['y_col_name'])) - ] - result = subprocess.run(preprocess_run, capture_output=True, - text=True, check=True) - else: - preprocess_run = ["python", - params['preprocess_python_script'], - "--train_split_file", str(train_split_file), - "--val_split_file", str(val_split_file), - "--test_split_file", str(test_split_file), - "--input_dir", params['input_dir'], - "--output_dir", str(ml_data_dir), - "--y_col_name", str(params['y_col_name']) - ] - result = subprocess.run(preprocess_run, capture_output=True, - text=True, check=True) - return {'source_data_name':source_data_name, 'split':split} - - @python_app def train(params, hp_model, source_data_name, split): import subprocess @@ -177,7 +88,7 @@ def train(params, hp_model, source_data_name, split): result = subprocess.run(train_run, capture_output=True, text=True, check=True) else: - train_run = ["python", + train_run = ["bash", "execute_in_conda.sh",params['model_environment'], params['train_python_script'], "--input_dir", str(ml_data_dir), "--output_dir", str(model_dir), @@ -185,7 +96,7 @@ def train(params, hp_model, source_data_name, split): "--y_col_name", str(params['y_col_name']), "--learning_rate", str(hp['learning_rate']), "--batch_size", str(hp['batch_size']) - ] + ] result = subprocess.run(train_run, capture_output=True, text=True, check=True) return {'source_data_name':source_data_name, 'split':split} @@ -203,18 +114,21 @@ def infer(params, source_data_name, target_data_name, split): # str("--input_data_dir " + str(ml_data_dir)), str("--input_model_dir " + str(model_dir)), str("--output_dir " + str(infer_dir)), + str("--calc_infer_scores "+ "true"), str("--y_col_name 
" + str(params['y_col_name'])) ] result = subprocess.run(infer_run, capture_output=True, text=True, check=True) else: print("\nInfer") - infer_run = ["python", params['infer_python_script'], + infer_run = ["bash", "execute_in_conda.sh",params['model_environment'], + params['infer_python_script'], "--input_data_dir", str(ml_data_dir), "--input_model_dir", str(model_dir), "--output_dir", str(infer_dir), + "--calc_infer_scores", "true", "--y_col_name", str(params['y_col_name']) - ] + ] result = subprocess.run(infer_run, capture_output=True, text=True, check=True) return True @@ -235,7 +149,6 @@ def infer(params, source_data_name, target_data_name, split): # params['infer_dir'] = Path(params['output_dir']) / 'infer' #Model scripts -params['preprocess_python_script'] = f"{params['model_name']}_preprocess_improve.py" params['train_python_script'] = f"{params['model_name']}_train_improve.py" params['infer_python_script'] = f"{params['model_name']}_infer_improve.py" @@ -248,16 +161,11 @@ def infer(params, source_data_name, target_data_name, split): # ##################### START PARSL PARALLEL EXECUTION ##################### ########################################################################## -##Preprocess execution with Parsl -preprocess_futures=[] -for source_data_name in params['source_datasets']: - for split in params['split']: - preprocess_futures.append(preprocess(inputs=[params, source_data_name, split])) - ##Train execution with Parsl train_futures=[] -for future_p in preprocess_futures: - train_futures.append(train(params, hp_model, future_p.result()['source_data_name'], future_p.result()['split'])) +for source_data_name in params['source_datasets']: + for split in params['split']: + train_futures.append(train(params, hp_model, source_data_name, split)) ##Infer execution with Parsl infer_futures =[] @@ -266,4 +174,4 @@ def infer(params, source_data_name, target_data_name, split): # infer_futures.append(infer(params, future_t.result()['source_data_name'], target_data_name, future_t.result()['split'])) for future_i in infer_futures: - print(future_i.result()) \ No newline at end of file + print(future_i.result()) diff --git a/workflow_preprocess.py b/workflow_preprocess.py new file mode 100644 index 0000000..1a5f6cb --- /dev/null +++ b/workflow_preprocess.py @@ -0,0 +1,179 @@ +import parsl +from parsl import python_app +import subprocess +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import LocalProvider +from time import time +from typing import Sequence, Tuple, Union +from pathlib import Path +import logging +import sys +import json + +import csa_params_def as CSA +import improvelib.utils as frm +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# Initialize parameters for CSA +additional_definitions = CSA.additional_definitions +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_params.ini", + additional_definitions=additional_definitions +) + +##### CONFIG FOR LAMBDA ###### +#available_accelerators: Union[int, Sequence[str]] = 8 +worker_port_range: Tuple[int, int] = (10000, 20000) +retries: int = 1 + +config_lambda = Config( + retries=retries, + executors=[ + HighThroughputExecutor( + address='127.0.0.1', + label="htex_preprocess", + cpu_affinity="alternating", + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? 
+ worker_debug=True, + worker_port_range=worker_port_range, + provider=LocalProvider( + init_blocks=1, + max_blocks=1, + ), + ) + ], + strategy='simple', +) + +parsl.clear() +parsl.load(config_lambda) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +logger = logging.getLogger(f'Start workflow') + +############################################################################## +################################ PARSL APPS ################################## +############################################################################## + +@python_app +def preprocess(inputs=[]): # + import warnings + import subprocess + import improvelib.utils as frm + def build_split_fname(source_data_name, split, phase): + """ Build split file name. If file does not exist continue """ + if split=='all': + return f"{source_data_name}_{split}.txt" + return f"{source_data_name}_split_{split}_{phase}.txt" + params=inputs[0] + source_data_name=inputs[1] + split=inputs[2] + + split_nums=params['split'] + # Get the split file paths + if len(split_nums) == 0: + # Get all splits + split_files = list((params['splits_path']).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + else: + split_files = [] + for s in split_nums: + split_files.extend(list((params['splits_path']).glob(f"{source_data_name}_split_{s}_*.txt"))) + files_joined = [str(s) for s in split_files] + + print(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. Otherwise, continue to the next split. + for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + for target_data_name in params['target_datasets']: + ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/ \ + f"split_{split}" + if ml_data_dir.exists() is True: + continue + if params['only_cross_study'] and (source_data_name == target_data_name): + continue # only cross-study + print(f"\nSource data: {source_data_name}") + print(f"Target data: {target_data_name}") + + params['ml_data_outdir'] = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/f"split_{split}" + frm.create_outdir(outdir=params["ml_data_outdir"]) + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # Preprocess data + print("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + print(f"train_split_file: {train_split_file}") + print(f"val_split_file: {val_split_file}") + print(f"test_split_file: {test_split_file}") + print(f"ml_data_outdir: {params['ml_data_outdir']}") + if params['use_singularity']: + preprocess_run = ["singularity", "exec", "--nv", + params['singularity_image'], "preprocess.sh", + str("--train_split_file " + str(train_split_file)), + str("--val_split_file " + str(val_split_file)), + str("--test_split_file " + str(test_split_file)), + str("--input_dir " + params['input_dir']), + str("--output_dir " + str(ml_data_dir)), + 
str("--y_col_name " + str(params['y_col_name'])) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + else: + preprocess_run = ["bash", "execute_in_conda.sh",params['model_environment'], + params['preprocess_python_script'], + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], + "--output_dir", str(ml_data_dir), + "--y_col_name", str(params['y_col_name']) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + return {'source_data_name':source_data_name, 'split':split} + + +############################### +####### CSA PARAMETERS ######## +############################### + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +y_col_name = params['y_col_name'] +logger = logging.getLogger(f"{params['model_name']}") +params = frm.build_paths(params) # paths to raw data + +#Output directories for preprocess, train and infer +params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' + +#Model scripts +params['preprocess_python_script'] = f"{params['model_name']}_preprocess_improve.py" + +########################################################################## +##################### START PARSL PARALLEL EXECUTION ##################### +########################################################################## + +##Preprocess execution with Parsl +preprocess_futures=[] +for source_data_name in params['source_datasets']: + for split in params['split']: + preprocess_futures.append(preprocess(inputs=[params, source_data_name, split])) + +for future_p in preprocess_futures: + print(future_p.result()) \ No newline at end of file From 8063bd129f5bd73165ba4bfa35d95a9d498298bd Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:05:52 -0400 Subject: [PATCH 164/254] epochs back up --- PathDSP_params.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 6a1356c..7a301ec 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -22,7 +22,8 @@ exp_file = EXP.txt data_format = .txt model_file_name = model model_file_format = .pt -epochs = 3 +#epochs = 3 +epochs = 800 learning_rate = 0.0004 batch_size = 12 val_batch = 12 From e9c5b7adcd2907fce708ebcf755fcb2cce9d6c68 Mon Sep 17 00:00:00 2001 From: Sara Jones Date: Fri, 13 Sep 2024 08:38:46 -0700 Subject: [PATCH 165/254] update csa_params.ini for full CSA with parsl --- PathDSP_params.txt | 4 ++-- csa_params.ini | 11 ++++++----- csa_params.test.ini | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) create mode 100644 csa_params.test.ini diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 7a301ec..2f33790 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -22,7 +22,7 @@ exp_file = EXP.txt data_format = .txt model_file_name = model model_file_format = .pt -#epochs = 3 +#epochs = 20 epochs = 800 learning_rate = 0.0004 batch_size = 12 @@ -30,7 +30,7 @@ val_batch = 12 loss = mse patience = 30 y_col_name = auc -cuda_name = cuda:0 +cuda_name = cuda:5 dropout = 0.1 [Infer] diff --git a/csa_params.ini b/csa_params.ini index 2fad301..4e0d99e 100644 --- a/csa_params.ini +++ b/csa_params.ini @@ -4,15 +4,16 @@ output_dir=./improve_output y_col_name = auc use_singularity = False hyperparameters_file = ./hyperparameters_default.json -source_datasets = ["gCSI"] 
-target_datasets = ["gCSI", "CCLE"] -split = ["0", "1"] +source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +split = ["0","1","2","3","4","5","6","7","8","9"] model_name = PathDSP only_cross_study = False -epochs = 3 -available_accelerators=["4","5"] +epochs = 800 +available_accelerators=["0","1","2","3","4","5","6","7"] model_environment = PathDSP_env + [Preprocess] [Train] diff --git a/csa_params.test.ini b/csa_params.test.ini new file mode 100644 index 0000000..2fad301 --- /dev/null +++ b/csa_params.test.ini @@ -0,0 +1,20 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +output_dir=./improve_output +y_col_name = auc +use_singularity = False +hyperparameters_file = ./hyperparameters_default.json +source_datasets = ["gCSI"] +target_datasets = ["gCSI", "CCLE"] +split = ["0", "1"] +model_name = PathDSP +only_cross_study = False +epochs = 3 +available_accelerators=["4","5"] +model_environment = PathDSP_env + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file From 127a9e6e6784bacf4fe48494950980f94b31776c Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 13 Sep 2024 14:08:56 -0400 Subject: [PATCH 166/254] brute force csa --- csa_bruteforce_wf.py | 270 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 csa_bruteforce_wf.py diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py new file mode 100644 index 0000000..0e3670d --- /dev/null +++ b/csa_bruteforce_wf.py @@ -0,0 +1,270 @@ +""" Python implementation of cross-study analysis workflow """ +model_name = 'PathDSP' # Note! Change this for your model. +cuda_name = "cuda:0" +# cuda_name = "cuda:7" + +import os +import subprocess +import warnings +from time import time +from pathlib import Path + +import pandas as pd + +# IMPROVE imports +# from improvelib.initializer.config import Config +# from improvelib.initializer.stage_config import PreprocessConfig, TrainConfig, InferConfig +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig +# from improvelib.applications.drug_response_prediction.config import DRPTrainConfig +# from improvelib.applications.drug_response_prediction.config import DRPInferConfig +import improvelib.utils as frm + + +def build_split_fname(source: str, split: int, phase: str): + """ Build split file name. If file does not exist continue """ + return f"{source_data_name}_split_{split}_{phase}.txt" + + +class Timer: + """ Measure time. """ + def __init__(self): + self.start = time() + + def timer_end(self): + self.end = time() + return self.end - self.start + + def display_timer(self, print_fn=print): + time_diff = self.timer_end() + if (time_diff) // 3600 > 0: + print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) ) + else: + print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) ) + + +filepath = Path(__file__).resolve().parent + +print_fn = print +print_fn(f"File path: {filepath}") + +# =============================================================== +### CSA settings +# =============================================================== +# TODO make it work! +# cfg = Config() +# params = cfg.initialize_parameters( +# pathToModelDir=filepath, +# section='DEFAULT', +# default_config="csa_params.txt", +# default_model=None, +# additional_definitions=None, +# required=None +# ) +# params = frm.build_paths(params) + +cfg = DRPPreprocessConfig() # TODO submit github issue; too many logs printed; is it necessary? 
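# Illustrative note: initialize_parameters() reads the config file named by
# default_config (typically merged with command-line overrides) into a flat
# params dict, so keys such as params['input_dir'], params['y_col_name'] and
# params['splits_dir'] become available to the workflow below.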
+params = cfg.initialize_parameters( + pathToModelDir=filepath, + # default_config="csa_params.txt", + default_config="csa_params.ini", + additional_cli_section=None, + additional_definitions=None, + required=None +) +params = frm.build_paths(params) # TODO move this to improvelib + +# Model scripts +preprocess_python_script = f'{model_name}_preprocess_improve.py' +train_python_script = f'{model_name}_train_improve.py' +infer_python_script = f'{model_name}_infer_improve.py' + +# Specify dirs +# y_col_name = "auc" +y_col_name = params['y_col_name'] +# maindir = Path(f"./{y_col_name}") +# maindir = Path(f"./0_{y_col_name}_improvelib") # main output dir +MAIN_CSA_OUTDIR = Path(f"./run.csa.full") # main output dir +# Note! ML data and trained model should be saved to the same dir for inference script +MAIN_ML_DATA_DIR = MAIN_CSA_OUTDIR / 'ml_data' # output_dir_pp, input_dir_train, input_dir_infer +MAIN_MODEL_DIR = MAIN_CSA_OUTDIR / 'models' # output_dir_train, input_dir_infer +MAIN_INFER_DIR = MAIN_CSA_OUTDIR / 'infer' # output_dir infer + +# Note! Here input_dir is the location of benchmark data +# TODO Should we set input_dir (and output_dir) for each models scrit? +splits_dir = Path(params['input_dir']) / params['splits_dir'] + +### Source and target data sources +## Set 1 - full analysis +source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["CCLE", "gCSI", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +# source_datasets = ["CCLE", "GDSCv1"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + +only_cross_study = False +# only_cross_study = True + +## Splits +split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +# split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 9] + +## Parameters of the experiment/run/workflow +# epochs = 2 +# epochs = 30 +# epochs = 50 +# epochs = 70 +# epochs = 100 +# epochs = 150 +epochs = 200 + + +# =============================================================== +### Generate CSA results (within- and cross-study) +# =============================================================== + +timer = Timer() +# Iterate over source datasets +# Note! 
The "source_data_name" iterations are independent of each other +print_fn(f"\nsource_datasets: {source_datasets}") +print_fn(f"target_datasets: {target_datasets}") +print_fn(f"split_nums: {split_nums}") +# breakpoint() +for source_data_name in source_datasets: + + # Get the split file paths + # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt + if len(split_nums) == 0: + # Get all splits + split_files = list((splits_dir).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + # num_splits = 1 + else: + # Use the specified splits + split_files = [] + for s in split_nums: + split_files.extend(list((splits_dir).glob(f"{source_data_name}_split_{s}_*.txt"))) + + files_joined = [str(s) for s in split_files] + + # -------------------- + # Preprocess and Train + # -------------------- + for split in split_nums: + print_fn(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. Otherwise, continue to the next split. + for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + for target_data_name in target_datasets: + if only_cross_study and (source_data_name == target_data_name): + continue # only cross-study + print_fn(f"\nSource data: {source_data_name}") + print_fn(f"Target data: {target_data_name}") + + ml_data_dir = MAIN_ML_DATA_DIR / f"{source_data_name}-{target_data_name}" / \ + f"split_{split}" + model_dir = MAIN_MODEL_DIR / f"{source_data_name}" / f"split_{split}" + infer_dir = MAIN_INFER_DIR / f"{source_data_name}-{target_data_name}" / \ + f"split_{split}" # AP + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # p1 (none): Preprocess train data + # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # placeholder for LC + timer_preprocess = Timer() + # ml_data_path = graphdrp_preprocess_improve.main([ + # "--train_split_file", f"{source_data_name}_split_{split}_train.txt", + # "--val_split_file", f"{source_data_name}_split_{split}_val.txt", + # "--test_split_file", str(test_split_file_name), + # "--input_dir", str(input_dir), + # "--output_dir", str(output_dir), + # "--y_col_name", y_col_name + # ]) + print_fn("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + print_fn(f"train_split_file: {train_split_file}") + print_fn(f"val_split_file: {val_split_file}") + print_fn(f"test_split_file: {test_split_file}") + preprocess_run = ["python", preprocess_python_script, + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], # str("./csa_data/raw_data"), + "--output_dir", str(ml_data_dir), + "--y_col_name", str(y_col_name) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + 
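            # Note (illustrative): because check=True is set, a non-zero exit from
            # the preprocessing script raises subprocess.CalledProcessError. One
            # way to surface the captured logs before failing would be, e.g.:
            #   try:
            #       result = subprocess.run(preprocess_run, capture_output=True,
            #                               text=True, check=True)
            #   except subprocess.CalledProcessError as err:
            #       print(err.stdout, err.stderr)
            #       raise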
timer_preprocess.display_timer(print_fn) + + # p2 (p1): Train model + # Train a single model for a given [source, split] pair + # Train using train samples and early stop using val samples + if model_dir.exists() is False: + timer_train = Timer() + print_fn("\nTrain") + print_fn(f"ml_data_dir: {ml_data_dir}") + print_fn(f"model_dir: {model_dir}") + train_run = ["python", train_python_script, + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(epochs), # DL-specific + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, capture_output=True, + text=True, check=True) + timer_train.display_timer(print_fn) + + # Infer + # p3 (p1, p2): Inference + timer_infer = Timer() + print_fn("\nInfer") + infer_run = ["python", infer_python_script, + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name, + "--calc_infer_scores", "true" + ] + result = subprocess.run(infer_run, capture_output=True, + text=True, check=True) + timer_infer.display_timer(print_fn) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +timer.display_timer(print_fn) +print_fn('Finished full cross-study run.') \ No newline at end of file From c2db430fb041c4ddf85475bbbb5266dbc7cd7c3c Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Sun, 15 Sep 2024 14:49:56 -0400 Subject: [PATCH 167/254] bruteforce csa with parameters --- csa_bruteforce_params.ini | 52 ++++++++++++++++++++++++ csa_bruteforce_params_def.py | 46 +++++++++++++++++++++ csa_bruteforce_wf.py | 77 +++++++----------------------------- 3 files changed, 113 insertions(+), 62 deletions(-) create mode 100644 csa_bruteforce_params.ini create mode 100644 csa_bruteforce_params_def.py diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini new file mode 100644 index 0000000..6ba0249 --- /dev/null +++ b/csa_bruteforce_params.ini @@ -0,0 +1,52 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +output_dir=./bruteforce_output +y_col_name = auc +source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +split = ["0"] +model_name = PathDSP +only_cross_study = False +epochs = 800 + +### Source and target data sources +## Set 1 - full analysis +source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["CCLE", "gCSI", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +# source_datasets = ["CCLE", "GDSCv1"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + +only_cross_study = False +# only_cross_study = True + +## Splits +split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +# split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 
9] + + + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/csa_bruteforce_params_def.py b/csa_bruteforce_params_def.py new file mode 100644 index 0000000..dba9d42 --- /dev/null +++ b/csa_bruteforce_params_def.py @@ -0,0 +1,46 @@ +csa_bruteforce_params = [ + {"name": "cuda_name", + "type": str, + "default": "cuda:0", + "help": "Cuda devide name.", + }, + {"name": "csa_outdir", + "type": str, + "default": "./run.csa.full", + "help": "Outdir for workflow.", + }, + {"name": "source_datasets", + "nargs" : "+", + "type": str, + "default": ['CCLE'], + "help": "source_datasets for cross study analysis" + }, + {"name": "target_datasets", + "nargs" : "+", + "type": str, + "default": ["CCLE", "gCSI"], + "help": "target_datasets for cross study analysis" + }, + {"name": "split", + "nargs" : "+", + "type": str, + "default": ['0'], + "help": "Split of the source datasets for CSA" + }, + {"name": "only_cross_study", + "type": bool, + "default": False, + "help": "If only cross study analysis is needed" + }, + {"name": "model_name", + "type": str, + "default": 'graphdrp', ## Change the default to LGBM?? + "help": "Name of the deep learning model" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + +] \ No newline at end of file diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 0e3670d..bc285ba 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -1,7 +1,5 @@ """ Python implementation of cross-study analysis workflow """ -model_name = 'PathDSP' # Note! Change this for your model. -cuda_name = "cuda:0" -# cuda_name = "cuda:7" + import os import subprocess @@ -18,6 +16,7 @@ # from improvelib.applications.drug_response_prediction.config import DRPTrainConfig # from improvelib.applications.drug_response_prediction.config import DRPInferConfig import improvelib.utils as frm +from csa_bruteforce_params_def import csa_bruteforce_params def build_split_fname(source: str, split: int, phase: str): @@ -50,40 +49,29 @@ def display_timer(self, print_fn=print): # =============================================================== ### CSA settings # =============================================================== -# TODO make it work! -# cfg = Config() -# params = cfg.initialize_parameters( -# pathToModelDir=filepath, -# section='DEFAULT', -# default_config="csa_params.txt", -# default_model=None, -# additional_definitions=None, -# required=None -# ) -# params = frm.build_paths(params) + cfg = DRPPreprocessConfig() # TODO submit github issue; too many logs printed; is it necessary? params = cfg.initialize_parameters( pathToModelDir=filepath, - # default_config="csa_params.txt", - default_config="csa_params.ini", - additional_cli_section=None, - additional_definitions=None, + default_config="csa_bruteforce_params.ini", + additional_definitions=csa_bruteforce_params, required=None ) params = frm.build_paths(params) # TODO move this to improvelib # Model scripts +model_name = params["model_name"] preprocess_python_script = f'{model_name}_preprocess_improve.py' train_python_script = f'{model_name}_train_improve.py' infer_python_script = f'{model_name}_infer_improve.py' # Specify dirs -# y_col_name = "auc" + y_col_name = params['y_col_name'] # maindir = Path(f"./{y_col_name}") # maindir = Path(f"./0_{y_col_name}_improvelib") # main output dir -MAIN_CSA_OUTDIR = Path(f"./run.csa.full") # main output dir +MAIN_CSA_OUTDIR = Path(params["csa_outdir"]) # main output dir # Note! 
ML data and trained model should be saved to the same dir for inference script MAIN_ML_DATA_DIR = MAIN_CSA_OUTDIR / 'ml_data' # output_dir_pp, input_dir_train, input_dir_infer MAIN_MODEL_DIR = MAIN_CSA_OUTDIR / 'models' # output_dir_train, input_dir_infer @@ -93,48 +81,13 @@ def display_timer(self, print_fn=print): # TODO Should we set input_dir (and output_dir) for each models scrit? splits_dir = Path(params['input_dir']) / params['splits_dir'] -### Source and target data sources -## Set 1 - full analysis -source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] -target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] -## Set 2 - smaller datasets -# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] -# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] -# source_datasets = ["CCLE", "gCSI", "GDSCv2"] -# target_datasets = ["CCLE", "gCSI", "GDSCv2"] -# source_datasets = ["CCLE", "GDSCv1"] -# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] -## Set 3 - full analysis for a single source -# source_datasets = ["CCLE"] -# source_datasets = ["CTRPv2"] -# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] -# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] -# target_datasets = ["CCLE", "gCSI", "GDSCv2"] -## Set 4 - same source and target -# source_datasets = ["CCLE"] -# target_datasets = ["CCLE"] -## Set 5 - single source and target -# source_datasets = ["GDSCv1"] -# target_datasets = ["CCLE"] - -only_cross_study = False -# only_cross_study = True - -## Splits -split_nums = [] # all splits -# split_nums = [0] -# split_nums = [4, 7] -# split_nums = [1, 4, 7] -# split_nums = [1, 3, 5, 7, 9] - -## Parameters of the experiment/run/workflow -# epochs = 2 -# epochs = 30 -# epochs = 50 -# epochs = 70 -# epochs = 100 -# epochs = 150 -epochs = 200 + +source_datasets = params["source_datasets"] +target_datasets = params["target_datasets"] +only_cross_study = params["only_cross_study"] +split_nums = params["split_nums"] +epochs = params["epochs"] +cuda_name = params["cuda_name"] # =============================================================== From 96a3e8f8bce909a9c3861bc2a3ac34d3efff1395 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 08:39:37 -0400 Subject: [PATCH 168/254] fixed ini --- csa_bruteforce_params.ini | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini index 6ba0249..9c01778 100644 --- a/csa_bruteforce_params.ini +++ b/csa_bruteforce_params.ini @@ -4,15 +4,15 @@ output_dir=./bruteforce_output y_col_name = auc source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] -split = ["0"] +split_nums = ["0"] model_name = PathDSP only_cross_study = False epochs = 800 ### Source and target data sources ## Set 1 - full analysis -source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] -target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +#source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +#target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] ## Set 2 - smaller datasets # source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] # target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] @@ -33,11 +33,9 @@ target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] # source_datasets = ["GDSCv1"] # target_datasets = ["CCLE"] -only_cross_study = False -# only_cross_study = True ## Splits -split_nums = [] # 
all splits +#split_nums = [] # all splits # split_nums = [0] # split_nums = [4, 7] # split_nums = [1, 4, 7] From 4930b042b7a0b0599c3fafbcaa01c7c7dcfe15cb Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 08:46:26 -0400 Subject: [PATCH 169/254] split nums --- csa_bruteforce_params_def.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csa_bruteforce_params_def.py b/csa_bruteforce_params_def.py index dba9d42..ea93875 100644 --- a/csa_bruteforce_params_def.py +++ b/csa_bruteforce_params_def.py @@ -21,7 +21,7 @@ "default": ["CCLE", "gCSI"], "help": "target_datasets for cross study analysis" }, - {"name": "split", + {"name": "split_nums", "nargs" : "+", "type": str, "default": ['0'], From 38d5f36f01fd2cd15e8eac228136adcd3d2e5ea8 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 09:28:27 -0400 Subject: [PATCH 170/254] remove build paths --- csa_bruteforce_wf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index bc285ba..bfcc87d 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -58,7 +58,7 @@ def display_timer(self, print_fn=print): additional_definitions=csa_bruteforce_params, required=None ) -params = frm.build_paths(params) # TODO move this to improvelib + # Model scripts model_name = params["model_name"] From 22038ff23e07a29daf216d8028c1b3707fbd228f Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 10:30:15 -0400 Subject: [PATCH 171/254] print statements --- csa_bruteforce_wf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index bfcc87d..3b927e3 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -58,14 +58,14 @@ def display_timer(self, print_fn=print): additional_definitions=csa_bruteforce_params, required=None ) - +print("Loaded params") # Model scripts model_name = params["model_name"] preprocess_python_script = f'{model_name}_preprocess_improve.py' train_python_script = f'{model_name}_train_improve.py' infer_python_script = f'{model_name}_infer_improve.py' - +print("Created script names") # Specify dirs y_col_name = params['y_col_name'] @@ -76,11 +76,11 @@ def display_timer(self, print_fn=print): MAIN_ML_DATA_DIR = MAIN_CSA_OUTDIR / 'ml_data' # output_dir_pp, input_dir_train, input_dir_infer MAIN_MODEL_DIR = MAIN_CSA_OUTDIR / 'models' # output_dir_train, input_dir_infer MAIN_INFER_DIR = MAIN_CSA_OUTDIR / 'infer' # output_dir infer - +print("Created directory names") # Note! Here input_dir is the location of benchmark data # TODO Should we set input_dir (and output_dir) for each models scrit? 
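# For reference (illustrative): the splits directory is expected to hold files
# named as in build_split_fname(), e.g. CCLE_split_0_train.txt,
# CCLE_split_0_val.txt and CCLE_split_0_test.txt, plus <DATASET>_all.txt files
# used when inferring on an entire target dataset.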
splits_dir = Path(params['input_dir']) / params['splits_dir'] - +print("Created splits path") source_datasets = params["source_datasets"] target_datasets = params["target_datasets"] @@ -88,6 +88,7 @@ def display_timer(self, print_fn=print): split_nums = params["split_nums"] epochs = params["epochs"] cuda_name = params["cuda_name"] +print("internal params") # =============================================================== From 6af6777b247539ee734a381d54c7b76b248a299d Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:00:38 -0400 Subject: [PATCH 172/254] bug --- csa_bruteforce_params.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini index 9c01778..a8a7a3c 100644 --- a/csa_bruteforce_params.ini +++ b/csa_bruteforce_params.ini @@ -1,6 +1,6 @@ [DEFAULT] input_dir = ./csa_data/raw_data -output_dir=./bruteforce_output +output_dir = ./bruteforce_output y_col_name = auc source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] From 391319b1bf7dc4d79a83242b4ec8d39be97af9ac Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:14:28 -0400 Subject: [PATCH 173/254] write output --- csa_bruteforce_params_def.py | 2 +- csa_bruteforce_wf.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/csa_bruteforce_params_def.py b/csa_bruteforce_params_def.py index ea93875..7e3413c 100644 --- a/csa_bruteforce_params_def.py +++ b/csa_bruteforce_params_def.py @@ -2,7 +2,7 @@ {"name": "cuda_name", "type": str, "default": "cuda:0", - "help": "Cuda devide name.", + "help": "Cuda device name.", }, {"name": "csa_outdir", "type": str, diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 3b927e3..1ce4d31 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -77,10 +77,15 @@ def display_timer(self, print_fn=print): MAIN_MODEL_DIR = MAIN_CSA_OUTDIR / 'models' # output_dir_train, input_dir_infer MAIN_INFER_DIR = MAIN_CSA_OUTDIR / 'infer' # output_dir infer print("Created directory names") +print("MAIN_CSA_OUTDIR: ", MAIN_CSA_OUTDIR) +print("MAIN_ML_DATA_DIR: ", MAIN_ML_DATA_DIR) +print("MAIN_MODEL_DIR: ", MAIN_MODEL_DIR) +print("MAIN_INFER_DIR: ", MAIN_INFER_DIR) # Note! Here input_dir is the location of benchmark data # TODO Should we set input_dir (and output_dir) for each models scrit? 
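# Rough sketch of the tree produced under csa_outdir (illustrative):
#   <csa_outdir>/
#       ml_data/<SOURCE>-<TARGET>/split_<N>/   preprocessed ML data
#       models/<SOURCE>/split_<N>/             trained models
#       infer/<SOURCE>-<TARGET>/split_<N>/     predictions and scores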
splits_dir = Path(params['input_dir']) / params['splits_dir'] print("Created splits path") +print("splits_dir: ", splits_dir) source_datasets = params["source_datasets"] target_datasets = params["target_datasets"] @@ -180,6 +185,13 @@ def display_timer(self, print_fn=print): result = subprocess.run(preprocess_run, capture_output=True, text=True, check=True) # print(result.stdout) + result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stdout.txt" + result_file_name_stderr = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stderr.txt" + + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + with open(result_file_name_stderr, 'w') as file: + file.write(result.stderr) # print(result.stderr) timer_preprocess.display_timer(print_fn) From 312bf52f0e0420ec938809b1263697e81c1b6c0a Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:32:04 -0400 Subject: [PATCH 174/254] print std --- csa_bruteforce_wf.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 1ce4d31..64e5945 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -184,7 +184,10 @@ def display_timer(self, print_fn=print): ] result = subprocess.run(preprocess_run, capture_output=True, text=True, check=True) - # print(result.stdout) + print("result_file_name_stdout", result_file_name_stdout) + print(result.stdout) + print("result_file_name_stderr", result_file_name_stderr) + print(result.stderr) result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stdout.txt" result_file_name_stderr = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stderr.txt" @@ -192,7 +195,7 @@ def display_timer(self, print_fn=print): file.write(result.stdout) with open(result_file_name_stderr, 'w') as file: file.write(result.stderr) - # print(result.stderr) + timer_preprocess.display_timer(print_fn) # p2 (p1): Train model From 336e38501d50e49cd7162071fd58632d55049c05 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Sep 2024 22:08:51 -0400 Subject: [PATCH 175/254] bug --- csa_bruteforce_wf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 64e5945..5e1eb14 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -184,13 +184,13 @@ def display_timer(self, print_fn=print): ] result = subprocess.run(preprocess_run, capture_output=True, text=True, check=True) + + result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stdout.txt" + result_file_name_stderr = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stderr.txt" print("result_file_name_stdout", result_file_name_stdout) print(result.stdout) print("result_file_name_stderr", result_file_name_stderr) print(result.stderr) - result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stdout.txt" - result_file_name_stderr = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stderr.txt" - with open(result_file_name_stdout, 'w') as file: file.write(result.stdout) with open(result_file_name_stderr, 'w') as file: From bd6c9f3efeaab327fb06983ec70f713485257bac Mon 
Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 17 Sep 2024 08:49:24 -0400 Subject: [PATCH 176/254] outdir --- csa_bruteforce_params.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini index a8a7a3c..0224f94 100644 --- a/csa_bruteforce_params.ini +++ b/csa_bruteforce_params.ini @@ -1,6 +1,6 @@ [DEFAULT] input_dir = ./csa_data/raw_data -output_dir = ./bruteforce_output +csa_outdir = ./bruteforce_output y_col_name = auc source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] From f55dbaff74d0a26cc7fb71fbd5eb6c2819868512 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 17 Sep 2024 10:44:15 -0400 Subject: [PATCH 177/254] saves logs --- csa_bruteforce_wf.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 5e1eb14..d09cbf8 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -23,6 +23,15 @@ def build_split_fname(source: str, split: int, phase: str): """ Build split file name. If file does not exist continue """ return f"{source_data_name}_split_{split}_{phase}.txt" +def save_captured_output(result, process, MAIN_CSA_OUTDIR, source_data_name, target_data_name, split): + result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-{process}-stdout.txt" + result_file_name_stderr = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-{process}-stderr.txt" + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + with open(result_file_name_stderr, 'w') as file: + file.write(result.stderr) + + class Timer: """ Measure time. 
""" @@ -160,14 +169,6 @@ def display_timer(self, print_fn=print): # p1 (none): Preprocess train data # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # placeholder for LC timer_preprocess = Timer() - # ml_data_path = graphdrp_preprocess_improve.main([ - # "--train_split_file", f"{source_data_name}_split_{split}_train.txt", - # "--val_split_file", f"{source_data_name}_split_{split}_val.txt", - # "--test_split_file", str(test_split_file_name), - # "--input_dir", str(input_dir), - # "--output_dir", str(output_dir), - # "--y_col_name", y_col_name - # ]) print_fn("\nPreprocessing") train_split_file = f"{source_data_name}_split_{split}_train.txt" val_split_file = f"{source_data_name}_split_{split}_val.txt" @@ -185,17 +186,7 @@ def display_timer(self, print_fn=print): result = subprocess.run(preprocess_run, capture_output=True, text=True, check=True) - result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stdout.txt" - result_file_name_stderr = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-preprocess-subprocess-stderr.txt" - print("result_file_name_stdout", result_file_name_stdout) - print(result.stdout) - print("result_file_name_stderr", result_file_name_stderr) - print(result.stderr) - with open(result_file_name_stdout, 'w') as file: - file.write(result.stdout) - with open(result_file_name_stderr, 'w') as file: - file.write(result.stderr) - + save_captured_output(result, "preprocess", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) timer_preprocess.display_timer(print_fn) # p2 (p1): Train model @@ -215,6 +206,7 @@ def display_timer(self, print_fn=print): ] result = subprocess.run(train_run, capture_output=True, text=True, check=True) + save_captured_output(result, "train", MAIN_CSA_OUTDIR, source_data_name, "none", split) timer_train.display_timer(print_fn) # Infer @@ -231,6 +223,7 @@ def display_timer(self, print_fn=print): ] result = subprocess.run(infer_run, capture_output=True, text=True, check=True) + save_captured_output(result, "infer", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) timer_infer.display_timer(print_fn) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 76b797e034d8dfeb64cb2fa96551aa9f8dc5b681 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 18 Sep 2024 12:23:59 -0400 Subject: [PATCH 178/254] all splits --- csa_bruteforce_params.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini index 0224f94..53d5495 100644 --- a/csa_bruteforce_params.ini +++ b/csa_bruteforce_params.ini @@ -4,7 +4,7 @@ csa_outdir = ./bruteforce_output y_col_name = auc source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] -split_nums = ["0"] +split_nums = [] model_name = PathDSP only_cross_study = False epochs = 800 From 320f6d6e0f723b242c24f1372291f647117292d5 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 18 Sep 2024 14:48:42 -0400 Subject: [PATCH 179/254] check false --- csa_bruteforce_wf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index d09cbf8..52cd24f 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -184,8 +184,8 @@ def display_timer(self, print_fn=print): "--y_col_name", str(y_col_name) ] result = subprocess.run(preprocess_run, 
capture_output=True, - text=True, check=True) - + text=True) #check=True + print(f"returncode = {result.returncode}") save_captured_output(result, "preprocess", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) timer_preprocess.display_timer(print_fn) @@ -205,7 +205,8 @@ def display_timer(self, print_fn=print): "--y_col_name", y_col_name ] result = subprocess.run(train_run, capture_output=True, - text=True, check=True) + text=True) + print(f"returncode = {result.returncode}") save_captured_output(result, "train", MAIN_CSA_OUTDIR, source_data_name, "none", split) timer_train.display_timer(print_fn) @@ -222,7 +223,8 @@ def display_timer(self, print_fn=print): "--calc_infer_scores", "true" ] result = subprocess.run(infer_run, capture_output=True, - text=True, check=True) + text=True) + print(f"returncode = {result.returncode}") save_captured_output(result, "infer", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) timer_infer.display_timer(print_fn) From 2930191e73853c32aa3ea957e60e71520e3387c6 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:40:19 -0400 Subject: [PATCH 180/254] reserve cuda testing --- csa_bruteforce_params_def.py | 9 ++++++++- csa_bruteforce_wf.py | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/csa_bruteforce_params_def.py b/csa_bruteforce_params_def.py index 7e3413c..98669ea 100644 --- a/csa_bruteforce_params_def.py +++ b/csa_bruteforce_params_def.py @@ -1,3 +1,5 @@ +from improvelib.utils import str2bool + csa_bruteforce_params = [ {"name": "cuda_name", "type": str, @@ -28,7 +30,7 @@ "help": "Split of the source datasets for CSA" }, {"name": "only_cross_study", - "type": bool, + "type": str2bool, "default": False, "help": "If only cross study analysis is needed" }, @@ -42,5 +44,10 @@ "default": 10, "help": "Number of epochs" }, + {"name": "reserved_system", + "type": str2bool, + "default": False, + "help": "If the system reserves GPUs. False for lambda and True for slurm systems." 
+ }, ] \ No newline at end of file diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 52cd24f..9da3961 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -8,6 +8,8 @@ from pathlib import Path import pandas as pd +import torch + # IMPROVE imports # from improvelib.initializer.config import Config @@ -68,6 +70,9 @@ def display_timer(self, print_fn=print): required=None ) print("Loaded params") +if not params["reserved_system"]: + torch.Tensor([0]).to(params["cuda_name"]) + print("Reserved GPU: ", params["cuda_name"]) # Model scripts model_name = params["model_name"] From 0aba60078e32e43e4b506cc57fb7d0eb52d9e543 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 20 Sep 2024 09:06:22 -0400 Subject: [PATCH 181/254] trying to make logs back compatible with python < 3.7 --- csa_bruteforce_wf.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 9da3961..088549a 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -26,12 +26,10 @@ def build_split_fname(source: str, split: int, phase: str): return f"{source_data_name}_split_{split}_{phase}.txt" def save_captured_output(result, process, MAIN_CSA_OUTDIR, source_data_name, target_data_name, split): - result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-{process}-stdout.txt" - result_file_name_stderr = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-{process}-stderr.txt" + result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-{process}-log.txt" with open(result_file_name_stdout, 'w') as file: file.write(result.stdout) - with open(result_file_name_stderr, 'w') as file: - file.write(result.stderr) + @@ -188,8 +186,7 @@ def display_timer(self, print_fn=print): "--output_dir", str(ml_data_dir), "--y_col_name", str(y_col_name) ] - result = subprocess.run(preprocess_run, capture_output=True, - text=True) #check=True + result = subprocess.run(preprocess_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) print(f"returncode = {result.returncode}") save_captured_output(result, "preprocess", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) timer_preprocess.display_timer(print_fn) @@ -212,7 +209,7 @@ def display_timer(self, print_fn=print): result = subprocess.run(train_run, capture_output=True, text=True) print(f"returncode = {result.returncode}") - save_captured_output(result, "train", MAIN_CSA_OUTDIR, source_data_name, "none", split) + #save_captured_output(result, "train", MAIN_CSA_OUTDIR, source_data_name, "none", split) timer_train.display_timer(print_fn) # Infer @@ -230,7 +227,7 @@ def display_timer(self, print_fn=print): result = subprocess.run(infer_run, capture_output=True, text=True) print(f"returncode = {result.returncode}") - save_captured_output(result, "infer", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) + #save_captured_output(result, "infer", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) timer_infer.display_timer(print_fn) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 0983abc94c212605960c91a3a1a8461f6ae6a2ef Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 20 Sep 2024 13:58:13 -0400 Subject: [PATCH 182/254] updated brute force workflow --- csa_bruteforce_params.ini | 1 + csa_bruteforce_params_def.py | 6 ++-- csa_bruteforce_wf.py | 70 +++++++++++++++++++++--------------- 3 files 
changed, 46 insertions(+), 31 deletions(-) diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini index 53d5495..d8644d1 100644 --- a/csa_bruteforce_params.ini +++ b/csa_bruteforce_params.ini @@ -8,6 +8,7 @@ split_nums = [] model_name = PathDSP only_cross_study = False epochs = 800 +uses_cuda_name = True ### Source and target data sources ## Set 1 - full analysis diff --git a/csa_bruteforce_params_def.py b/csa_bruteforce_params_def.py index 98669ea..6b5d7c9 100644 --- a/csa_bruteforce_params_def.py +++ b/csa_bruteforce_params_def.py @@ -44,10 +44,10 @@ "default": 10, "help": "Number of epochs" }, - {"name": "reserved_system", + {"name": "uses_cuda_name", "type": str2bool, - "default": False, - "help": "If the system reserves GPUs. False for lambda and True for slurm systems." + "default": True, + "help": "Change to false if the model doesn't have a cuda_name parameter." }, ] \ No newline at end of file diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py index 088549a..62f998f 100644 --- a/csa_bruteforce_wf.py +++ b/csa_bruteforce_wf.py @@ -8,7 +8,6 @@ from pathlib import Path import pandas as pd -import torch # IMPROVE imports @@ -25,8 +24,8 @@ def build_split_fname(source: str, split: int, phase: str): """ Build split file name. If file does not exist continue """ return f"{source_data_name}_split_{split}_{phase}.txt" -def save_captured_output(result, process, MAIN_CSA_OUTDIR, source_data_name, target_data_name, split): - result_file_name_stdout = MAIN_CSA_OUTDIR / f"{source_data_name}-{target_data_name}-{split}-{process}-log.txt" +def save_captured_output(result, process, MAIN_LOG_DIR, source_data_name, target_data_name, split): + result_file_name_stdout = MAIN_LOG_DIR / f"{source_data_name}-{target_data_name}-{split}-{process}-log.txt" with open(result_file_name_stdout, 'w') as file: file.write(result.stdout) @@ -68,9 +67,6 @@ def display_timer(self, print_fn=print): required=None ) print("Loaded params") -if not params["reserved_system"]: - torch.Tensor([0]).to(params["cuda_name"]) - print("Reserved GPU: ", params["cuda_name"]) # Model scripts model_name = params["model_name"] @@ -88,11 +84,14 @@ def display_timer(self, print_fn=print): MAIN_ML_DATA_DIR = MAIN_CSA_OUTDIR / 'ml_data' # output_dir_pp, input_dir_train, input_dir_infer MAIN_MODEL_DIR = MAIN_CSA_OUTDIR / 'models' # output_dir_train, input_dir_infer MAIN_INFER_DIR = MAIN_CSA_OUTDIR / 'infer' # output_dir infer +MAIN_LOG_DIR = MAIN_CSA_OUTDIR / 'logs' +frm.create_outdir(MAIN_LOG_DIR) print("Created directory names") print("MAIN_CSA_OUTDIR: ", MAIN_CSA_OUTDIR) print("MAIN_ML_DATA_DIR: ", MAIN_ML_DATA_DIR) print("MAIN_MODEL_DIR: ", MAIN_MODEL_DIR) print("MAIN_INFER_DIR: ", MAIN_INFER_DIR) +print("MAIN_LOG_DIR: ", MAIN_LOG_DIR) # Note! Here input_dir is the location of benchmark data # TODO Should we set input_dir (and output_dir) for each models scrit? 
splits_dir = Path(params['input_dir']) / params['splits_dir'] @@ -188,7 +187,7 @@ def display_timer(self, print_fn=print): ] result = subprocess.run(preprocess_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) print(f"returncode = {result.returncode}") - save_captured_output(result, "preprocess", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) + save_captured_output(result, "preprocess", MAIN_LOG_DIR, source_data_name, target_data_name, split) timer_preprocess.display_timer(print_fn) # p2 (p1): Train model @@ -199,35 +198,50 @@ def display_timer(self, print_fn=print): print_fn("\nTrain") print_fn(f"ml_data_dir: {ml_data_dir}") print_fn(f"model_dir: {model_dir}") - train_run = ["python", train_python_script, - "--input_dir", str(ml_data_dir), - "--output_dir", str(model_dir), - "--epochs", str(epochs), # DL-specific - "--cuda_name", cuda_name, # DL-specific - "--y_col_name", y_col_name - ] - result = subprocess.run(train_run, capture_output=True, - text=True) + if params["uses_cuda_name"]: + train_run = ["python", train_python_script, + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(epochs), # DL-specific + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name + ] + else: + train_run = ["python", train_python_script, + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(epochs), # DL-specific + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) print(f"returncode = {result.returncode}") - #save_captured_output(result, "train", MAIN_CSA_OUTDIR, source_data_name, "none", split) + save_captured_output(result, "train", MAIN_LOG_DIR, source_data_name, "none", split) timer_train.display_timer(print_fn) # Infer # p3 (p1, p2): Inference timer_infer = Timer() print_fn("\nInfer") - infer_run = ["python", infer_python_script, - "--input_data_dir", str(ml_data_dir), - "--input_model_dir", str(model_dir), - "--output_dir", str(infer_dir), - "--cuda_name", cuda_name, # DL-specific - "--y_col_name", y_col_name, - "--calc_infer_scores", "true" - ] - result = subprocess.run(infer_run, capture_output=True, - text=True) + if params["uses_cuda_name"]: + infer_run = ["python", infer_python_script, + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name, + "--calc_infer_scores", "true" + ] + else: + infer_run = ["python", infer_python_script, + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--y_col_name", y_col_name, + "--calc_infer_scores", "true" + ] + result = subprocess.run(infer_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) print(f"returncode = {result.returncode}") - #save_captured_output(result, "infer", MAIN_CSA_OUTDIR, source_data_name, target_data_name, split) + save_captured_output(result, "infer", MAIN_LOG_DIR, source_data_name, target_data_name, split) timer_infer.display_timer(print_fn) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From c4557e59cc9e41f04df266b433af43d8099439e8 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:10:20 -0400 Subject: [PATCH 183/254] Update PathDSP_train_improve.py --- PathDSP_train_improve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 1a1cf6c..739e06b 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -286,7 +286,8 @@ def init_weights(m): y_pred=val_pred, stage="val", y_col_name=params["y_col_name"], - output_dir=params["output_dir"] + output_dir=params["output_dir"], + input_dir=params["input_dir"] ) # ----------------------------- From 0f24f140b623a5d81127f8b24e4a522e855c9971 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:10:44 -0400 Subject: [PATCH 184/254] Update PathDSP_infer_improve.py --- PathDSP_infer_improve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index ad91d27..99cde39 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -51,7 +51,8 @@ def run(params): y_pred=test_pred, stage="test", y_col_name=params["y_col_name"], - output_dir=params["output_dir"] + output_dir=params["output_dir"], + input_dir=params["input_data_dir"] ) if params["calc_infer_scores"]: test_scores = frm.compute_performance_scores( From 6779091ecf451642d31d5eec40dd7e26ec69fe52 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:44:17 -0400 Subject: [PATCH 185/254] Update PathDSP_params.txt --- PathDSP_params.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 2f33790..a6177ca 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,9 +1,9 @@ [Preprocess] data_format = .txt input_supp_data_dir = ./author_data -train_split_file = gCSI_split_0_train.txt -val_split_file = gCSI_split_0_val.txt -test_split_file = gCSI_split_0_test.txt +train_split_file = CCLE_split_0_train.txt +val_split_file = CCLE_split_0_val.txt +test_split_file = CCLE_split_0_test.txt y_data_files = [["response.tsv"]] x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] x_data_drug_files = [["drug_SMILES.tsv"]] From 4d2dcf558f070dcb3f3bce7bc14da61b4c7f75ac Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 26 Sep 2024 14:58:40 -0400 Subject: [PATCH 186/254] save stage ydf --- PathDSP_preprocess_improve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 0e91051..4be820e 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -312,6 +312,7 @@ def prep_input(params): & (response_df["sample_id"].isin(common_sample_ids)), :, ] + frm.save_stage_ydf(ydf=response_df, stage=i, output_dir=params["output_dir"]) comb_data_mtx = pd.DataFrame( { "drug_id": response_df["drug_id"].values, From 8f4a3f24afe6fa2491e50c582a966ea64fe1a611 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:33:07 -0400 Subject: [PATCH 187/254] untransform --- PathDSP_infer_improve.py | 11 +++++++---- PathDSP_train_improve.py | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index 99cde39..a67b5f1 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -46,9 +46,12 @@ def run(params): start = datetime.now() test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + test_true_untrans = test_true.apply(lambda x: 10 ** 
(x) - 0.01) + test_pred_untrans = test_pred.apply(lambda x: 10 ** (x) - 0.01) + frm.store_predictions_df( - y_true=test_true, - y_pred=test_pred, + y_true=test_true_untrans, + y_pred=test_pred_untrans, stage="test", y_col_name=params["y_col_name"], output_dir=params["output_dir"], @@ -56,8 +59,8 @@ def run(params): ) if params["calc_infer_scores"]: test_scores = frm.compute_performance_scores( - y_true=test_true, - y_pred=test_pred, + y_true=test_true_untrans, + y_pred=test_pred_untrans, stage="test", metric_type=params["metric_type"], output_dir=params["output_dir"] diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 739e06b..62ac447 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -277,13 +277,16 @@ def init_weights(m): #val_true, val_pred = predicting(best_model, device, valid_dl) # (groud truth), (predictions) val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) + #comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + val_true_untrans = val_true.apply(lambda x: 10 ** (x) - 0.01) + val_pred_untrans = val_pred.apply(lambda x: 10 ** (x) - 0.01) # ----------------------------- # [Req] Save raw predictions in dataframe # ----------------------------- # import ipdb; ipdb.set_trace() frm.store_predictions_df( - y_true=val_true, - y_pred=val_pred, + y_true=val_true_untrans, + y_pred=val_pred_untrans, stage="val", y_col_name=params["y_col_name"], output_dir=params["output_dir"], @@ -295,8 +298,8 @@ def init_weights(m): # ----------------------------- # import ipdb; ipdb.set_trace() val_scores = frm.compute_performance_scores( - y_true=val_true, - y_pred=val_pred, + y_true=val_true_untrans, + y_pred=val_pred_untrans, stage="val", metric_type=params["metric_type"], output_dir=params["output_dir"] From e6bc9d6ddb20486f81d68eaef28fecf89d670b6a Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:38:59 -0400 Subject: [PATCH 188/254] setup improve --- setup_improve.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/setup_improve.sh b/setup_improve.sh index fd911a0..d566948 100644 --- a/setup_improve.sh +++ b/setup_improve.sh @@ -35,17 +35,19 @@ export IMPROVE_DATA_DIR="./$data_dir/" # Env var AUTHOR_DATA_DIR - PathDSP specific export AUTHOR_DATA_DIR="./$author_dir/" -# Clone IMPROVE lib (if needed) -pushd ../ +# Clone IMPROVE lib (if needed) and checkout the branch/tag +cd ../ improve_lib_path=$PWD/IMPROVE -improve_branch="develop" +# improve_branch="develop" +improve_branch="v0.1.0-2024-09-27" if [ -d $improve_lib_path ]; then - echo "IMPROVE repo exists in ${improve_lib_path}" + echo "IMPROVE repo exists in ${improve_lib_path}" else - # git clone https://github.com/JDACS4C-IMPROVE/IMPROVE - git clone -b $improve_branch https://github.com/JDACS4C-IMPROVE/IMPROVE + git clone https://github.com/JDACS4C-IMPROVE/IMPROVE fi -pushd $model_name +cd IMPROVE +git checkout -f $improve_branch +cd ../$model_name # Env var PYTHOPATH export PYTHONPATH=$PYTHONPATH:$improve_lib_path From 8e0268b1ae4f54c90edd01fcb091f79232ffa235 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:57:14 -0400 Subject: [PATCH 189/254] fix cuda name --- PathDSP_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index a6177ca..91f33f6 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -30,7 +30,7 @@ 
val_batch = 12 loss = mse patience = 30 y_col_name = auc -cuda_name = cuda:5 +cuda_name = cuda:0 dropout = 0.1 [Infer] From a58d8a53afe5251982b5597a579618b443be81e7 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:30:01 -0400 Subject: [PATCH 190/254] to pandas --- PathDSP_infer_improve.py | 2 ++ PathDSP_train_improve.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py index a67b5f1..97fc938 100755 --- a/PathDSP_infer_improve.py +++ b/PathDSP_infer_improve.py @@ -46,6 +46,8 @@ def run(params): start = datetime.now() test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + test_true = pd.Series(test_true) + test_pred = pd.Series(test_pred) test_true_untrans = test_true.apply(lambda x: 10 ** (x) - 0.01) test_pred_untrans = test_pred.apply(lambda x: 10 ** (x) - 0.01) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index 62ac447..ef64f97 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -278,6 +278,9 @@ def init_weights(m): val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) #comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + print("val_true", val_true) + val_true = pd.Series(val_true) + val_pred = pd.Series(val_pred) val_true_untrans = val_true.apply(lambda x: 10 ** (x) - 0.01) val_pred_untrans = val_pred.apply(lambda x: 10 ** (x) - 0.01) # ----------------------------- From 4cc8d186f5f45455f54e86af7de060aba2d1bc63 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:43:36 -0400 Subject: [PATCH 191/254] remove print --- PathDSP_train_improve.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py index ef64f97..b3eb9a6 100644 --- a/PathDSP_train_improve.py +++ b/PathDSP_train_improve.py @@ -278,7 +278,6 @@ def init_weights(m): val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) #comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) - print("val_true", val_true) val_true = pd.Series(val_true) val_pred = pd.Series(val_pred) val_true_untrans = val_true.apply(lambda x: 10 ** (x) - 0.01) From a9662bf6a65e143a5ef5070b73ce8e4b01cf6298 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:44:55 -0400 Subject: [PATCH 192/254] update readme --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a19376d..2ff089b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # PathDSP -This repository demonstrates how to use the [IMPROVE library v0.1.0-alpha](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). 
+This repository demonstrates how to use the [IMPROVE library v0.1.0-2024-09-27](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). -This version, tagged as `v0.1.0-alpha`, introduces a new API which is designed to encourage broader adoption of IMPROVE and its curated models by the research community. +This version, tagged as `v0.1.0-2024-09-27`, introduces a new API which is designed to encourage broader adoption of IMPROVE and its curated models by the research community. ## Dependencies @@ -14,7 +14,7 @@ ML framework: + [Torch](https://pytorch.org/) -- deep learning framework for building the prediction model IMPROVE dependencies: -+ [IMPROVE v0.1.0-alpha](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) ++ [IMPROVE v0.1.0-2024-09-27](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) @@ -70,7 +70,7 @@ csa_data/raw_data/ ``` git clone https://github.com/JDACS4C-IMPROVE/PathDSP cd PathDSP -git checkout develop +git checkout v0.1.0-2024-09-27 ``` @@ -89,7 +89,7 @@ source setup_improve.sh This will: 1. Download cross-study analysis (CSA) benchmark data into `./csa_data/`. -2. Clone IMPROVE repo (checkout tag `v0.0.3-beta`) outside the PathDSP model repo +2. Clone IMPROVE repo (checkout tag `v0.1.0-2024-09-27`) outside the PathDSP model repo 3. Set up env variables: `IMPROVE_DATA_DIR` (to `./csa_data/`) and `PYTHONPATH` (adds IMPROVE repo). 4. Download the model-specific supplemental data (aka author data) and set up the env variable `AUTHOR_DATA_DIR`. From 8b64f779ca5e33f40b502531dec09f78dcfeeeb6 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:54:34 -0400 Subject: [PATCH 193/254] parsl update --- csa_params.ini | 19 +++-- workflow_csa.py | 184 +++++++++++++++++++++++++++-------------- workflow_preprocess.py | 130 ++++++++++++++++++----------- 3 files changed, 216 insertions(+), 117 deletions(-) diff --git a/csa_params.ini b/csa_params.ini index 4e0d99e..f9261e3 100644 --- a/csa_params.ini +++ b/csa_params.ini @@ -1,19 +1,28 @@ [DEFAULT] input_dir = ./csa_data/raw_data -output_dir=./improve_output y_col_name = auc use_singularity = False hyperparameters_file = ./hyperparameters_default.json -source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] -target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] -split = ["0","1","2","3","4","5","6","7","8","9"] model_name = PathDSP only_cross_study = False epochs = 800 -available_accelerators=["0","1","2","3","4","5","6","7"] model_environment = PathDSP_env +# Full-scale CSA +# output_dir = ./parsl_csa_exp +# source_datasets = ["CCLE","CTRPv2","gCSI","GDSCv1","GDSCv2"] +# target_datasets = ["CCLE","CTRPv2","gCSI","GDSCv1","GDSCv2"] +# split = ["0","1","2","3","4","5","6","7","8","9"] +# available_accelerators = ["0","1","2","3","4","5","6","7"] + +# Exp 3 +output_dir = ./parsl_csa_exp3 +source_datasets = ["CCLE","GDSCv2","gCSI"] +target_datasets = ["CCLE","GDSCv2","gCSI"] +split = ["0","1"] +available_accelerators = ["4","5","6","7"] + [Preprocess] [Train] diff --git a/workflow_csa.py b/workflow_csa.py index edc9dd7..7063101 100644 --- a/workflow_csa.py +++ b/workflow_csa.py @@ -1,18 +1,16 @@ +import json +import logging +import sys +from pathlib import Path +from typing import Sequence, 
Tuple, Union + import parsl from parsl import python_app -import subprocess from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import LocalProvider -from time import time -from typing import Sequence, Tuple, Union -from pathlib import Path -import logging -import sys -import json import csa_params_def as CSA -import improvelib.utils as frm from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig # Initialize parameters for CSA @@ -37,14 +35,14 @@ address='127.0.0.1', label="htex", cpu_affinity="block", - #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? worker_debug=True, - available_accelerators=params['available_accelerators'], worker_port_range=worker_port_range, provider=LocalProvider( init_blocks=1, max_blocks=1, ), + available_accelerators=params['available_accelerators'], ) ], strategy='simple', @@ -62,75 +60,134 @@ ############################################################################## @python_app -def train(params, hp_model, source_data_name, split): +def train(params, hp_model, source_data_name, split): + """ parsl implementation of training stage using python_app. """ + import json import subprocess + import time + from pathlib import Path + hp = hp_model[source_data_name] - if hp.__len__()==0: + if hp.__len__() == 0: raise Exception(str('Hyperparameters are not defined for ' + source_data_name)) - model_dir = params['model_outdir'] / f"{source_data_name}" / f"split_{split}" - ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{params['target_datasets'][0]}"/ \ - f"split_{split}" + model_dir = params['model_dir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{params['target_datasets'][0]}"/ f"split_{split}" + if model_dir.exists() is False: print("\nTrain") print(f"ml_data_dir: {ml_data_dir}") print(f"model_dir: {model_dir}") + start = time.time() if params['use_singularity']: train_run = ["singularity", "exec", "--nv", - params['singularity_image'], "train.sh", - str("--input_dir " + str(ml_data_dir)), - str("--output_dir " + str(model_dir)), - str("--epochs " + str(params['epochs'])), - str("--y_col_name " + str(params['y_col_name'])), - str("--learning_rate " + str(hp['learning_rate'])), - str("--batch_size " + str(hp['batch_size'])) + params['singularity_image'], "train.sh", + str("--input_dir " + str(ml_data_dir)), + str("--output_dir " + str(model_dir)), + str("--epochs " + str(params['epochs'])), + str("--y_col_name " + str(params['y_col_name'])), + str("--learning_rate " + str(hp['learning_rate'])), + str("--batch_size " + str(hp['batch_size'])) ] - result = subprocess.run(train_run, capture_output=True, - text=True, check=True) else: - train_run = ["bash", "execute_in_conda.sh",params['model_environment'], - params['train_python_script'], - "--input_dir", str(ml_data_dir), - "--output_dir", str(model_dir), - "--epochs", str(params['epochs']), # DL-specific - "--y_col_name", str(params['y_col_name']), - "--learning_rate", str(hp['learning_rate']), - "--batch_size", str(hp['batch_size']) + train_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['train_python_script'], + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(params['epochs']), # DL-specific + "--y_col_name", str(params['y_col_name']), + 
"--learning_rate", str(hp['learning_rate']), + "--batch_size", str(hp['batch_size']) ] - result = subprocess.run(train_run, capture_output=True, - text=True, check=True) - return {'source_data_name':source_data_name, 'split':split} + + result = subprocess.run(train_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = model_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = model_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return {'source_data_name': source_data_name, 'split': split} @python_app -def infer(params, source_data_name, target_data_name, split): # +def infer(params, source_data_name, target_data_name, split): + """ parsl implementation of inferece stage using python_app. """ import subprocess - model_dir = params['model_outdir'] / f"{source_data_name}" / f"split_{split}" - ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/ \ - f"split_{split}" - infer_dir = params['infer_dir']/f"{source_data_name}-{target_data_name}"/f"split_{split}" + import json + import time + from pathlib import Path + + model_dir = params['model_dir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + infer_dir = params['infer_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + + print("\nInfer") + start = time.time() if params['use_singularity']: infer_run = ["singularity", "exec", "--nv", - params['singularity_image'], "infer.sh", - str("--input_data_dir " + str(ml_data_dir)), - str("--input_model_dir " + str(model_dir)), - str("--output_dir " + str(infer_dir)), - str("--calc_infer_scores "+ "true"), - str("--y_col_name " + str(params['y_col_name'])) + params['singularity_image'], "infer.sh", + str("--input_data_dir " + str(ml_data_dir)), + str("--input_model_dir " + str(model_dir)), + str("--output_dir " + str(infer_dir)), + str("--calc_infer_scores "+ "true"), + str("--y_col_name " + str(params['y_col_name'])) ] - result = subprocess.run(infer_run, capture_output=True, - text=True, check=True) else: - print("\nInfer") - infer_run = ["bash", "execute_in_conda.sh",params['model_environment'], - params['infer_python_script'], - "--input_data_dir", str(ml_data_dir), - "--input_model_dir", str(model_dir), - "--output_dir", str(infer_dir), - "--calc_infer_scores", "true", - "--y_col_name", str(params['y_col_name']) - ] - result = subprocess.run(infer_run, capture_output=True, - text=True, check=True) + infer_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['infer_python_script'], + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--calc_infer_scores", "true", + "--y_col_name", str(params['y_col_name']) + ] + + result = subprocess.run(infer_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = infer_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') 
as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = infer_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + return True ############################### @@ -141,11 +198,10 @@ def infer(params, source_data_name, target_data_name, split): # fdir = Path(__file__).resolve().parent y_col_name = params['y_col_name'] logger = logging.getLogger(f"{params['model_name']}") -params = frm.build_paths(params) # paths to raw data #Output directories for preprocess, train and infer params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' -params['model_outdir'] = Path(params['output_dir']) / 'models' +params['model_dir'] = Path(params['output_dir']) / 'models' params['infer_dir'] = Path(params['output_dir']) / 'infer' #Model scripts @@ -162,16 +218,18 @@ def infer(params, source_data_name, target_data_name, split): # ########################################################################## ##Train execution with Parsl -train_futures=[] +train_futures = [] for source_data_name in params['source_datasets']: for split in params['split']: train_futures.append(train(params, hp_model, source_data_name, split)) ##Infer execution with Parsl -infer_futures =[] +infer_futures = [] for future_t in train_futures: for target_data_name in params['target_datasets']: infer_futures.append(infer(params, future_t.result()['source_data_name'], target_data_name, future_t.result()['split'])) for future_i in infer_futures: print(future_i.result()) + +parsl.dfk().cleanup() \ No newline at end of file diff --git a/workflow_preprocess.py b/workflow_preprocess.py index 1a5f6cb..d0e14bd 100644 --- a/workflow_preprocess.py +++ b/workflow_preprocess.py @@ -1,18 +1,16 @@ +import json +import logging +import sys +from pathlib import Path +from typing import Sequence, Tuple, Union + import parsl from parsl import python_app -import subprocess from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import LocalProvider -from time import time -from typing import Sequence, Tuple, Union -from pathlib import Path -import logging -import sys -import json import csa_params_def as CSA -import improvelib.utils as frm from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig # Initialize parameters for CSA @@ -37,7 +35,7 @@ address='127.0.0.1', label="htex_preprocess", cpu_affinity="alternating", - #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? worker_debug=True, worker_port_range=worker_port_range, provider=LocalProvider( @@ -61,20 +59,27 @@ ############################################################################## @python_app -def preprocess(inputs=[]): # - import warnings +def preprocess(inputs=[]): + """ parsl implementation of preprocessing stage using python_app. """ + import json import subprocess - import improvelib.utils as frm + import time + import warnings + from pathlib import Path + def build_split_fname(source_data_name, split, phase): """ Build split file name. 
If file does not exist continue """ if split=='all': return f"{source_data_name}_{split}.txt" return f"{source_data_name}_split_{split}_{phase}.txt" - params=inputs[0] - source_data_name=inputs[1] - split=inputs[2] - split_nums=params['split'] + # python_app inputs + params = inputs[0] + source_data_name = inputs[1] + split = inputs[2] + + split_nums = params['split'] + # Get the split file paths if len(split_nums) == 0: # Get all splits @@ -92,26 +97,28 @@ def build_split_fname(source_data_name, split, phase): for phase in ["train", "val", "test"]: fname = build_split_fname(source_data_name, split, phase) if fname not in "\t".join(files_joined): - warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + warnings.warn(f"\nThe {phase} split file {fname} is missing \ + (continue to next split)") continue for target_data_name in params['target_datasets']: - ml_data_dir = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/ \ - f"split_{split}" - if ml_data_dir.exists() is True: - continue + if params['only_cross_study'] and (source_data_name == target_data_name): continue # only cross-study print(f"\nSource data: {source_data_name}") print(f"Target data: {target_data_name}") - params['ml_data_outdir'] = params['ml_data_dir']/f"{source_data_name}-{target_data_name}"/f"split_{split}" - frm.create_outdir(outdir=params["ml_data_outdir"]) + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + if ml_data_dir.exists() is True: + continue + if source_data_name == target_data_name: # If source and target are the same, then infer on the test split test_split_file = f"{source_data_name}_split_{split}_test.txt" else: - # If source and target are different, then infer on the entire target dataset + # If source and target are different, then infer on the entire + # target dataset test_split_file = f"{target_data_name}_all.txt" # Preprocess data @@ -121,32 +128,54 @@ def build_split_fname(source_data_name, split, phase): print(f"train_split_file: {train_split_file}") print(f"val_split_file: {val_split_file}") print(f"test_split_file: {test_split_file}") - print(f"ml_data_outdir: {params['ml_data_outdir']}") + start = time.time() if params['use_singularity']: preprocess_run = ["singularity", "exec", "--nv", - params['singularity_image'], "preprocess.sh", - str("--train_split_file " + str(train_split_file)), - str("--val_split_file " + str(val_split_file)), - str("--test_split_file " + str(test_split_file)), - str("--input_dir " + params['input_dir']), - str("--output_dir " + str(ml_data_dir)), - str("--y_col_name " + str(params['y_col_name'])) + params['singularity_image'], "preprocess.sh", + str("--train_split_file " + str(train_split_file)), + str("--val_split_file " + str(val_split_file)), + str("--test_split_file " + str(test_split_file)), + str("--input_dir " + params['input_dir']), + str("--output_dir " + str(ml_data_dir)), + str("--y_col_name " + str(params['y_col_name'])) ] - result = subprocess.run(preprocess_run, capture_output=True, - text=True, check=True) else: - preprocess_run = ["bash", "execute_in_conda.sh",params['model_environment'], - params['preprocess_python_script'], - "--train_split_file", str(train_split_file), - "--val_split_file", str(val_split_file), - "--test_split_file", str(test_split_file), - "--input_dir", params['input_dir'], - "--output_dir", str(ml_data_dir), - "--y_col_name", str(params['y_col_name']) + preprocess_run = ["bash", "execute_in_conda.sh", + 
params['model_environment'], + params['preprocess_python_script'], + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], + "--output_dir", str(ml_data_dir), + "--y_col_name", str(params['y_col_name']) ] - result = subprocess.run(preprocess_run, capture_output=True, - text=True, check=True) - return {'source_data_name':source_data_name, 'split':split} + + result = subprocess.run(preprocess_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = ml_data_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = ml_data_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return {'source_data_name': source_data_name, 'split': split} ############################### @@ -157,7 +186,6 @@ def build_split_fname(source_data_name, split, phase): fdir = Path(__file__).resolve().parent y_col_name = params['y_col_name'] logger = logging.getLogger(f"{params['model_name']}") -params = frm.build_paths(params) # paths to raw data #Output directories for preprocess, train and infer params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' @@ -170,10 +198,14 @@ def build_split_fname(source_data_name, split, phase): ########################################################################## ##Preprocess execution with Parsl -preprocess_futures=[] +preprocess_futures = [] for source_data_name in params['source_datasets']: for split in params['split']: - preprocess_futures.append(preprocess(inputs=[params, source_data_name, split])) + preprocess_futures.append( + preprocess(inputs=[params, source_data_name, split]) + ) for future_p in preprocess_futures: - print(future_p.result()) \ No newline at end of file + print(future_p.result()) + +parsl.dfk().cleanup() \ No newline at end of file From e5c9fb73e2b718dabf0ddd11fbefbcc63718508e Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 16 Oct 2024 08:55:06 -0400 Subject: [PATCH 194/254] fixing save ydf --- PathDSP_preprocess_improve.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 4be820e..633cb8d 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -312,7 +312,7 @@ def prep_input(params): & (response_df["sample_id"].isin(common_sample_ids)), :, ] - frm.save_stage_ydf(ydf=response_df, stage=i, output_dir=params["output_dir"]) + comb_data_mtx = pd.DataFrame( { "drug_id": response_df["drug_id"].values, @@ -329,6 +329,11 @@ def prep_input(params): ## add 0.01 to avoid possible inf values comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() + comb_data_mtx_to_save = pd.concat([comb_data_mtx["drug_id"], comb_data_mtx["sample_id"]], axis=1) + auc_to_save = pd.Series(comb_data_mtx["response"]) + auc_to_save = auc_to_save.apply(lambda x: 10 ** (x) - 0.01) + comb_data_mtx_to_save[params["y_col_name"]] = auc_to_save + 
frm.save_stage_ydf(ydf=comb_data_mtx_to_save, stage=i, output_dir=params["output_dir"]) pl.from_pandas(comb_data_mtx).write_csv( params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], stage=i) , separator="\t", has_header=True From db4da00034acaa2d19213070fb18f1afbe8fd28e Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:57:44 -0400 Subject: [PATCH 195/254] fixing save ydf --- PathDSP_preprocess_improve.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 633cb8d..369b905 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -329,7 +329,13 @@ def prep_input(params): ## add 0.01 to avoid possible inf values comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() - comb_data_mtx_to_save = pd.concat([comb_data_mtx["drug_id"], comb_data_mtx["sample_id"]], axis=1) + comb_data_mtx_to_save = pd.DataFrame( + { + "drug_id": comb_data_mtx.index.get_level_values("drug_id"), + "sample_id": comb_data_mtx.index.get_level_values("sample_id") + } + ) + auc_to_save = pd.Series(comb_data_mtx["response"]) auc_to_save = auc_to_save.apply(lambda x: 10 ** (x) - 0.01) comb_data_mtx_to_save[params["y_col_name"]] = auc_to_save From 19236ad0e1bf2d1b1f9d6093a38ecd38c8e3b90a Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 16 Oct 2024 14:25:05 -0400 Subject: [PATCH 196/254] bug --- PathDSP_preprocess_improve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 369b905..6e1198d 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -337,7 +337,8 @@ def prep_input(params): ) auc_to_save = pd.Series(comb_data_mtx["response"]) - auc_to_save = auc_to_save.apply(lambda x: 10 ** (x) - 0.01) + print(auc_to_save) + auc_to_save = pd.Series(auc_to_save.apply(lambda x: 10 ** (x) - 0.01)) comb_data_mtx_to_save[params["y_col_name"]] = auc_to_save frm.save_stage_ydf(ydf=comb_data_mtx_to_save, stage=i, output_dir=params["output_dir"]) pl.from_pandas(comb_data_mtx).write_csv( From 6ff0346b7684c53e249bd9fed8b77e0c791979c8 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:06:09 -0400 Subject: [PATCH 197/254] bug --- PathDSP_preprocess_improve.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 6e1198d..54aff53 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -3,6 +3,7 @@ import polars as pl import numpy as np import pandas as pd +import copy from functools import reduce from pathlib import Path from rdkit import Chem @@ -329,17 +330,11 @@ def prep_input(params): ## add 0.01 to avoid possible inf values comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() - comb_data_mtx_to_save = pd.DataFrame( - { - "drug_id": comb_data_mtx.index.get_level_values("drug_id"), - "sample_id": comb_data_mtx.index.get_level_values("sample_id") - } - ) - - auc_to_save = pd.Series(comb_data_mtx["response"]) - print(auc_to_save) - auc_to_save = pd.Series(auc_to_save.apply(lambda x: 10 ** (x) - 0.01)) - comb_data_mtx_to_save[params["y_col_name"]] = auc_to_save + 
+ comb_data_mtx_to_save = copy.deepcopy(comb_data_mtx) + comb_data_mtx_to_save = comb_data_mtx_to_save.reset_index() + print(comb_data_mtx_to_save) + comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) frm.save_stage_ydf(ydf=comb_data_mtx_to_save, stage=i, output_dir=params["output_dir"]) pl.from_pandas(comb_data_mtx).write_csv( params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], stage=i) From 8287965f64ff231212f09de6a249518933ec647f Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:33:05 -0400 Subject: [PATCH 198/254] updated install instructions --- README_deephyper_alpha.md | 67 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 README_deephyper_alpha.md diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md new file mode 100644 index 0000000..fac1c3d --- /dev/null +++ b/README_deephyper_alpha.md @@ -0,0 +1,67 @@ +# Run HPO using deephyper on Polaris + +## Install conda environment for the curated model (PathDSP) +``` +## install PathDSP +git clone https://github.com/JDACS4C-IMPROVE/PathDSP +cd PathDSP +git checkout develop + +## install IMPROVE and download data +source setup_improve.sh + +## define where to install PathDSP env +export PathDSP_env=./PathDSP_env/ +conda env create -f PathDSP_env_conda.yml -p $PathDSP_env + +## set up environment variables +cd .. +cd +improve_lib="$PWD/IMPROVE/" +echo "export PYTHONPATH=$PYTHONPATH:${improve_lib}" >> IMPROVE_env +echo "export PathDSP_env=$PathDSP_env" >> IMPROVE_env +source $PWD/IMPROVE_env +``` + + + +## Perform preprocessing +Run the preprocess script. This script taks around 40 mins to complete. + +``` +### if necessary, request an interactive node from polaris to testing purposes +### qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug +### NEED to cd into your working directory again once the job started +``` + +``` +cd PathDSP +conda activate $PathDSP_env +python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data +``` + +## Perform HPO using singularity container across two nodes +This will presumably have to be redone for alpha. 
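The sequence of "fixing save ydf" and "bug" patches above settles on one approach for the ground-truth dataframes written at preprocessing time: copy the combined matrix, move drug_id/sample_id out of the index, and map the log10-transformed response back to the AUC scale before handing it to frm.save_stage_ydf. A standalone sketch of that logic with a toy dataframe (column and index names mirror the patch; the values are invented):

```
import copy
import numpy as np
import pandas as pd

# Toy stand-in for comb_data_mtx: "response" holds log10(AUC + 0.01).
comb_data_mtx = pd.DataFrame({
    "drug_id": ["D1", "D2"],
    "sample_id": ["S1", "S2"],
    "response": np.log10(np.array([0.90, 0.40]) + 0.01),
}).set_index(["drug_id", "sample_id"])

# Work on a copy so the matrix used for training stays log-transformed.
ydf = copy.deepcopy(comb_data_mtx)
ydf = ydf.reset_index()  # drug_id / sample_id become columns again
ydf["auc"] = ydf["response"].apply(lambda x: 10 ** x - 0.01)  # back to AUC scale

# The patch then passes this to the IMPROVE helper, roughly:
# frm.save_stage_ydf(ydf=ydf, stage=stage, output_dir=params["output_dir"])
print(ydf[["drug_id", "sample_id", "auc"]])
```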
+ +``` +## copy processed to IMPROVE_DATA_DIR +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_DATA_DIR +## specify singularity image file for PathDSP +echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env +cd PathDSP +## submit to debug queue +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity_debug.sh +## to submit to debug-scaling or prod queue +## use hpo_scale_singularity_debug_scaling.sh +## or hpo_scale_singularity_prod.sh +## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py +``` + +## Alternatively, perform HPO across two nodes based on conda + +``` +cd PathDSP +# supply environment variables to qsub +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh +## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py +``` \ No newline at end of file From 75b6dd8cd374a9ca2bfcd3ea19180de09493bd7e Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 12 Nov 2024 09:41:23 -0500 Subject: [PATCH 199/254] deephyper updates --- hpo_deephyper_params_def.py | 56 +++++++++ hpo_deephyper_subprocess.py | 183 ++++++++++++++++++++++++++++++ hpo_deephyper_subprocess_train.sh | 62 ++++++++++ 3 files changed, 301 insertions(+) create mode 100644 hpo_deephyper_params_def.py create mode 100644 hpo_deephyper_subprocess.py create mode 100644 hpo_deephyper_subprocess_train.sh diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py new file mode 100644 index 0000000..e1eab8c --- /dev/null +++ b/hpo_deephyper_params_def.py @@ -0,0 +1,56 @@ +additional_definitions = [ + {"name": "source", + "type": str, + "default": "GDSCv1", + "help": "source dataset for HPO" + }, + {"name": "split", + "type": str, + "default": "4", + "help": "Split of the source datasets for HPO" + }, + {"name": "model_name", + "type": str, + "default": 'PathDSP', + "help": "Name of the deep learning model" + }, + + + + {"name": "model_scripts_dir", + "type": str, + "default": './', + "help": "Path to the model repository" + }, + {"name": "model_environment", + "type": str, + "default": '', + "help": "Name of your model conda environment" + }, + {"name": "hyperparameters_file", + "type": str, + "default": 'hyperparameters_default.json', + "help": "json file containing optimized hyperparameters per dataset" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + {"name": "available_accelerators", + "nargs" : "+", + "type": str, + "default": ["0", "1"], + "help": "GPU IDs to assign jobs" + }, + {"name": "use_singularity", + "type": bool, + "default": True, + "help": "Do you want to use singularity image for running the model?" + }, + {"name": "singularity_image", + "type": str, + "default": '', + "help": "Singularity image file of the model" + } + ] \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py new file mode 100644 index 0000000..ca26aff --- /dev/null +++ b/hpo_deephyper_subprocess.py @@ -0,0 +1,183 @@ +""" +Before running this script, first need to preprocess the data. 
+This can be done by running preprocess_example.sh + +It is assumed that the csa benchmark data is downloaded via download_csa.sh +and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: +export IMPROVE_DATA_DIR="./csa_data/" +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +It also assumes that your processed training data is at: "ml_data/{source}-{source}/split_{split}" +validation data is at: "ml_data/{source}-{source}/split_{split}" +model output files will be saved at "dh_hpo_improve/{source}/split_{split}" + +mpirun -np 10 python hpo_subprocess.py +""" +# import copy +import json +import subprocess +import pandas as pd +import os +import logging +import mpi4py +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +from mpi4py import MPI +import socket +import hpo_deephyper_params_def +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# Start time +start_full_wf = time.time() + +# Initialize parameters for DeepHyper HPO +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="hpo_deephyper_params.ini", + additional_definitions=hpo_deephyper_params_def.additional_definitions +) + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() +local_rank = os.environ["PMI_LOCAL_RANK"] + +# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh +# uncomment the below commands if running via interactive node +#num_gpus_per_node = 4 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +#cuda_name = "cuda:" + str(rank % num_gpus_per_node) + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=64) +problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), + "learning_rate", default_value=0.001) +# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + +# --------------------- +# Some IMPROVE settings +# --------------------- +# source = "GDSCv1" +# split = 4 +# added model name +# ini output_dir = dh_hpo_improve +ml_data_dir = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" +model_outdir = f"{params['output_dir']}/{params['source']}/split_{params['split']}" +log_dir = f"{params['output_dir']}_logs/" +# subprocess_bashscript = "subprocess_train.sh" + + +@profile +def run(job, optuna_trial=None): + + # config = copy.deepcopy(job.parameters) + # params = { + # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, + # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, + # "verbose": False, + # } + # if len(config) > 0: + # remap_hyperparameters(config) + # params.update(config) + + model_outdir_job_id = model_outdir + 
f"/{job.id}" + learning_rate = job.parameters["learning_rate"] + batch_size = job.parameters["batch_size"] + # val_scores = main_train_grapdrp([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir_job_id), + # ]) + subprocess_res = subprocess.run( + [ + "bash", + "subprocess_train.sh", + str(params['model_path']), + str(params['model_name']), + str(ml_data_dir), + str(model_outdir_job_id), + str(learning_rate), + str(batch_size), + str(params['epochs']), + #str(cuda_name) + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ], + capture_output=True, text=True, check=True + ) + + # print(subprocess_res.stdout) + # print(subprocess_res.stderr) + + # Load val_scores and get val_loss + # f = open(model_outdir + "/val_scores.json") + f = open(model_outdir_job_id + "/val_scores.json") + val_scores = json.load(f) + objective = -val_scores["val_loss"] + # print("objective:", objective) + + # Checkpoint the model weights + with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + + search = CBO( + problem, + evaluator, + log_dir=log_dir, + verbose=1, + ) + + # max_evals = 2 + # max_evals = 4 + # max_evals = 10 + # max_evals = 20 + max_evals = 10 + # max_evals = 100 + results = search.search(max_evals=max_evals) + results = results.sort_values("m:val_loss", ascending=True) + results.to_csv(model_outdir + "/hpo_results.csv", index=False) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("Finished deephyper HPO.") diff --git a/hpo_deephyper_subprocess_train.sh b/hpo_deephyper_subprocess_train.sh new file mode 100644 index 0000000..738ccc4 --- /dev/null +++ b/hpo_deephyper_subprocess_train.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +CONDA_ENV=$PathDSP_env +#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +#eval "$(conda shell.bash hook)" +echo "Activated conda commands in shell script" +#conda activate $CONDA_ENV +#source activate $CONDA_ENV +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $CONDA_ENV +#source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV +#source activate $CONDA_ENV +echo "Activated conda env $CONDA_ENV" +#model path, model name, epochs +MODELPATH=$1 +MODELNAME=$2 +input_dir=$3 +output_dir=$4 +learning_rate=$5 +batch_size=$6 +epochs=$7 +#cuda_name=$6 +CUDA_VISIBLE_DEVICES=$8 + +#echo "train_ml_data_dir: $train_ml_data_dir" +#echo "val_ml_data_dir: $val_ml_data_dir" +echo "input_dir: $input_dir" 
+echo "output_dir: $output_dir" +echo "learning_rate: $learning_rate" +echo "batch_size: $batch_size" +#echo "cuda_name: $cuda_name" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" + +# epochs=10 +#epochs=10 +# epochs=50 + +# All train outputs are saved in params["model_outdir"] +#CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=5 +#CUDA_VISIBLE_DEVICES=6,7 +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python $MODELPATH/$MODELNAME_train_improve.py \ + --input_dir $input_dir \ + --output_dir $output_dir \ + --epochs $epochs \ + --learning_rate $learning_rate \ + --batch_size $batch_size +# --cuda_name $cuda_name + +#conda deactivate +source $conda_path/bin/deactivate +echo "Deactivated conda env $CONDA_ENV" From 46584a1d2517e03a89af69cd9041ef864a222383 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 11:53:36 -0500 Subject: [PATCH 200/254] train.sh --- README_deephyper_alpha.md | 5 +++-- hpo_deephyper_subprocess_train.sh | 18 +++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md index fac1c3d..29d617b 100644 --- a/README_deephyper_alpha.md +++ b/README_deephyper_alpha.md @@ -27,6 +27,7 @@ source $PWD/IMPROVE_env ## Perform preprocessing Run the preprocess script. This script taks around 40 mins to complete. +The workflow assumes that your preprocessed data is at: "ml_data/{source}-{source}/split_{split}" ``` ### if necessary, request an interactive node from polaris to testing purposes @@ -37,7 +38,7 @@ Run the preprocess script. This script taks around 40 mins to complete. ``` cd PathDSP conda activate $PathDSP_env -python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data +python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir ./ml_data/CCLE-CCLE/split_0 ``` ## Perform HPO using singularity container across two nodes @@ -63,5 +64,5 @@ qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity_debug.sh cd PathDSP # supply environment variables to qsub qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh -## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py +## for interactive node, you can run: mpirun -np 10 python hpo_deephyper_subprocess.py ``` \ No newline at end of file diff --git a/hpo_deephyper_subprocess_train.sh b/hpo_deephyper_subprocess_train.sh index 738ccc4..acc83e4 100644 --- a/hpo_deephyper_subprocess_train.sh +++ b/hpo_deephyper_subprocess_train.sh @@ -22,15 +22,15 @@ source $conda_path/bin/activate $CONDA_ENV #source activate $CONDA_ENV echo "Activated conda env $CONDA_ENV" #model path, model name, epochs -MODELPATH=$1 -MODELNAME=$2 -input_dir=$3 -output_dir=$4 -learning_rate=$5 -batch_size=$6 -epochs=$7 +SCRIPT=$1 +input_dir=$2 +output_dir=$3 +learning_rate=$4 +batch_size=$5 +epochs=$6 #cuda_name=$6 -CUDA_VISIBLE_DEVICES=$8 +CUDA_VISIBLE_DEVICES=$7 + #echo "train_ml_data_dir: $train_ml_data_dir" #echo "val_ml_data_dir: $val_ml_data_dir" @@ -49,7 +49,7 @@ echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" #CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ #CUDA_VISIBLE_DEVICES=5 #CUDA_VISIBLE_DEVICES=6,7 -CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python $MODELPATH/$MODELNAME_train_improve.py \ +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python $SCRIPT \ --input_dir $input_dir \ --output_dir $output_dir \ --epochs $epochs \ From d8d477d06fa260a0ccc38bdfdfdd8464ce9ccfa3 Mon Sep 17 00:00:00 2001 From: nkoussa 
<156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:04:49 -0500 Subject: [PATCH 201/254] updates --- README_deephyper_alpha.md | 2 ++ hpo_deephyper_params.ini | 20 ++++++++++++++++++++ hpo_deephyper_params_def.py | 3 --- hpo_deephyper_subprocess.py | 7 ++++--- hpo_deephyper_subprocess_train.sh | 16 ++++++++-------- 5 files changed, 34 insertions(+), 14 deletions(-) create mode 100644 hpo_deephyper_params.ini diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md index 29d617b..69dd038 100644 --- a/README_deephyper_alpha.md +++ b/README_deephyper_alpha.md @@ -9,6 +9,8 @@ git checkout develop ## install IMPROVE and download data source setup_improve.sh +or +export PYTHONPATH=../IMPROVE ## define where to install PathDSP env export PathDSP_env=./PathDSP_env/ diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini new file mode 100644 index 0000000..ab9f252 --- /dev/null +++ b/hpo_deephyper_params.ini @@ -0,0 +1,20 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +y_col_name = auc +model_name = PathDSP +model_scripts_dir = ./ +model_environment = ./PathDSP_env/ +epochs = 3 +output_dir = ./test +source_datasets = "gCSI" +target_datasets = "gCSI" +split = 4 + + + + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py index e1eab8c..5d4b998 100644 --- a/hpo_deephyper_params_def.py +++ b/hpo_deephyper_params_def.py @@ -14,9 +14,6 @@ "default": 'PathDSP', "help": "Name of the deep learning model" }, - - - {"name": "model_scripts_dir", "type": str, "default": './', diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index ca26aff..27650e4 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -97,7 +97,7 @@ model_outdir = f"{params['output_dir']}/{params['source']}/split_{params['split']}" log_dir = f"{params['output_dir']}_logs/" # subprocess_bashscript = "subprocess_train.sh" - +script_name = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") @profile def run(job, optuna_trial=None): @@ -120,12 +120,13 @@ def run(job, optuna_trial=None): # "--val_ml_data_dir", str(val_ml_data_dir), # "--model_outdir", str(model_outdir_job_id), # ]) + print("launch run") subprocess_res = subprocess.run( [ "bash", "subprocess_train.sh", - str(params['model_path']), - str(params['model_name']), + str(params['model_environment']), + str(script_name), str(ml_data_dir), str(model_outdir_job_id), str(learning_rate), diff --git a/hpo_deephyper_subprocess_train.sh b/hpo_deephyper_subprocess_train.sh index acc83e4..cc1dd56 100644 --- a/hpo_deephyper_subprocess_train.sh +++ b/hpo_deephyper_subprocess_train.sh @@ -10,7 +10,7 @@ # https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists # https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script # This doesn't work w/o eval "$(conda shell.bash hook)" -CONDA_ENV=$PathDSP_env +CONDA_ENV=$1 #echo "Allow conda commands in shell script by running 'conda shell.bash hook'" #eval "$(conda shell.bash hook)" echo "Activated conda commands in shell script" @@ -22,14 +22,14 @@ source $conda_path/bin/activate $CONDA_ENV #source activate $CONDA_ENV echo "Activated conda env $CONDA_ENV" #model path, model name, epochs -SCRIPT=$1 -input_dir=$2 -output_dir=$3 -learning_rate=$4 -batch_size=$5 -epochs=$6 +SCRIPT=$2 +input_dir=$3 +output_dir=$4 +learning_rate=$5 +batch_size=$6 +epochs=$7 #cuda_name=$6 -CUDA_VISIBLE_DEVICES=$7 
+CUDA_VISIBLE_DEVICES=$8 #echo "train_ml_data_dir: $train_ml_data_dir" From 9f86340791685bc0c76c90bffbf0d95b9eae3ea7 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:25:05 -0500 Subject: [PATCH 202/254] import time --- hpo_deephyper_subprocess.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 27650e4..c91c706 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -18,6 +18,7 @@ import subprocess import pandas as pd import os +import time import logging import mpi4py from deephyper.evaluator import Evaluator, profile From 58a4ead201877bc7abc21b9ac4021019ee9e92ab Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:25:48 -0500 Subject: [PATCH 203/254] import Path --- hpo_deephyper_subprocess.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index c91c706..57c2166 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -19,6 +19,7 @@ import pandas as pd import os import time +from pathlib import Path import logging import mpi4py from deephyper.evaluator import Evaluator, profile From 5af2c275b438ec23ac17ec5c8b1229fee1339ac1 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:32:55 -0500 Subject: [PATCH 204/254] rank --- hpo_deephyper_subprocess.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 57c2166..f45e8a6 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -58,13 +58,13 @@ comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() -local_rank = os.environ["PMI_LOCAL_RANK"] +#NCK local_rank = os.environ["PMI_LOCAL_RANK"] # CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh # uncomment the below commands if running via interactive node -#num_gpus_per_node = 4 -#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) -#cuda_name = "cuda:" + str(rank % num_gpus_per_node) +num_gpus_per_node = 2 +os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +cuda_name = "cuda:" + str(rank % num_gpus_per_node) # --------------------- # Enable logging @@ -182,5 +182,5 @@ def run(job, optuna_trial=None): results = search.search(max_evals=max_evals) results = results.sort_values("m:val_loss", ascending=True) results.to_csv(model_outdir + "/hpo_results.csv", index=False) - print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + #print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") From e1ce53511d89a211980f0c632ea052b6486de8d7 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:51:17 -0500 Subject: [PATCH 205/254] prints --- hpo_deephyper_subprocess.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index f45e8a6..28df624 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -57,7 +57,9 @@ comm = MPI.COMM_WORLD rank = comm.Get_rank() +print(rank) size = comm.Get_size() +print(size) #NCK 
local_rank = os.environ["PMI_LOCAL_RANK"] # CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh From 81f0219e85e896d5dce6b5386d2bb5a21eb4e10c Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:28:22 -0500 Subject: [PATCH 206/254] prints --- README_deephyper_alpha.md | 21 ++++++++++++++++++++- hpo_deephyper_subprocess.py | 8 ++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md index 69dd038..af267af 100644 --- a/README_deephyper_alpha.md +++ b/README_deephyper_alpha.md @@ -67,4 +67,23 @@ cd PathDSP # supply environment variables to qsub qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh ## for interactive node, you can run: mpirun -np 10 python hpo_deephyper_subprocess.py -``` \ No newline at end of file +``` + + +Lambda instructions +Setup repo + +Install DeepHyper env + +module load openmpi +conda create -n dh python=3.9 -y +conda activate dh +conda install gxx_linux-64 gcc_linu +pip install "deephyper[default]" +pip install mpi4py + +Set python path +export PYTHONPATH=../IMPROVE + +Run +mpirun -np 10 python hpo_deephyper_subprocess.py \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 28df624..1433ae1 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -124,6 +124,14 @@ def run(job, optuna_trial=None): # "--val_ml_data_dir", str(val_ml_data_dir), # "--model_outdir", str(model_outdir_job_id), # ]) + print("model env:", params['model_environment']) + print("script_name:", script_name) + print("ml_data_dir:", ml_data_dir) + print("model_outdir_job_id:", model_outdir_job_id) + print("learning_rate:", learning_rate) + print("batch_size:", batch_size) + print("params['epochs']:", params['epochs']) + print("CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"]) print("launch run") subprocess_res = subprocess.run( [ From 11f6c7f930b1db7d0a86a6832629253de4f8bfe5 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:45:31 -0500 Subject: [PATCH 207/254] moved config --- hpo_deephyper_params.ini | 6 ++--- hpo_deephyper_subprocess.py | 53 +++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index ab9f252..241d6d6 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -6,9 +6,9 @@ model_scripts_dir = ./ model_environment = ./PathDSP_env/ epochs = 3 output_dir = ./test -source_datasets = "gCSI" -target_datasets = "gCSI" -split = 4 +source_datasets = "CCLE" +target_datasets = "CCLE" +split = 0 diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 1433ae1..d7162e9 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -31,17 +31,7 @@ import hpo_deephyper_params_def from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig -# Start time -start_full_wf = time.time() - -# Initialize parameters for DeepHyper HPO -filepath = Path(__file__).resolve().parent -cfg = DRPPreprocessConfig() -params = cfg.initialize_parameters( - pathToModelDir=filepath, - default_config="hpo_deephyper_params.ini", - additional_definitions=hpo_deephyper_params_def.additional_definitions -) + # --------------------- # Enable using multiple GPUs @@ -89,19 +79,22 @@ "learning_rate", default_value=0.001) # problem.add_hyperparameter((0, 0.5), "dropout", 
default_value=0.0) # problem.add_hyperparameter([True, False], "early_stopping", default_value=False) +def prepare_parameters(): + # Initialize parameters for DeepHyper HPO + filepath = Path(__file__).resolve().parent + cfg = DRPPreprocessConfig() + params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="hpo_deephyper_params.ini", + additional_definitions=hpo_deephyper_params_def.additional_definitions + ) -# --------------------- -# Some IMPROVE settings -# --------------------- -# source = "GDSCv1" -# split = 4 -# added model name -# ini output_dir = dh_hpo_improve -ml_data_dir = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" -model_outdir = f"{params['output_dir']}/{params['source']}/split_{params['split']}" -log_dir = f"{params['output_dir']}_logs/" -# subprocess_bashscript = "subprocess_train.sh" -script_name = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") + params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" + params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" + params['log_dir'] = f"{params['output_dir']}_logs/" + # subprocess_bashscript = "subprocess_train.sh" + params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") + return params @profile def run(job, optuna_trial=None): @@ -116,7 +109,7 @@ def run(job, optuna_trial=None): # remap_hyperparameters(config) # params.update(config) - model_outdir_job_id = model_outdir + f"/{job.id}" + model_outdir_job_id = params['model_outdir'] + f"/{job.id}" learning_rate = job.parameters["learning_rate"] batch_size = job.parameters["batch_size"] # val_scores = main_train_grapdrp([ @@ -125,8 +118,8 @@ def run(job, optuna_trial=None): # "--model_outdir", str(model_outdir_job_id), # ]) print("model env:", params['model_environment']) - print("script_name:", script_name) - print("ml_data_dir:", ml_data_dir) + print("script_name:", params['script_name']) + print("ml_data_dir:", params['ml_data_dir']) print("model_outdir_job_id:", model_outdir_job_id) print("learning_rate:", learning_rate) print("batch_size:", batch_size) @@ -138,8 +131,8 @@ def run(job, optuna_trial=None): "bash", "subprocess_train.sh", str(params['model_environment']), - str(script_name), - str(ml_data_dir), + str(params['script_name']), + str(params['ml_data_dir']), str(model_outdir_job_id), str(learning_rate), str(batch_size), @@ -169,6 +162,10 @@ def run(job, optuna_trial=None): if __name__ == "__main__": + # Start time + start_full_wf = time.time() + global params + params = prepare_parameters() with Evaluator.create( run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} ) as evaluator: From 44709ddae36848053c3741aed0ef98f50870be92 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:17:46 -0500 Subject: [PATCH 208/254] fix log dir --- hpo_deephyper_subprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index d7162e9..affa8b5 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -176,7 +176,7 @@ def run(job, optuna_trial=None): search = CBO( problem, evaluator, - log_dir=log_dir, + log_dir=params['log_dir'], verbose=1, ) From 4d45ff6c9c9f971498f2d590e48eb62ee1a8da20 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 
09:31:28 -0500 Subject: [PATCH 209/254] add logger --- README_deephyper_alpha.md | 5 ++++- hpo_deephyper_subprocess.py | 8 +++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md index af267af..1c6c8d9 100644 --- a/README_deephyper_alpha.md +++ b/README_deephyper_alpha.md @@ -82,8 +82,11 @@ conda install gxx_linux-64 gcc_linu pip install "deephyper[default]" pip install mpi4py -Set python path +Each time: +Set python path (in repo) export PYTHONPATH=../IMPROVE +module load openmpi +conda activate dh Run mpirun -np 10 python hpo_deephyper_subprocess.py \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index affa8b5..b9f3f7b 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -142,7 +142,13 @@ def run(job, optuna_trial=None): ], capture_output=True, text=True, check=True ) - + # Logger + print(f"returncode = {subprocess_res.returncode}") + result_file_name_stdout = model_outdir_job_id / 'logs.txt' + if model_outdir_job_id.exists() is False: # If subprocess fails, model_dir may not be created and we need to write the log files in model_dir + os.makedirs(model_outdir_job_id, exist_ok=True) + with open(result_file_name_stdout, 'w') as file: + file.write(subprocess_res.stdout) # print(subprocess_res.stdout) # print(subprocess_res.stderr) From 53a61c33f275c1030dfc4930a97bcbb6c4793d58 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:51:11 -0500 Subject: [PATCH 210/254] bug --- hpo_deephyper_subprocess.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index b9f3f7b..b3d25bd 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -94,6 +94,7 @@ def prepare_parameters(): params['log_dir'] = f"{params['output_dir']}_logs/" # subprocess_bashscript = "subprocess_train.sh" params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") + print(params) return params @profile From 043a9db57df8438c0b20c1949f68e94d6cdf7a59 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:54:12 -0500 Subject: [PATCH 211/254] bug --- hpo_deephyper_subprocess.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index b3d25bd..a2b4cf1 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -94,7 +94,9 @@ def prepare_parameters(): params['log_dir'] = f"{params['output_dir']}_logs/" # subprocess_bashscript = "subprocess_train.sh" params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") + print("NATASHA LOOK HERE") print(params) + print("NATASHA DONE LOOK HERE") return params @profile From d454fe1090a318a9ef3b82008c2e927eee243260 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:57:10 -0500 Subject: [PATCH 212/254] bug fix --- hpo_deephyper_params.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index 241d6d6..7dd70bc 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -6,8 +6,7 @@ model_scripts_dir = ./ model_environment = ./PathDSP_env/ epochs = 3 output_dir = ./test -source_datasets = "CCLE" -target_datasets = "CCLE" +source = "CCLE" split = 0 From 
efe73fa93ee1054beb40a8e728d99e06a0fb9941 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:16:53 -0500 Subject: [PATCH 213/254] log --- hpo_deephyper_subprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index a2b4cf1..3c9b0ce 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -147,7 +147,7 @@ def run(job, optuna_trial=None): ) # Logger print(f"returncode = {subprocess_res.returncode}") - result_file_name_stdout = model_outdir_job_id / 'logs.txt' + result_file_name_stdout = Path(model_outdir_job_id) / 'logs.txt' if model_outdir_job_id.exists() is False: # If subprocess fails, model_dir may not be created and we need to write the log files in model_dir os.makedirs(model_outdir_job_id, exist_ok=True) with open(result_file_name_stdout, 'w') as file: From dfb31a63a4e88c150a65c07a4c23a41f02e5bcea Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:18:30 -0500 Subject: [PATCH 214/254] path --- hpo_deephyper_params.ini | 2 +- hpo_deephyper_subprocess.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index 7dd70bc..afbf618 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -6,7 +6,7 @@ model_scripts_dir = ./ model_environment = ./PathDSP_env/ epochs = 3 output_dir = ./test -source = "CCLE" +source = CCLE split = 0 diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 3c9b0ce..8cc1acf 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -112,7 +112,7 @@ def run(job, optuna_trial=None): # remap_hyperparameters(config) # params.update(config) - model_outdir_job_id = params['model_outdir'] + f"/{job.id}" + model_outdir_job_id = Path(params['model_outdir'] + f"/{job.id}") learning_rate = job.parameters["learning_rate"] batch_size = job.parameters["batch_size"] # val_scores = main_train_grapdrp([ @@ -147,7 +147,7 @@ def run(job, optuna_trial=None): ) # Logger print(f"returncode = {subprocess_res.returncode}") - result_file_name_stdout = Path(model_outdir_job_id) / 'logs.txt' + result_file_name_stdout = model_outdir_job_id / 'logs.txt' if model_outdir_job_id.exists() is False: # If subprocess fails, model_dir may not be created and we need to write the log files in model_dir os.makedirs(model_outdir_job_id, exist_ok=True) with open(result_file_name_stdout, 'w') as file: From 3d44236336168c6478217d1ffb8abca997c83ed4 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:20:27 -0500 Subject: [PATCH 215/254] path --- hpo_deephyper_subprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 8cc1acf..7d94795 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -157,7 +157,7 @@ def run(job, optuna_trial=None): # Load val_scores and get val_loss # f = open(model_outdir + "/val_scores.json") - f = open(model_outdir_job_id + "/val_scores.json") + f = open(model_outdir_job_id / "val_scores.json") val_scores = json.load(f) objective = -val_scores["val_loss"] # print("objective:", objective) From a30c6c8a4b1f36d4b4ec4386337731d3d0f2ac32 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:26:26 -0500 Subject: [PATCH 216/254] logs 
--- hpo_deephyper_subprocess.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 7d94795..83b5636 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -143,7 +143,9 @@ def run(job, optuna_trial=None): #str(cuda_name) str(os.environ["CUDA_VISIBLE_DEVICES"]) ], - capture_output=True, text=True, check=True + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True ) # Logger print(f"returncode = {subprocess_res.returncode}") From 9b00698ce4676390209961e82cb7065a8cfd9ae4 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:31:46 -0500 Subject: [PATCH 217/254] env issues --- hpo_deephyper_subprocess.py | 1 - hpo_deephyper_subprocess_train.sh | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 83b5636..d6fff87 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -140,7 +140,6 @@ def run(job, optuna_trial=None): str(learning_rate), str(batch_size), str(params['epochs']), - #str(cuda_name) str(os.environ["CUDA_VISIBLE_DEVICES"]) ], stdout=subprocess.PIPE, diff --git a/hpo_deephyper_subprocess_train.sh b/hpo_deephyper_subprocess_train.sh index cc1dd56..1b28f04 100644 --- a/hpo_deephyper_subprocess_train.sh +++ b/hpo_deephyper_subprocess_train.sh @@ -34,16 +34,15 @@ CUDA_VISIBLE_DEVICES=$8 #echo "train_ml_data_dir: $train_ml_data_dir" #echo "val_ml_data_dir: $val_ml_data_dir" +echo "CONDA_ENV: $CONDA_ENV" +echo "SCRIPT: $SCRIPT" echo "input_dir: $input_dir" echo "output_dir: $output_dir" echo "learning_rate: $learning_rate" echo "batch_size: $batch_size" -#echo "cuda_name: $cuda_name" +echo "epochs: $epochs" echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -# epochs=10 -#epochs=10 -# epochs=50 # All train outputs are saved in params["model_outdir"] #CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ From 07d8d4b6acec623d418ed4e6ce71a765d6e7ef53 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:33:30 -0500 Subject: [PATCH 218/254] correct sh script --- hpo_deephyper_subprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index d6fff87..dfae182 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -132,7 +132,7 @@ def run(job, optuna_trial=None): subprocess_res = subprocess.run( [ "bash", - "subprocess_train.sh", + "hpo_deephyper_subprocess_train.sh", str(params['model_environment']), str(params['script_name']), str(params['ml_data_dir']), From 7e6a9d5b26b3acfaee7f816ced562461b9a79fbc Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:38:49 -0500 Subject: [PATCH 219/254] added param for type of val loss since it's no longer recorded as val_loss --- hpo_deephyper_params_def.py | 5 +++++ hpo_deephyper_subprocess.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py index 5d4b998..003e62f 100644 --- a/hpo_deephyper_params_def.py +++ b/hpo_deephyper_params_def.py @@ -49,5 +49,10 @@ "type": str, "default": '', "help": "Singularity image file of the model" + }, + {"name": "val_loss", + "type": str, + "default": 'mse', + "help": "Type of loss for validation" } ] \ No newline at end of file diff --git 
a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index dfae182..e012366 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -160,7 +160,7 @@ def run(job, optuna_trial=None): # f = open(model_outdir + "/val_scores.json") f = open(model_outdir_job_id / "val_scores.json") val_scores = json.load(f) - objective = -val_scores["val_loss"] + objective = -val_scores[params['val_loss']] # print("objective:", objective) # Checkpoint the model weights From 1a1dea2a5d9e2138c7dc9eafb68acc6479257ced Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:43:13 -0500 Subject: [PATCH 220/254] fix log dir --- hpo_deephyper_subprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index e012366..33a660c 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -164,7 +164,7 @@ def run(job, optuna_trial=None): # print("objective:", objective) # Checkpoint the model weights - with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + with open(f"{params['log_dir']}/model_{job.id}.pkl", "w") as f: f.write("model weights") # return score From 494946f17da1e7b3bbf02a5c1531010f59b31182 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:46:45 -0500 Subject: [PATCH 221/254] bug --- hpo_deephyper_subprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 33a660c..d8a8ad1 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -197,7 +197,7 @@ def run(job, optuna_trial=None): max_evals = 10 # max_evals = 100 results = search.search(max_evals=max_evals) - results = results.sort_values("m:val_loss", ascending=True) + #results = results.sort_values("m:val_loss", ascending=True) results.to_csv(model_outdir + "/hpo_results.csv", index=False) #print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") From 8cb0969b0a780bf203baa3415333af038512c9b1 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 11:20:19 -0500 Subject: [PATCH 222/254] results --- hpo_deephyper_subprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index d8a8ad1..3adf112 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -198,6 +198,6 @@ def run(job, optuna_trial=None): # max_evals = 100 results = search.search(max_evals=max_evals) #results = results.sort_values("m:val_loss", ascending=True) - results.to_csv(model_outdir + "/hpo_results.csv", index=False) + results.to_csv("hpo_results.csv", index=False) #print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") From 0c7f5d990eb72afa815f4eae4227364d8ce9c285 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Fri, 15 Nov 2024 11:22:27 -0500 Subject: [PATCH 223/254] move params --- hpo_deephyper_subprocess.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py 
index 3adf112..a66c90e 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -79,25 +79,7 @@ "learning_rate", default_value=0.001) # problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) # problem.add_hyperparameter([True, False], "early_stopping", default_value=False) -def prepare_parameters(): - # Initialize parameters for DeepHyper HPO - filepath = Path(__file__).resolve().parent - cfg = DRPPreprocessConfig() - params = cfg.initialize_parameters( - pathToModelDir=filepath, - default_config="hpo_deephyper_params.ini", - additional_definitions=hpo_deephyper_params_def.additional_definitions - ) - params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" - params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" - params['log_dir'] = f"{params['output_dir']}_logs/" - # subprocess_bashscript = "subprocess_train.sh" - params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") - print("NATASHA LOOK HERE") - print(params) - print("NATASHA DONE LOOK HERE") - return params @profile def run(job, optuna_trial=None): @@ -175,7 +157,24 @@ def run(job, optuna_trial=None): # Start time start_full_wf = time.time() global params - params = prepare_parameters() + # Initialize parameters for DeepHyper HPO + filepath = Path(__file__).resolve().parent + cfg = DRPPreprocessConfig() + params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="hpo_deephyper_params.ini", + additional_definitions=hpo_deephyper_params_def.additional_definitions + ) + + params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" + params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" + params['log_dir'] = f"{params['output_dir']}_logs/" + # subprocess_bashscript = "subprocess_train.sh" + params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") + print("NATASHA LOOK HERE") + print(params) + print("NATASHA DONE LOOK HERE") + with Evaluator.create( run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} ) as evaluator: From e6efff9137a8590628274bec21f9ec25dba18cc0 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Sun, 17 Nov 2024 21:29:51 -0500 Subject: [PATCH 224/254] add back sort to results --- README_deephyper_alpha.md | 3 ++- hpo_deephyper_subprocess.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md index 1c6c8d9..a4a77d6 100644 --- a/README_deephyper_alpha.md +++ b/README_deephyper_alpha.md @@ -84,9 +84,10 @@ pip install mpi4py Each time: Set python path (in repo) -export PYTHONPATH=../IMPROVE + module load openmpi conda activate dh +export PYTHONPATH=../IMPROVE Run mpirun -np 10 python hpo_deephyper_subprocess.py \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index a66c90e..afd0bfa 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -196,7 +196,7 @@ def run(job, optuna_trial=None): max_evals = 10 # max_evals = 100 results = search.search(max_evals=max_evals) - #results = results.sort_values("m:val_loss", ascending=True) + results = results.sort_values(f"m:{params['val_loss']}", ascending=True) results.to_csv("hpo_results.csv", index=False) #print("current node: ", socket.gethostname(), "; current rank: ", rank, "; 
local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") From 63701161f5f04c87b9fca126bb17255da34eb6ac Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:15:24 -0500 Subject: [PATCH 225/254] got rid of log_dir, write all to output folder --- hpo_deephyper_subprocess.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index afd0bfa..0fe29cc 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -146,7 +146,7 @@ def run(job, optuna_trial=None): # print("objective:", objective) # Checkpoint the model weights - with open(f"{params['log_dir']}/model_{job.id}.pkl", "w") as f: + with open(f"{params['output_dir']}/model_{job.id}.pkl", "w") as f: f.write("model weights") # return score @@ -168,7 +168,7 @@ def run(job, optuna_trial=None): params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" - params['log_dir'] = f"{params['output_dir']}_logs/" + #params['log_dir'] = f"{params['output_dir']}_logs/" # subprocess_bashscript = "subprocess_train.sh" params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") print("NATASHA LOOK HERE") @@ -185,7 +185,7 @@ def run(job, optuna_trial=None): search = CBO( problem, evaluator, - log_dir=params['log_dir'], + log_dir=params['output_dir'], verbose=1, ) @@ -197,6 +197,6 @@ def run(job, optuna_trial=None): # max_evals = 100 results = search.search(max_evals=max_evals) results = results.sort_values(f"m:{params['val_loss']}", ascending=True) - results.to_csv("hpo_results.csv", index=False) + results.to_csv(f"{params['output_dir']}/hpo_results.csv", index=False) #print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") From 1e092dc8f4770a54a37d37b5cedd9c5933d570e9 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:55:29 -0500 Subject: [PATCH 226/254] clean up, add max_evals as param --- hpo_deephyper_params.ini | 1 + hpo_deephyper_params_def.py | 5 ++++ hpo_deephyper_subprocess.py | 53 ++++--------------------------------- 3 files changed, 11 insertions(+), 48 deletions(-) diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index afbf618..45fa81d 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -8,6 +8,7 @@ epochs = 3 output_dir = ./test source = CCLE split = 0 +max_evals = 5 diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py index 003e62f..e5662e1 100644 --- a/hpo_deephyper_params_def.py +++ b/hpo_deephyper_params_def.py @@ -54,5 +54,10 @@ "type": str, "default": 'mse', "help": "Type of loss for validation" + }, + {"name": "max_evals", + "type": int, + "default": 10, + "help": "Number of evaluations" } ] \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 0fe29cc..9766e74 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -3,8 +3,7 @@ This can be done by running preprocess_example.sh It is assumed that the csa benchmark data is downloaded via download_csa.sh -and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are 
set: -export IMPROVE_DATA_DIR="./csa_data/" +and the env vars $PYTHONPATH is set: export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib It also assumes that your processed training data is at: "ml_data/{source}-{source}/split_{split}" @@ -83,34 +82,11 @@ @profile def run(job, optuna_trial=None): - - # config = copy.deepcopy(job.parameters) - # params = { - # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, - # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, - # "verbose": False, - # } - # if len(config) > 0: - # remap_hyperparameters(config) - # params.update(config) - model_outdir_job_id = Path(params['model_outdir'] + f"/{job.id}") learning_rate = job.parameters["learning_rate"] batch_size = job.parameters["batch_size"] - # val_scores = main_train_grapdrp([ - # "--train_ml_data_dir", str(train_ml_data_dir), - # "--val_ml_data_dir", str(val_ml_data_dir), - # "--model_outdir", str(model_outdir_job_id), - # ]) - print("model env:", params['model_environment']) - print("script_name:", params['script_name']) - print("ml_data_dir:", params['ml_data_dir']) - print("model_outdir_job_id:", model_outdir_job_id) - print("learning_rate:", learning_rate) - print("batch_size:", batch_size) - print("params['epochs']:", params['epochs']) - print("CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"]) - print("launch run") + + print(f"Launching run: batch_size={batch_size}, learning_rate={learning_rate}") subprocess_res = subprocess.run( [ "bash", @@ -135,15 +111,11 @@ def run(job, optuna_trial=None): os.makedirs(model_outdir_job_id, exist_ok=True) with open(result_file_name_stdout, 'w') as file: file.write(subprocess_res.stdout) - # print(subprocess_res.stdout) - # print(subprocess_res.stderr) # Load val_scores and get val_loss - # f = open(model_outdir + "/val_scores.json") f = open(model_outdir_job_id / "val_scores.json") val_scores = json.load(f) objective = -val_scores[params['val_loss']] - # print("objective:", objective) # Checkpoint the model weights with open(f"{params['output_dir']}/model_{job.id}.pkl", "w") as f: @@ -154,12 +126,10 @@ def run(job, optuna_trial=None): if __name__ == "__main__": - # Start time - start_full_wf = time.time() - global params # Initialize parameters for DeepHyper HPO filepath = Path(__file__).resolve().parent cfg = DRPPreprocessConfig() + global params params = cfg.initialize_parameters( pathToModelDir=filepath, default_config="hpo_deephyper_params.ini", @@ -168,12 +138,7 @@ def run(job, optuna_trial=None): params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" - #params['log_dir'] = f"{params['output_dir']}_logs/" - # subprocess_bashscript = "subprocess_train.sh" params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") - print("NATASHA LOOK HERE") - print(params) - print("NATASHA DONE LOOK HERE") with Evaluator.create( run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} @@ -181,21 +146,13 @@ def run(job, optuna_trial=None): if evaluator is not None: print(problem) - search = CBO( problem, evaluator, log_dir=params['output_dir'], verbose=1, ) - - # max_evals = 2 - # max_evals = 4 - # max_evals = 10 - # max_evals = 20 - max_evals = 10 - # max_evals = 100 - results = search.search(max_evals=max_evals) + results = search.search(max_evals=params['max_evals']) results = results.sort_values(f"m:{params['val_loss']}", ascending=True) 
results.to_csv(f"{params['output_dir']}/hpo_results.csv", index=False) #print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) From e81f7602a602e1dad224288861e04fd99eedb4f6 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:06:48 -0500 Subject: [PATCH 227/254] bug --- hpo_deephyper_params_def.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py index e5662e1..52d484a 100644 --- a/hpo_deephyper_params_def.py +++ b/hpo_deephyper_params_def.py @@ -57,7 +57,7 @@ }, {"name": "max_evals", "type": int, - "default": 10, + "default": 20, "help": "Number of evaluations" } ] \ No newline at end of file From 3f999dfa0379b37c52de305a5f1ecb4fd2f163d0 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:08:03 -0500 Subject: [PATCH 228/254] creates output dir --- hpo_deephyper_subprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 9766e74..d51812b 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -135,7 +135,8 @@ def run(job, optuna_trial=None): default_config="hpo_deephyper_params.ini", additional_definitions=hpo_deephyper_params_def.additional_definitions ) - + if params['output_dir'].exists() is False: + os.makedirs(params['output_dir'], exist_ok=True) params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") From b60188bb44e13bcba921f5bce1875b99f05d62af Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:12:50 -0500 Subject: [PATCH 229/254] path for output dir --- hpo_deephyper_subprocess.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index d51812b..e14fc38 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -135,8 +135,9 @@ def run(job, optuna_trial=None): default_config="hpo_deephyper_params.ini", additional_definitions=hpo_deephyper_params_def.additional_definitions ) - if params['output_dir'].exists() is False: - os.makedirs(params['output_dir'], exist_ok=True) + output_dir = Path(params['output_dir']) + if output_dir.exists() is False: + os.makedirs(output_dir, exist_ok=True) params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") From f1d87b3bb17d8c7857df32e7bb9fcc21e5cd54ce Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:56:34 -0500 Subject: [PATCH 230/254] bugs --- hpo_deephyper_subprocess.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index e14fc38..442fbb5 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -46,10 +46,8 @@ comm = MPI.COMM_WORLD rank = 
comm.Get_rank() -print(rank) size = comm.Get_size() -print(size) -#NCK local_rank = os.environ["PMI_LOCAL_RANK"] +local_rank = os.environ["PMI_LOCAL_RANK"] # CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh # uncomment the below commands if running via interactive node @@ -157,5 +155,6 @@ def run(job, optuna_trial=None): results = search.search(max_evals=params['max_evals']) results = results.sort_values(f"m:{params['val_loss']}", ascending=True) results.to_csv(f"{params['output_dir']}/hpo_results.csv", index=False) - #print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") + print(params['max_evals']) From 13c941661706d2fd90458663b8f78b49d5a3a998 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:58:02 -0500 Subject: [PATCH 231/254] bug --- hpo_deephyper_subprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 442fbb5..b420322 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -47,7 +47,7 @@ comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() -local_rank = os.environ["PMI_LOCAL_RANK"] +#local_rank = os.environ["PMI_LOCAL_RANK"] # CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh # uncomment the below commands if running via interactive node @@ -155,6 +155,6 @@ def run(job, optuna_trial=None): results = search.search(max_evals=params['max_evals']) results = results.sort_values(f"m:{params['val_loss']}", ascending=True) results.to_csv(f"{params['output_dir']}/hpo_results.csv", index=False) - print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") print(params['max_evals']) From a34b6ef643bb777da6b70f8b7e25420a15d13aa5 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:12:43 -0500 Subject: [PATCH 232/254] interactive session in params --- hpo_deephyper_params.ini | 2 +- hpo_deephyper_params_def.py | 5 +++++ hpo_deephyper_subprocess.py | 14 +++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index 45fa81d..f31a7c7 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -9,7 +9,7 @@ output_dir = ./test source = CCLE split = 0 max_evals = 5 - +interactive_session = True diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py index 52d484a..8aaef2a 100644 --- a/hpo_deephyper_params_def.py +++ b/hpo_deephyper_params_def.py @@ -59,5 +59,10 @@ "type": int, "default": 20, "help": "Number of evaluations" + }, + {"name": "interactive_session", + "type": bool, + "default": True, + "help": "Are you using an interactive session?" 
} ] \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index b420322..28d6e2d 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -47,13 +47,14 @@ comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() -#local_rank = os.environ["PMI_LOCAL_RANK"] -# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh -# uncomment the below commands if running via interactive node -num_gpus_per_node = 2 -os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) -cuda_name = "cuda:" + str(rank % num_gpus_per_node) +if params['interactive_session']: + num_gpus_per_node = 2 + os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) + cuda_name = "cuda:" + str(rank % num_gpus_per_node) +else: + # CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh + local_rank = os.environ["PMI_LOCAL_RANK"] # --------------------- # Enable logging @@ -157,4 +158,3 @@ def run(job, optuna_trial=None): results.to_csv(f"{params['output_dir']}/hpo_results.csv", index=False) print("current node: ", socket.gethostname(), "; current rank: ", rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) print("Finished deephyper HPO.") - print(params['max_evals']) From 0139542ccd2faad1888b6dfe6c666399120a5105 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:15:20 -0500 Subject: [PATCH 233/254] bug --- hpo_deephyper_subprocess.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 28d6e2d..7a4a9ca 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -30,7 +30,21 @@ import hpo_deephyper_params_def from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig - +# Initialize parameters for DeepHyper HPO +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +global params +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="hpo_deephyper_params.ini", + additional_definitions=hpo_deephyper_params_def.additional_definitions +) +output_dir = Path(params['output_dir']) +if output_dir.exists() is False: + os.makedirs(output_dir, exist_ok=True) +params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" +params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" +params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") # --------------------- # Enable using multiple GPUs @@ -125,22 +139,6 @@ def run(job, optuna_trial=None): if __name__ == "__main__": - # Initialize parameters for DeepHyper HPO - filepath = Path(__file__).resolve().parent - cfg = DRPPreprocessConfig() - global params - params = cfg.initialize_parameters( - pathToModelDir=filepath, - default_config="hpo_deephyper_params.ini", - additional_definitions=hpo_deephyper_params_def.additional_definitions - ) - output_dir = Path(params['output_dir']) - if output_dir.exists() is False: - os.makedirs(output_dir, exist_ok=True) - params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" - params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" - params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") - with Evaluator.create( run, 
method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} ) as evaluator: From 883158aacb606348adeed508663c1534d7c30f01 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:47:43 -0500 Subject: [PATCH 234/254] inputdir --- README_deephyper_alpha.md | 77 +++++++++++-------------------------- hpo_deephyper_params.ini | 2 +- hpo_deephyper_params_def.py | 11 ------ hpo_deephyper_subprocess.py | 12 +++--- 4 files changed, 29 insertions(+), 73 deletions(-) diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md index a4a77d6..6337ad8 100644 --- a/README_deephyper_alpha.md +++ b/README_deephyper_alpha.md @@ -1,93 +1,60 @@ -# Run HPO using deephyper on Polaris +# Run HPO using DeepHyper on Lambda ## Install conda environment for the curated model (PathDSP) +Install PathDSP: ``` -## install PathDSP git clone https://github.com/JDACS4C-IMPROVE/PathDSP -cd PathDSP -git checkout develop +``` -## install IMPROVE and download data +Install IMPROVE and download data: +``` source setup_improve.sh -or -export PYTHONPATH=../IMPROVE +``` -## define where to install PathDSP env +Install PathDSP environment: +``` +cd PathDSP export PathDSP_env=./PathDSP_env/ conda env create -f PathDSP_env_conda.yml -p $PathDSP_env +``` -## set up environment variables -cd .. +Set up environment variables? cd improve_lib="$PWD/IMPROVE/" echo "export PYTHONPATH=$PYTHONPATH:${improve_lib}" >> IMPROVE_env echo "export PathDSP_env=$PathDSP_env" >> IMPROVE_env source $PWD/IMPROVE_env -``` - - ## Perform preprocessing Run the preprocess script. This script taks around 40 mins to complete. The workflow assumes that your preprocessed data is at: "ml_data/{source}-{source}/split_{split}" -``` -### if necessary, request an interactive node from polaris to testing purposes -### qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug -### NEED to cd into your working directory again once the job started -``` - ``` cd PathDSP conda activate $PathDSP_env python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir ./ml_data/CCLE-CCLE/split_0 ``` -## Perform HPO using singularity container across two nodes -This will presumably have to be redone for alpha. 
- -``` -## copy processed to IMPROVE_DATA_DIR -cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_DATA_DIR -## specify singularity image file for PathDSP -echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env -cd PathDSP -## submit to debug queue -qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity_debug.sh -## to submit to debug-scaling or prod queue -## use hpo_scale_singularity_debug_scaling.sh -## or hpo_scale_singularity_prod.sh -## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py +## Install conda environment for DeepHyper ``` - -## Alternatively, perform HPO across two nodes based on conda - -``` -cd PathDSP -# supply environment variables to qsub -qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh -## for interactive node, you can run: mpirun -np 10 python hpo_deephyper_subprocess.py -``` - - -Lambda instructions -Setup repo - -Install DeepHyper env - module load openmpi conda create -n dh python=3.9 -y conda activate dh conda install gxx_linux-64 gcc_linu pip install "deephyper[default]" pip install mpi4py +``` -Each time: -Set python path (in repo) - module load openmpi conda activate dh export PYTHONPATH=../IMPROVE +## Perform HPO +If necessary, activate environment: +``` +module load openmpi +conda activate dh +export PYTHONPATH=../IMPROVE +``` + +Run HPO: +``` +mpirun -np 10 python hpo_deephyper_subprocess.py +``` -Run -mpirun -np 10 python hpo_deephyper_subprocess.py \ No newline at end of file diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index f31a7c7..f30d5a5 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -1,5 +1,5 @@ [DEFAULT] -input_dir = ./csa_data/raw_data +input_dir = ./ml_data/CCLE-CCLE/split_0 y_col_name = auc model_name = PathDSP model_scripts_dir = ./ diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py index 8aaef2a..0fd217c 100644 --- a/hpo_deephyper_params_def.py +++ b/hpo_deephyper_params_def.py @@ -24,22 +24,11 @@ "default": '', "help": "Name of your model conda environment" }, - {"name": "hyperparameters_file", - "type": str, - "default": 'hyperparameters_default.json', - "help": "json file containing optimized hyperparameters per dataset" - }, {"name": "epochs", "type": int, "default": 10, "help": "Number of epochs" }, - {"name": "available_accelerators", - "nargs" : "+", - "type": str, - "default": ["0", "1"], - "help": "GPU IDs to assign jobs" - }, {"name": "use_singularity", "type": bool, "default": True, diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 7a4a9ca..3be1741 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -2,13 +2,11 @@ Before running this script, first need to preprocess the data. This can be done by running preprocess_example.sh -It is assumed that the csa benchmark data is downloaded via download_csa.sh and the env vars $PYTHONPATH is set: export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib -It also assumes that your processed training data is at: "ml_data/{source}-{source}/split_{split}" -validation data is at: "ml_data/{source}-{source}/split_{split}" -model output files will be saved at "dh_hpo_improve/{source}/split_{split}" +It also assumes that your processed training and validation data is in input_dir. +Model output files will be saved in output_dir/{source}/split_{split}.
mpirun -np 10 python hpo_subprocess.py """ @@ -30,7 +28,9 @@ import hpo_deephyper_params_def from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig +# --------------------- # Initialize parameters for DeepHyper HPO +# --------------------- filepath = Path(__file__).resolve().parent cfg = DRPPreprocessConfig() global params @@ -42,7 +42,7 @@ output_dir = Path(params['output_dir']) if output_dir.exists() is False: os.makedirs(output_dir, exist_ok=True) -params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" +#params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") @@ -106,7 +106,7 @@ def run(job, optuna_trial=None): "hpo_deephyper_subprocess_train.sh", str(params['model_environment']), str(params['script_name']), - str(params['ml_data_dir']), + str(params['input_dir']), str(model_outdir_job_id), str(learning_rate), str(batch_size), From bc140c9571f3f52e071efea5ccc54c76094b1ee8 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:57:34 -0500 Subject: [PATCH 235/254] ml_data_dir, input_dir throws error related to x_data path --- hpo_deephyper_params.ini | 2 +- hpo_deephyper_params_def.py | 5 +++++ hpo_deephyper_subprocess.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index f30d5a5..f3d7247 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -1,5 +1,5 @@ [DEFAULT] -input_dir = ./ml_data/CCLE-CCLE/split_0 +ml_data_dir = ./ml_data/CCLE-CCLE/split_0 y_col_name = auc model_name = PathDSP model_scripts_dir = ./ diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py index 0fd217c..0588ea3 100644 --- a/hpo_deephyper_params_def.py +++ b/hpo_deephyper_params_def.py @@ -53,5 +53,10 @@ "type": bool, "default": True, "help": "Are you using an interactive session?" + }, + {"name": "ml_data_dir", + "type": str, + "default": './', + "help": "Location of the preprocessed data." } ] \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 3be1741..02f1b0b 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -5,7 +5,7 @@ and the env vars $PYTHONPATH is set: export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib -It also assumes that your processed training and validation data is in input_dir. +It also assumes that your processed training and validation data is in ml_data_dir. Model output files will be saved in output_dir/{source}/split_{split}. 
mpirun -np 10 python hpo_subprocess.py @@ -106,7 +106,7 @@ def run(job, optuna_trial=None): "hpo_deephyper_subprocess_train.sh", str(params['model_environment']), str(params['script_name']), - str(params['input_dir']), + str(params['ml_data_dir']), str(model_outdir_job_id), str(learning_rate), str(batch_size), From d6c7ebdcb41f880402414c2004417f73fddbc352 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:59:49 -0500 Subject: [PATCH 236/254] bug --- hpo_deephyper_params.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini index f3d7247..2178f9c 100644 --- a/hpo_deephyper_params.ini +++ b/hpo_deephyper_params.ini @@ -1,4 +1,5 @@ [DEFAULT] +input_dir = ./csa_data/raw_data ml_data_dir = ./ml_data/CCLE-CCLE/split_0 y_col_name = auc model_name = PathDSP From 8dd589e36ea54dd141e4225f5a9175f54124a7f8 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Tue, 19 Nov 2024 08:57:59 -0500 Subject: [PATCH 237/254] fix readme, clean up repo --- README_deephyper_alpha.md | 18 +++++++++--------- .../README_deephyper.md | 0 .../hpo_subprocess.py | 0 .../subprocess_train.sh | 0 .../PathDSP_cs_model.txt | 0 .../README_old.md | 0 .../README_old2.md | 0 TODO.txt => old_legacy_scripts/TODO.txt | 0 .../environment.yml | 0 .../environment_081723.yml | 0 .../get_test_data.py | 0 .../improve_utils.py | 0 infer.py => old_legacy_scripts/infer.py | 0 infer.sh => old_legacy_scripts/infer.sh | 0 .../parse_DSP_data_Chia_Jan12_2023.R | 0 .../preprocess.py | 0 .../preprocess.sh | 0 .../preprocess_new.py | 0 train.py => old_legacy_scripts/train.py | 0 train.sh => old_legacy_scripts/train.sh | 0 20 files changed, 9 insertions(+), 9 deletions(-) rename README_deephyper.md => old_deephyper/README_deephyper.md (100%) rename hpo_subprocess.py => old_deephyper/hpo_subprocess.py (100%) rename subprocess_train.sh => old_deephyper/subprocess_train.sh (100%) rename PathDSP_cs_model.txt => old_legacy_scripts/PathDSP_cs_model.txt (100%) rename README_old.md => old_legacy_scripts/README_old.md (100%) rename README_old2.md => old_legacy_scripts/README_old2.md (100%) rename TODO.txt => old_legacy_scripts/TODO.txt (100%) rename environment.yml => old_legacy_scripts/environment.yml (100%) rename environment_081723.yml => old_legacy_scripts/environment_081723.yml (100%) rename get_test_data.py => old_legacy_scripts/get_test_data.py (100%) rename improve_utils.py => old_legacy_scripts/improve_utils.py (100%) rename infer.py => old_legacy_scripts/infer.py (100%) rename infer.sh => old_legacy_scripts/infer.sh (100%) rename parse_DSP_data_Chia_Jan12_2023.R => old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R (100%) rename preprocess.py => old_legacy_scripts/preprocess.py (100%) rename preprocess.sh => old_legacy_scripts/preprocess.sh (100%) rename preprocess_new.py => old_legacy_scripts/preprocess_new.py (100%) rename train.py => old_legacy_scripts/train.py (100%) rename train.sh => old_legacy_scripts/train.sh (100%) diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md index 6337ad8..de026cf 100644 --- a/README_deephyper_alpha.md +++ b/README_deephyper_alpha.md @@ -1,13 +1,15 @@ -# Run HPO using DeepHyper on Lambda +# Run HPO using DeepHyper on Lambda with conda ## Install conda environment for the curated model (PathDSP) Install PathDSP: ``` +cd git clone https://github.com/JDACS4C-IMPROVE/PathDSP ``` Install IMPROVE and download data: ``` +cd PathDSP source setup_improve.sh ``` @@ 
-18,13 +20,6 @@ export PathDSP_env=./PathDSP_env/ conda env create -f PathDSP_env_conda.yml -p $PathDSP_env ``` -Set up environment variables? -cd -improve_lib="$PWD/IMPROVE/" -echo "export PYTHONPATH=$PYTHONPATH:${improve_lib}" >> IMPROVE_env -echo "export PathDSP_env=$PathDSP_env" >> IMPROVE_env -source $PWD/IMPROVE_env - ## Perform preprocessing Run the preprocess script. This script taks around 40 mins to complete. The workflow assumes that your preprocessed data is at: "ml_data/{source}-{source}/split_{split}" @@ -33,6 +28,7 @@ The workflow assumes that your preprocessed data is at: "ml_data/{source}-{sourc cd PathDSP conda activate $PathDSP_env python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir ./ml_data/CCLE-CCLE/split_0 +conda deactivate ``` ## Install conda environment for DeepHyper @@ -40,7 +36,7 @@ python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_di module load openmpi conda create -n dh python=3.9 -y conda activate dh -conda install gxx_linux-64 gcc_linu +conda install gxx_linux-64 gcc_linux-64 pip install "deephyper[default]" pip install mpi4py ``` @@ -58,3 +54,7 @@ Run HPO: mpirun -np 10 python hpo_deephyper_subprocess.py ``` +# Run HPO using DeepHyper on Polaris with conda + +# Run HPO using DeepHyper on Polaris with singularity + diff --git a/README_deephyper.md b/old_deephyper/README_deephyper.md similarity index 100% rename from README_deephyper.md rename to old_deephyper/README_deephyper.md diff --git a/hpo_subprocess.py b/old_deephyper/hpo_subprocess.py similarity index 100% rename from hpo_subprocess.py rename to old_deephyper/hpo_subprocess.py diff --git a/subprocess_train.sh b/old_deephyper/subprocess_train.sh similarity index 100% rename from subprocess_train.sh rename to old_deephyper/subprocess_train.sh diff --git a/PathDSP_cs_model.txt b/old_legacy_scripts/PathDSP_cs_model.txt similarity index 100% rename from PathDSP_cs_model.txt rename to old_legacy_scripts/PathDSP_cs_model.txt diff --git a/README_old.md b/old_legacy_scripts/README_old.md similarity index 100% rename from README_old.md rename to old_legacy_scripts/README_old.md diff --git a/README_old2.md b/old_legacy_scripts/README_old2.md similarity index 100% rename from README_old2.md rename to old_legacy_scripts/README_old2.md diff --git a/TODO.txt b/old_legacy_scripts/TODO.txt similarity index 100% rename from TODO.txt rename to old_legacy_scripts/TODO.txt diff --git a/environment.yml b/old_legacy_scripts/environment.yml similarity index 100% rename from environment.yml rename to old_legacy_scripts/environment.yml diff --git a/environment_081723.yml b/old_legacy_scripts/environment_081723.yml similarity index 100% rename from environment_081723.yml rename to old_legacy_scripts/environment_081723.yml diff --git a/get_test_data.py b/old_legacy_scripts/get_test_data.py similarity index 100% rename from get_test_data.py rename to old_legacy_scripts/get_test_data.py diff --git a/improve_utils.py b/old_legacy_scripts/improve_utils.py similarity index 100% rename from improve_utils.py rename to old_legacy_scripts/improve_utils.py diff --git a/infer.py b/old_legacy_scripts/infer.py similarity index 100% rename from infer.py rename to old_legacy_scripts/infer.py diff --git a/infer.sh b/old_legacy_scripts/infer.sh similarity index 100% rename from infer.sh rename to old_legacy_scripts/infer.sh diff --git a/parse_DSP_data_Chia_Jan12_2023.R b/old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R similarity index 100% rename from 
parse_DSP_data_Chia_Jan12_2023.R rename to old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R diff --git a/preprocess.py b/old_legacy_scripts/preprocess.py similarity index 100% rename from preprocess.py rename to old_legacy_scripts/preprocess.py diff --git a/preprocess.sh b/old_legacy_scripts/preprocess.sh similarity index 100% rename from preprocess.sh rename to old_legacy_scripts/preprocess.sh diff --git a/preprocess_new.py b/old_legacy_scripts/preprocess_new.py similarity index 100% rename from preprocess_new.py rename to old_legacy_scripts/preprocess_new.py diff --git a/train.py b/old_legacy_scripts/train.py similarity index 100% rename from train.py rename to old_legacy_scripts/train.py diff --git a/train.sh b/old_legacy_scripts/train.sh similarity index 100% rename from train.sh rename to old_legacy_scripts/train.sh From ac1c7e65614d7910f0e531ff6bc012c8b488b063 Mon Sep 17 00:00:00 2001 From: Andreas Wilke Date: Tue, 19 Nov 2024 11:12:18 -0600 Subject: [PATCH 238/254] Interface scripts for CSA --- PathDSP_params.txt | 2 +- infer.sh | 69 +++-------------------------------------- preprocess.sh | 77 +++------------------------------------------- train.sh | 70 +++-------------------------------------- 4 files changed, 14 insertions(+), 204 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 91f33f6..32bc09b 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,6 +1,6 @@ [Preprocess] data_format = .txt -input_supp_data_dir = ./author_data +input_supp_data_dir = /nfs/ml_lab/projects/improve/data/experiments/src/PathDSP/author_data train_split_file = CCLE_split_0_train.txt val_split_file = CCLE_split_0_val.txt test_split_file = CCLE_split_0_test.txt diff --git a/infer.sh b/infer.sh index 4be3ec5..f1dcd94 100755 --- a/infer.sh +++ b/infer.sh @@ -1,68 +1,7 @@ -#!/bin/bash +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_infer_improve.py -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG - -### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=PathDSP_infer_improve.py - -### Set env if CANDLE_MODEL is not in same directory as this script +# Set env if CANDLE_MODEL is not in same directory as this script IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} -CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} -if [ ! 
-f ${CANDLE_MODEL} ] ; then - echo No such file ${CANDLE_MODEL} - exit 404 -fi - -if [ $# -lt 2 ]; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit -1 -fi - -if [ $# -eq 2 ]; then - CUDA_VISIBLE_DEVICES=$1 - shift - CANDLE_DATA_DIR=$1 - shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = ${CMD}" -elif [ $# -ge 3 ]; then - CUDA_VISIBLE_DEVICES=$1 - shift - CANDLE_DATA_DIR=$1 - shift - - # if original $3 is a file, set candle_config and passthrough $@ - ### if [ -f $CANDLE_DATA_DIR/$1 ] ; then - if [ -f $1 ]; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 - shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD" - - # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi -fi - - - -# Display runtime arguments -#echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" -echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" -echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" - -# Set up environmental variables and execute model -echo "activating environment" -source activate /usr/local/conda_envs/PathDSP_env -echo "running command ${CMD}" -CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/preprocess.sh b/preprocess.sh index 590905b..4da8d87 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -1,77 +1,8 @@ -#!/bin/bash - -######################################################################### -### THIS IS A TEMPLATE FILE. SUBSTITUTE #PATH# WITH THE MODEL EXECUTABLE. -######################################################################### - -# arg 1 CANDLE_DATA_DIR -# arg 2 CANDLE_CONFIG - -### Path and Name to your CANDLEized model's main Python script### - -# e.g. CANDLE_MODEL=graphdrp_preprocess.py -CANDLE_MODEL_SCRIPT=PathDSP_preprocess_improve.py +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_STAGE=preprocess +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_${IMPROVE_STAGE}_improve.py # Set env if CANDLE_MODEL is not in same directory as this script IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} -# Combine path and name and check if executable exists -CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL_SCRIPT} -if [ ! 
-f ${CANDLE_MODEL} ] ; then - echo No such file ${CANDLE_MODEL} - exit 404 -fi - - - -if [ $# -lt 2 ] ; then - echo "Illegal number of parameters" - echo "CANDLE_DATA_DIR PARAMS are required" - exit -1 -fi - - - -if [ $# -eq 2 ] ; then - - CANDLE_DATA_DIR=$1 ; shift - - # if $2 is a file, then set candle_config - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - CONFIG_FILE=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file ${CONFIG_FILE}" - else - CMD="python ${CANDLE_MODEL} $@" - echo CMD=\"$CMD\" - fi - -elif [ $# -ge 3 ] ; then - - CANDLE_DATA_DIR=$1 ; shift - - # if $2 is a file, then set candle_config - if [ -f $CANDLE_DATA_DIR/$1 ] ; then - echo "$1 is a file" - CANDLE_CONFIG=$1 ; shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD $@" - - # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi -fi - -# Display runtime arguments -echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" -echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" -echo "running command ${CMD}" -# Set up environmental variables and execute model -echo "activating environment" -#source /opt/conda/etc/profile.d/conda.sh -source activate /usr/local/conda_envs/PathDSP_env -echo "running command ${CMD}" -CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/train.sh b/train.sh index 325f9a4..a10d6e2 100755 --- a/train.sh +++ b/train.sh @@ -1,67 +1,7 @@ -#!/bin/bash +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_train_improve.py -# arg 1 CUDA_VISIBLE_DEVICES -# arg 2 CANDLE_DATA_DIR -# arg 3 CANDLE_CONFIG +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} -### Path to your CANDLEized model's main Python script### -CANDLE_MODEL=PathDSP_train_improve.py - -### Set env if CANDLE_MODEL is not in same directory as this script -IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$(dirname -- "$0")} - -CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} -if [ ! 
-f ${CANDLE_MODEL} ]; then - echo No such file ${CANDLE_MODEL} - exit 404 -fi - -if [ $# -lt 2 ]; then - echo "Illegal number of parameters" - echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" - exit -1 -fi - -if [ $# -eq 2 ]; then - CUDA_VISIBLE_DEVICES=$1 - shift - CANDLE_DATA_DIR=$1 - shift - CMD="python ${CANDLE_MODEL}" - echo "CMD = ${CMD}" -elif [ $# -ge 3 ]; then - CUDA_VISIBLE_DEVICES=$1 - shift - CANDLE_DATA_DIR=$1 - shift - - # if original $3 is a file, set candle_config and passthrough $@ - ### if [ -f $CANDLE_DATA_DIR/$1 ] ; then - if [ -f $1 ]; then - echo "$CANDLE_DATA_DIR/$1 is a file" - CANDLE_CONFIG=$1 - shift - CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" - echo "CMD = $CMD" - - # else passthrough $@ - else - echo "$1 is not a file" - CMD="python ${CANDLE_MODEL} $@" - echo "CMD = $CMD" - - fi -fi - -# Display runtime arguments -#echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" -echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" -echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" -echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" - -# Set up environmental variables and execute model -echo "activating environment" -#source /opt/conda/etc/profile.d/conda.sh -source activate /usr/local/conda_envs/PathDSP_env -echo "running command ${CMD}" -CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ From 1271f2cc5228af873b52b6b1e9872a230159d85f Mon Sep 17 00:00:00 2001 From: Andreas Wilke Date: Tue, 19 Nov 2024 11:16:17 -0600 Subject: [PATCH 239/254] Moved interfaced scripts back --- old_legacy_scripts/infer.sh => infer.sh | 0 old_legacy_scripts/preprocess.sh => preprocess.sh | 0 old_legacy_scripts/train.sh => train.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename old_legacy_scripts/infer.sh => infer.sh (100%) rename old_legacy_scripts/preprocess.sh => preprocess.sh (100%) rename old_legacy_scripts/train.sh => train.sh (100%) diff --git a/old_legacy_scripts/infer.sh b/infer.sh similarity index 100% rename from old_legacy_scripts/infer.sh rename to infer.sh diff --git a/old_legacy_scripts/preprocess.sh b/preprocess.sh similarity index 100% rename from old_legacy_scripts/preprocess.sh rename to preprocess.sh diff --git a/old_legacy_scripts/train.sh b/train.sh similarity index 100% rename from old_legacy_scripts/train.sh rename to train.sh From 8804acd8e7a240f261cb5bcb69fe58b2c30544e2 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:31:38 -0500 Subject: [PATCH 240/254] un-hardcoded hyperparams, testing HpProblem --- hpo_deephyper_hyperparameters.py | 35 ++++++++++++++++++++++++++++++++ hpo_deephyper_subprocess.py | 16 ++++++++++++--- 2 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 hpo_deephyper_hyperparameters.py diff --git a/hpo_deephyper_hyperparameters.py b/hpo_deephyper_hyperparameters.py new file mode 100644 index 0000000..0256e3c --- /dev/null +++ b/hpo_deephyper_hyperparameters.py @@ -0,0 +1,35 @@ +hyperparams = [ + {"name": "batch_size", + "type": int, + "min": 8, + "max": 512, + "default": 64, + "log_uniform": True + }, + {"name": "learning_rate", + "type": float, + "min": 1e-6, + "max": 1e-2, + "default": 0.001, + "log_uniform": True + }, +] + + +''' +{ +"name": "dropout", +"type": float, +"min": 0, +"max": 0.5, +"default": 0, +"log_uniform": False +} + +{ +"name": "early_stopping", +"type": "categorical", +"choices": [True, False], +"default": False +} +''' \ No 
newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 02f1b0b..210ee39 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -26,6 +26,7 @@ from mpi4py import MPI import socket import hpo_deephyper_params_def +from hpo_deephyper_hyperparameters import hyperparams from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig # --------------------- @@ -86,9 +87,18 @@ # --------------------- problem = HpProblem() -problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=64) -problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), - "learning_rate", default_value=0.001) +for hp in hyperparams: + if hp['type'] == "categorical": + print("not implemented yet") + else: + if hp['log_uniform']: + problem.add_hyperparameter((hp['min'], hp['max'], "log-uniform"), + hp['name'], default_value=hp['default']) + else: + problem.add_hyperparameter((hp['min'], hp['max']), + hp['name'], default_value=hp['default']) + + # problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) # problem.add_hyperparameter([True, False], "early_stopping", default_value=False) From a074fdbd13175ad62bafc9170ad375bf8889ec33 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:11:15 -0500 Subject: [PATCH 241/254] modified shell script to take n hyperparameters --- hpo_deephyper_subprocess.py | 2 + hpo_deephyper_subprocess_train.sh | 64 ++++++++++++------------------- 2 files changed, 27 insertions(+), 39 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 210ee39..2abd7b3 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -98,6 +98,8 @@ problem.add_hyperparameter((hp['min'], hp['max']), hp['name'], default_value=hp['default']) +params['hyperparams'] = [d['name'] for d in hyperparams] + # problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) # problem.add_hyperparameter([True, False], "early_stopping", default_value=False) diff --git a/hpo_deephyper_subprocess_train.sh b/hpo_deephyper_subprocess_train.sh index 1b28f04..c3f8984 100644 --- a/hpo_deephyper_subprocess_train.sh +++ b/hpo_deephyper_subprocess_train.sh @@ -11,51 +11,37 @@ # https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script # This doesn't work w/o eval "$(conda shell.bash hook)" CONDA_ENV=$1 -#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" -#eval "$(conda shell.bash hook)" echo "Activated conda commands in shell script" -#conda activate $CONDA_ENV -#source activate $CONDA_ENV conda_path=$(dirname $(dirname $(which conda))) source $conda_path/bin/activate $CONDA_ENV -#source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV -#source activate $CONDA_ENV echo "Activated conda env $CONDA_ENV" -#model path, model name, epochs + +# get mandatory arguments SCRIPT=$2 input_dir=$3 output_dir=$4 -learning_rate=$5 -batch_size=$6 -epochs=$7 -#cuda_name=$6 -CUDA_VISIBLE_DEVICES=$8 - - -#echo "train_ml_data_dir: $train_ml_data_dir" -#echo "val_ml_data_dir: $val_ml_data_dir" -echo "CONDA_ENV: $CONDA_ENV" -echo "SCRIPT: $SCRIPT" -echo "input_dir: $input_dir" -echo "output_dir: $output_dir" -echo "learning_rate: $learning_rate" -echo "batch_size: $batch_size" -echo "epochs: $epochs" -echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" - - -# All train outputs are saved in params["model_outdir"] -#CUDA_VISIBLE_DEVICES=6,7 python 
PathDSP_train_improve.py \ -#CUDA_VISIBLE_DEVICES=5 -#CUDA_VISIBLE_DEVICES=6,7 -CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python $SCRIPT \ - --input_dir $input_dir \ - --output_dir $output_dir \ - --epochs $epochs \ - --learning_rate $learning_rate \ - --batch_size $batch_size -# --cuda_name $cuda_name - -#conda deactivate +epochs=$5 +CUDA_VISIBLE_DEVICES=$6 + +command="python $SCRIPT --input_dir $input_dir --output_dir $output_dir --epochs $epochs " + + +# append hyperparameter arguments to python call +for i in $(seq 7 $#) +do + if [ $(($i % 2)) == 0 ]; then + command="${command} ${!i}" + else + command="${command} --${!i}" + fi +done + + +echo "command: $command" + +# run python script +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} $command + + source $conda_path/bin/deactivate echo "Deactivated conda env $CONDA_ENV" From befc4b56c2d59f9349244ca49d71a29431f27f62 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:24:17 -0500 Subject: [PATCH 242/254] modified run() to take n hyperparameters --- hpo_deephyper_subprocess.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 2abd7b3..091d010 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -108,23 +108,24 @@ @profile def run(job, optuna_trial=None): model_outdir_job_id = Path(params['model_outdir'] + f"/{job.id}") - learning_rate = job.parameters["learning_rate"] - batch_size = job.parameters["batch_size"] - - print(f"Launching run: batch_size={batch_size}, learning_rate={learning_rate}") - subprocess_res = subprocess.run( - [ - "bash", - "hpo_deephyper_subprocess_train.sh", + #learning_rate = job.parameters["learning_rate"] + #batch_size = job.parameters["batch_size"] + + train_run = ["bash", "hpo_deephyper_subprocess_train.sh", str(params['model_environment']), str(params['script_name']), str(params['ml_data_dir']), str(model_outdir_job_id), - str(learning_rate), - str(batch_size), str(params['epochs']), str(os.environ["CUDA_VISIBLE_DEVICES"]) - ], + ] + for hp in params['hyperparams']: + train_run = train_run + [hp] + train_run = train_run + [job.parameters[hp]] + + print(f"Launching run: ") + print(train_run) + subprocess_res = subprocess.run(train_run, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True From e230ca09b8f13d2009be331a4a84fb91f138ef8c Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:28:32 -0500 Subject: [PATCH 243/254] bug --- hpo_deephyper_subprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py index 091d010..01c950e 100644 --- a/hpo_deephyper_subprocess.py +++ b/hpo_deephyper_subprocess.py @@ -120,8 +120,8 @@ def run(job, optuna_trial=None): str(os.environ["CUDA_VISIBLE_DEVICES"]) ] for hp in params['hyperparams']: - train_run = train_run + [hp] - train_run = train_run + [job.parameters[hp]] + train_run = train_run + [str(hp)] + train_run = train_run + [str(job.parameters[hp])] print(f"Launching run: ") print(train_run) From e5eb69731e5eb6dc8c4569f1127745ce37536319 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:33:00 -0500 Subject: [PATCH 244/254] testing 3 hps --- hpo_deephyper_hyperparameters.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hpo_deephyper_hyperparameters.py 
b/hpo_deephyper_hyperparameters.py index 0256e3c..56c446b 100644 --- a/hpo_deephyper_hyperparameters.py +++ b/hpo_deephyper_hyperparameters.py @@ -13,6 +13,14 @@ "default": 0.001, "log_uniform": True }, + { + "name": "dropout", + "type": float, + "min": 0, + "max": 0.5, + "default": 0, + "log_uniform": False + } ] From 619c3f1a480e59252e89f3a302fa2531bd03fcd7 Mon Sep 17 00:00:00 2001 From: Andreas Wilke Date: Mon, 25 Nov 2024 13:17:08 -0600 Subject: [PATCH 245/254] Added aliases for stage files to work with csa --- pathdsp_infer_improve.py | 1 + pathdsp_preprocess_improve.py | 1 + pathdsp_train_improve.py | 1 + 3 files changed, 3 insertions(+) create mode 120000 pathdsp_infer_improve.py create mode 120000 pathdsp_preprocess_improve.py create mode 120000 pathdsp_train_improve.py diff --git a/pathdsp_infer_improve.py b/pathdsp_infer_improve.py new file mode 120000 index 0000000..90b57c2 --- /dev/null +++ b/pathdsp_infer_improve.py @@ -0,0 +1 @@ +PathDSP_infer_improve.py \ No newline at end of file diff --git a/pathdsp_preprocess_improve.py b/pathdsp_preprocess_improve.py new file mode 120000 index 0000000..0e1ec22 --- /dev/null +++ b/pathdsp_preprocess_improve.py @@ -0,0 +1 @@ +PathDSP_preprocess_improve.py \ No newline at end of file diff --git a/pathdsp_train_improve.py b/pathdsp_train_improve.py new file mode 120000 index 0000000..c4d5809 --- /dev/null +++ b/pathdsp_train_improve.py @@ -0,0 +1 @@ +PathDSP_train_improve.py \ No newline at end of file From d743806aa26f88a8c9476d710d0c39b7ead8cff3 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:26:51 -0500 Subject: [PATCH 246/254] Update PathDSP_preprocess_improve.py --- PathDSP_preprocess_improve.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 54aff53..f36b915 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -331,11 +331,18 @@ def prep_input(params): comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) comb_data_mtx = comb_data_mtx.dropna() - comb_data_mtx_to_save = copy.deepcopy(comb_data_mtx) + comb_data_mtx_to_save = comb_data_mtx['response'] comb_data_mtx_to_save = comb_data_mtx_to_save.reset_index() - print(comb_data_mtx_to_save) + comb_data_mtx_to_save.rename(columns={'drug_id': 'improve_chem_id', 'sample_id': 'improve_sample_id'}, inplace=True) comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) - frm.save_stage_ydf(ydf=comb_data_mtx_to_save, stage=i, output_dir=params["output_dir"]) + rsp = drp.DrugResponseLoader(params, + split_file=params[i+"_split_file"], + verbose=False).dfs["response.tsv"] + ydata = rsp.join(comb_data_mtx_to_save, on=['improve_chem_id', 'improve_sample_id'], how='right') + print(comb_data_mtx_to_save) + print("YDATA") + print(ydata) + frm.save_stage_ydf(ydf=ydata, stage=i, output_dir=params["output_dir"]) pl.from_pandas(comb_data_mtx).write_csv( params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], stage=i) , separator="\t", has_header=True From 2044d94c1c352662c524ad975c411a49107f9880 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 2 Dec 2024 10:01:27 -0500 Subject: [PATCH 247/254] Update PathDSP_preprocess_improve.py --- PathDSP_preprocess_improve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index f36b915..4e88841 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -338,7 +338,7 @@ def prep_input(params): rsp = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=False).dfs["response.tsv"] - ydata = rsp.join(comb_data_mtx_to_save, on=['improve_chem_id', 'improve_sample_id'], how='right') + ydata = rsp.merge(comb_data_mtx_to_save, on=['improve_chem_id', 'improve_sample_id'], how='right') print(comb_data_mtx_to_save) print("YDATA") print(ydata) From aef3a41fc4759d6d1874ec74e8013f04e4975b72 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:57:39 -0500 Subject: [PATCH 248/254] Update PathDSP_preprocess_improve.py --- PathDSP_preprocess_improve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 4e88841..0cdc896 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -334,7 +334,7 @@ def prep_input(params): comb_data_mtx_to_save = comb_data_mtx['response'] comb_data_mtx_to_save = comb_data_mtx_to_save.reset_index() comb_data_mtx_to_save.rename(columns={'drug_id': 'improve_chem_id', 'sample_id': 'improve_sample_id'}, inplace=True) - comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) + #comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) rsp = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=False).dfs["response.tsv"] From 3dc67cdb24af9bcbac04e695cf8a41153f82164d Mon Sep 17 00:00:00 2001 From: Andreas Wilke Date: Tue, 3 Dec 2024 09:18:58 -0600 Subject: [PATCH 249/254] Removing symlinks; they create problems on case-insensitive file systems --- PathDSP_infer_improve.py | 85 ------- PathDSP_preprocess_improve.py | 438 ---------------------------------- PathDSP_train_improve.py | 329 ------------------------- pathdsp_infer_improve.py | 1 - pathdsp_preprocess_improve.py | 1 - pathdsp_train_improve.py | 1 - 6 files changed, 855 deletions(-) delete mode 100755 PathDSP_infer_improve.py delete mode 100644 PathDSP_preprocess_improve.py delete mode 100644 PathDSP_train_improve.py delete mode 120000 pathdsp_infer_improve.py delete mode 120000 pathdsp_preprocess_improve.py delete mode 120000 pathdsp_train_improve.py diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py deleted file mode 100755 index 97fc938..0000000 --- a/PathDSP_infer_improve.py +++ /dev/null @@ -1,85 +0,0 @@ -import os -import sys -import numpy as np -import pandas as pd -from datetime import datetime -import torch as tch -import torch.utils.data as tchud -import polars as pl -import model_utils.myModel as mynet -import model_utils.myDataloader as mydl -import model_utils.myUtility as myutil - -from PathDSP_preprocess_improve import mkdir, preprocess -from PathDSP_train_improve import ( - predicting, - cal_time, -) -from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK -import improvelib.utils as frm #NCK -from model_params_def import pathdsp_infer_params - -file_path = os.path.dirname(os.path.realpath(__file__)) - - -def run(params): - frm.create_outdir(outdir=params["output_dir"]) - #params = preprocess(params) - test_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="test") - test_df = pl.read_csv(params["input_data_dir"] 
+ "/" + test_data_fname, separator = "\t").to_pandas() - Xtest_arr = test_df.iloc[:, 0:-1].values - ytest_arr = test_df.iloc[:, -1].values - Xtest_arr = np.array(Xtest_arr).astype('float32') - ytest_arr = np.array(ytest_arr).astype('float32') - trained_net = mynet.FNN(Xtest_arr.shape[1]) - modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["input_model_dir"]) - trained_net.load_state_dict(tch.load(modelpath)) - trained_net.eval() - #myutil.set_seed(params["seed_int"]) - cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_env_visible is not None: - device = 'cuda:0' - else: - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) - test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) - test_dl = tchud.DataLoader(test_dataset, batch_size=params['infer_batch'], shuffle=False) - start = datetime.now() - test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) - - test_true = pd.Series(test_true) - test_pred = pd.Series(test_pred) - test_true_untrans = test_true.apply(lambda x: 10 ** (x) - 0.01) - test_pred_untrans = test_pred.apply(lambda x: 10 ** (x) - 0.01) - - frm.store_predictions_df( - y_true=test_true_untrans, - y_pred=test_pred_untrans, - stage="test", - y_col_name=params["y_col_name"], - output_dir=params["output_dir"], - input_dir=params["input_data_dir"] - ) - if params["calc_infer_scores"]: - test_scores = frm.compute_performance_scores( - y_true=test_true_untrans, - y_pred=test_pred_untrans, - stage="test", - metric_type=params["metric_type"], - output_dir=params["output_dir"] - ) - - print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) - return True - -def main(args): - cfg = DRPInferConfig() - params = cfg.initialize_parameters( - file_path, - default_config="PathDSP_params.txt", - additional_definitions=pathdsp_infer_params) - if_ran = run(params) - print("\nFinished inference of PathDSP model.") - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py deleted file mode 100644 index 0cdc896..0000000 --- a/PathDSP_preprocess_improve.py +++ /dev/null @@ -1,438 +0,0 @@ -import sys -import os -import polars as pl -import numpy as np -import pandas as pd -import copy -from functools import reduce -from pathlib import Path -from rdkit import Chem -from rdkit.Chem import AllChem -from datetime import datetime -import RWR as rwr -import NetPEA as pea -import gseapy as gp -import sklearn.model_selection as skms -from sklearn.preprocessing import StandardScaler -from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig #NCK -from improvelib.utils import str2bool #NCK -import improvelib.utils as frm #NCK -import improvelib.applications.drug_response_prediction.drug_utils as drugs #NCK -import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK -import improvelib.applications.drug_response_prediction.drp_utils as drp #NCK - -from model_params_def import pathdsp_preprocess_params - -file_path = Path(__file__).resolve().parent - -req_preprocess_args = [ll["name"] for ll in pathdsp_preprocess_params] - -def mkdir(directory): - directories = directory.split("/") - folder = "" - for d in directories: - folder += d + "/" - if not os.path.exists(folder): - print("creating folder: %s" % folder) - os.mkdir(folder) - - -def preprocess(params): - for i in [ - "drug_bits_file", - 
"dgnet_file", - "mutnet_file", - "cnvnet_file", - "exp_file", - ]: - params[i] = params["output_dir"] + "/" + params[i] - return params - - -# set timer -def cal_time(end, start): - """return time spent""" - # end = datetime.now(), start = datetime.now() - datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" - spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( - str(start), datetimeFormat - ) - return spend - -def response_out(params, split_file): - response_df = drp.DrugResponseLoader(params, split_file=split_file, verbose=True) - return response_df.dfs["response.tsv"] - - -def smile2bits(params): - start = datetime.now() - response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] - response_df = pd.concat(response_df, ignore_index=True) - smile_df = drugs.DrugsLoader(params) - smile_df = smile_df.dfs['drug_SMILES.tsv'] - smile_df = smile_df.reset_index() - smile_df.columns = ["drug", "smile"] - smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") - smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] - bit_int = params["bit_int"] - record_list = [] - # smile2bits drug by drug - n_drug = 1 - for idx, row in smile_df.iterrows(): - drug = idx - smile = row["smile"] - mol = Chem.MolFromSmiles(smile) - if mol is None: - continue - mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) - # drug_mbit_dict.update({drug:mbit}) - # append to result - record_list.append(tuple([drug] + mbit)) - if len(mbit) == bit_int: - n_drug += 1 - print("total {:} drugs with bits".format(n_drug)) - # convert dict to dataframe - colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] - drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) - # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) - # drug_mbit_df.index.name = 'drug' - print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) - # save to file - drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") - print("[Finished in {:}]".format(cal_time(datetime.now(), start))) - - -def times_expression(rwr, exp): - """ - :param rwrDf: dataframe of cell by gene probability matrix - :param expDf: dataframe of cell by gene expression matrix - :return rwr_timesexp_df: dataframe of cell by gene probability matrix, - in which genes are multiplied with expression values - - Note: this function assumes cells are all overlapped while gene maybe not - """ - cell_list = sorted(list(set(rwr.index) & set(exp.index))) - gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) - - if len(cell_list) == 0: - print("ERROR! no overlapping cell lines") - sys.exit(1) - if len(gene_list) == 0: - print("ERROR! 
no overlapping genes") - sys.exit(1) - # multiply with gene expression for overlapping cell, gene - rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] - # concat with other gene - out_gene_list = list(set(rwr.columns) - set(gene_list)) - out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) - return out_df - - -def run_netpea(params, dtype, multiply_expression): - # timer - start_time = datetime.now() - ppi_path = params["input_supp_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" - pathway_path = ( - params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" - ) - log_transform = False - permutation_int = params["permutation_int"] - seed_int = params["seed_int"] - cpu_int = params["cpu_int"] - response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] - response_df = pd.concat(response_df, ignore_index=True) - omics_data = omics.OmicsLoader(params) - if dtype == "DGnet": - drug_info = pd.read_csv(params["input_dir"] + "/x_data/drug_info.tsv", sep="\t") - drug_info["NAME"] = drug_info["NAME"].str.upper() - target_info = pd.read_csv( - params["input_supp_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" - ) - target_info = target_info.rename(columns={"drug": "NAME"}) - combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( - subset=["gene"] - ) - combined_df = combined_df.loc[ - combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), - ] - restart_path = params["output_dir"] + "/drug_target.txt" - combined_df.iloc[:, -2:].to_csv( - restart_path, sep="\t", header=True, index=False - ) - outpath = params["dgnet_file"] - elif dtype == "MUTnet": - mutation_data = omics_data.dfs['cancer_mutation_count.tsv'] - #mutation_data = mutation_data.reset_index() - mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ - lambda x: x["value"] > 0 - ] - mutation_data = mutation_data.loc[ - mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), - ] - restart_path = params["output_dir"] + "/mutation_data.txt" - mutation_data.iloc[:, 0:2].to_csv( - restart_path, sep="\t", header=True, index=False - ) - outpath = params["mutnet_file"] - else: - cnv_data = omics_data.dfs['cancer_discretized_copy_number.tsv'] - #cnv_data = cnv_data.reset_index() - cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ - lambda x: x["value"] != 0 - ] - cnv_data = cnv_data.loc[ - cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), - ] - restart_path = params["output_dir"] + "/cnv_data.txt" - cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) - outpath = params["cnvnet_file"] - # perform Random Walk - print(datetime.now(), "performing random walk with restart") - rwr_df = rwr.RWR( - ppi_path, - restart_path, - restartProbFloat=0.5, - convergenceFloat=0.00001, - normalize="l1", - weighted=True, - ).get_prob() - # multiply with gene expression - if multiply_expression: - print( - datetime.now(), - "multiplying gene expression with random walk probability for genes were expressed", - ) - # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') - # exp_df = drp.load_omics_data( - # params, - # omics_type="gene_expression", - # canc_col_name="improve_sample_id", - # gene_system_identifier="Gene_Symbol", - # ) - exp_df = omics_data.dfs['cancer_gene_expression.tsv'] - exp_df = exp_df.set_index(params['canc_col_name']) - rwr_df = 
times_expression(rwr_df, exp_df) - # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') - # perform Pathwa Enrichment Analysis - print(datetime.now(), "performing network-based pathway enrichment") - cell_pathway_df = pea.NetPEA( - rwr_df, - pathway_path, - log_transform=log_transform, - permutation=permutation_int, - seed=seed_int, - n_cpu=cpu_int, - out_path=outpath, - ) - print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) - - -def prep_input(params): - # Read data files - drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) - drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) - DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) - DGnet = ( - DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) - ) - CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) - CNVnet = ( - CNVnet.add_suffix("_cnvnet") - .reset_index() - .rename(columns={"index": "sample_id"}) - ) - MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) - MUTnet = ( - MUTnet.add_suffix("_mutnet") - .reset_index() - .rename(columns={"index": "sample_id"}) - ) - EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) - EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) - response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] - response_df = pd.concat(response_df, ignore_index=True) - response_df = response_df.rename( - columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} - ) - # Extract relevant IDs - common_drug_ids = reduce( - np.intersect1d, - (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), - ) - common_sample_ids = reduce( - np.intersect1d, - ( - CNVnet["sample_id"], - MUTnet["sample_id"], - EXP["sample_id"], - response_df["sample_id"], - ), - ) - response_df = response_df.loc[ - (response_df["drug_id"].isin(common_drug_ids)) - & (response_df["sample_id"].isin(common_sample_ids)), - :, - ] - drug_mbit_df = ( - drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] - .set_index("drug_id") - .sort_index() - ) - DGnet = ( - DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] - .set_index("drug_id") - .sort_index() - ) - CNVnet = ( - CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] - .set_index("sample_id") - .sort_index() - ) - MUTnet = ( - MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] - .set_index("sample_id") - .sort_index() - ) - EXP = ( - EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] - .set_index("sample_id") - .sort_index() - ) - - drug_data = drug_mbit_df.join(DGnet) - sample_data = CNVnet.join([MUTnet, EXP]) - ## export train,val,test set - for i in ["train", "test", "val"]: - response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) - response_df = response_df.dfs['response.tsv'] - response_df = response_df.rename( - columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} - ) - response_df = response_df.loc[ - (response_df["drug_id"].isin(common_drug_ids)) - & (response_df["sample_id"].isin(common_sample_ids)), - :, - ] - - comb_data_mtx = pd.DataFrame( - { - "drug_id": response_df["drug_id"].values, - "sample_id": response_df["sample_id"].values, - } - ) - comb_data_mtx = ( - comb_data_mtx.set_index(["drug_id", "sample_id"]) - .join(drug_data, on="drug_id") - .join(sample_data, on="sample_id") - ) - ss = StandardScaler() 
- comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]] = ss.fit_transform(comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]]) - ## add 0.01 to avoid possible inf values - comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) - comb_data_mtx = comb_data_mtx.dropna() - - comb_data_mtx_to_save = comb_data_mtx['response'] - comb_data_mtx_to_save = comb_data_mtx_to_save.reset_index() - comb_data_mtx_to_save.rename(columns={'drug_id': 'improve_chem_id', 'sample_id': 'improve_sample_id'}, inplace=True) - #comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) - rsp = drp.DrugResponseLoader(params, - split_file=params[i+"_split_file"], - verbose=False).dfs["response.tsv"] - ydata = rsp.merge(comb_data_mtx_to_save, on=['improve_chem_id', 'improve_sample_id'], how='right') - print(comb_data_mtx_to_save) - print("YDATA") - print(ydata) - frm.save_stage_ydf(ydf=ydata, stage=i, output_dir=params["output_dir"]) - pl.from_pandas(comb_data_mtx).write_csv( - params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], stage=i) -, separator="\t", has_header=True - ) - - -def run_ssgsea(params): - # expMat = improve_utils.load_gene_expression_data(sep='\t') - # expMat = drp.load_omics_data( - # params, - # omics_type="gene_expression", - # canc_col_name="improve_sample_id", - # gene_system_identifier="Gene_Symbol", - # ) - omics_data = omics.OmicsLoader(params) - expMat = omics_data.dfs['cancer_gene_expression.tsv'] - expMat = expMat.set_index(params['canc_col_name']) - - # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], - # split=params['split'], split_type=["train", "test", "val"], - # y_col_name=params['metric']) - response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] - response_df = pd.concat(response_df, ignore_index=True) - expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] - gct = expMat.T # gene (rows) cell lines (columns) - pathway_path = ( - params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" - ) - gmt = pathway_path - tmp_str = params["output_dir"] + "/tmpdir_ssgsea/" - - if not os.path.isdir(tmp_str): - os.mkdir(tmp_str) - - # run enrichment - ssgsea = gp.ssgsea( - data=gct, # gct: a matrix of gene by sample - gene_sets=gmt, # gmt format - outdir=tmp_str, - scale=True, - permutation_num=0, # 1000 - no_plot=True, - processes=params["cpu_int"], - # min_size=0, - format="png", - ) - - result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) - result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") - - f = open(tmp_str + "ssGSEA.txt", "r") - lines = f.readlines() - total_dict = {} - for cell in set(lines[1].split()): - total_dict[cell] = {} - cell_lines = lines[1].split() - vals = lines[4].split() - for i, pathway in enumerate((lines[2].split())): - if i > 0: - total_dict[cell_lines[i]][pathway] = float(vals[i]) - df = pd.DataFrame(total_dict) - df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") - -def run(params): - frm.create_outdir(outdir=params["output_dir"]) - params = preprocess(params) - print("convert drug to bits.") - smile2bits(params) - print("compute DGnet.") - run_netpea(params, dtype="DGnet", multiply_expression=False) - print("compute MUTnet.") - run_netpea(params, dtype="MUTnet", multiply_expression=True) - 
print("compute CNVnet.") - run_netpea(params, dtype="CNVnet", multiply_expression=True) - print("compute EXP.") - run_ssgsea(params) - print("prepare final input file.") - prep_input(params) - - -def main(args): - cfg = DRPPreprocessConfig() - params = cfg.initialize_parameters( - file_path, - default_config="PathDSP_params.txt", - additional_definitions=pathdsp_preprocess_params) - run(params) - - -if __name__ == "__main__": - start = datetime.now() - main(sys.argv[1:]) - print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py deleted file mode 100644 index b3eb9a6..0000000 --- a/PathDSP_train_improve.py +++ /dev/null @@ -1,329 +0,0 @@ -import os -import sys -import numpy as np -import pandas as pd -from datetime import datetime -import socket -import torch as tch -import torch.utils.data as tchud -import model_utils.myModel as mynet -import model_utils.myDataloader as mydl -import model_utils.myUtility as myutil -import polars as pl - -from improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK -import improvelib.utils as frm #NCK - -from PathDSP_preprocess_improve import cal_time, preprocess -from model_params_def import pathdsp_train_params - -file_path = os.path.dirname(os.path.realpath(__file__)) - - -class RMSELoss(tch.nn.Module): - def __init__(self): - super(RMSELoss,self).__init__() - - def forward(self,x,y): - eps = 1e-6 - criterion = tch.nn.MSELoss() - loss = tch.sqrt(criterion(x, y) + eps) - return loss - -def predicting(model, device, data_loader): - """ Method to make predictions/inference. - This is used in *train.py and *infer.py - - Parameters - ---------- - model : pytorch model - Model to evaluate. - device : string - Identifier for hardware that will be used to evaluate model. - data_loader : pytorch data loader. - Object to load data to evaluate. - - Returns - ------- - total_labels: numpy array - Array with ground truth. - total_preds: numpy array - Array with inferred outputs. - """ - model.to(device) - model.eval() - total_preds = tch.Tensor() - total_labels = tch.Tensor() - print("Make prediction for {} samples...".format(len(data_loader.dataset))) - with tch.no_grad(): - for i, (data_x, data_y) in enumerate(data_loader): - data_x, data_y = data_x.to(device), data_y.to(device) - data_y_pred = model(data_x) - # Is this computationally efficient? 
- total_preds = tch.cat((total_preds, data_y_pred.cpu()), 0) # preds to tensor - total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor - return total_labels.numpy().flatten(), total_preds.numpy().flatten() - - -def predict(net, device, test_dl): - """ - Return prediction list - - :param net: model - :param train_dl: train dataloader - :param device: string representing cpu or cuda:0 - """ - # create result lists - prediction_list = list() - true_list = list() - - with tch.no_grad(): - net = net.to(device) # load the network onto the device - net.eval() - for i, (X_test, y_test) in enumerate(test_dl): - X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device - y_test_pred = net(X_test) # test result - # bring data back to cpu in np.array format, and append to result lists - prediction_list.append( y_test_pred.cpu().numpy() ) - true_list.append(y_test.cpu().numpy()) - #print(prediction_list) - - # merge all batches - prediction_list = np.vstack(prediction_list) - prediction_list = np.hstack(prediction_list).tolist() - true_list = np.vstack(true_list) - true_list = np.hstack(true_list).tolist() - # return - return true_list, prediction_list - -def r2_score(y_true, y_pred): - y_mean = np.mean(y_true) - ss_tot = np.sum((y_true - y_mean)**2) - ss_res = np.sum((y_true - y_pred)**2) - r2 = 1 - ss_res / ss_tot - return r2 - -def cal_time(end, start): - '''return time spent''' - # end = datetime.now(), start = datetime.now() - datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' - spend = datetime.strptime(str(end), datetimeFormat) - \ - datetime.strptime(str(start),datetimeFormat) - return spend - - -def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): - """ - Return train and valid performance including loss - - :param net: model - :param train_dl: train dataloader - :param valid_dl: valid dataloader - :param epochs: integer representing EPOCH - :param learning_rate: float representing LEARNING_RATE - :param device: string representing cpu or cuda:0 - :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) - :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) - """ - # setup - criterion = RMSELoss() # setup LOSS function - optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer - net = net.to(device) # load the network onto the device - trainloss_list = [] # metrics: MSE, size equals to EPOCH - validloss_list = [] # metrics: MSE, size equals to EPOCH - validr2_list = [] # metrics: r2, size equals to EPOCH - early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["output_dir"] + "/checkpoint.pt") # initialize the early_stopping - # repeat the training for EPOCH times - start_total = datetime.now() - for epoch in range(epochs): - ## training phase - start = datetime.now() - net.train() - # initial loss - train_epoch_loss = 0.0 # save loss for each epoch, batch by batch - for i, (X_train, y_train) in enumerate(train_dl): - X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device - y_train_pred = net(X_train) # train result - train_loss = criterion(y_train_pred, y_train.float()) # calculate loss - optimizer.zero_grad() # clear gradients - train_loss.backward() # backpropagation - #### add this if you have gradient explosion problem ### - clip_value = 5 - tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) - ########climp gradient within -5 ~ 5 ################### - optimizer.step() # update 
weights - train_epoch_loss += train_loss.item() # adding loss from each batch - # calculate total loss of all batches - avg_train_loss = train_epoch_loss / len(train_dl) - trainloss_list.append( avg_train_loss ) - print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) - ## validation phase - with tch.no_grad(): - net.eval() - valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch - ss_res = 0.0 - ss_tot = 0.0 - for i, (X_valid, y_valid) in enumerate(valid_dl): - X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device - y_valid_pred = net(X_valid) # valid result - valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss - valid_epoch_loss += valid_loss.item() # adding loss from each batch - ss_res += tch.sum((y_valid_pred - y_valid.float())**2) - ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) - - - # calculate total loss of all batches, and append to result list - avg_valid_loss = valid_epoch_loss / len(valid_dl) - validloss_list.append( avg_valid_loss) - valid_r2 = 1 - ss_res / ss_tot - validr2_list.append(valid_r2.cpu().numpy()) - # display print message - #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( - # epoch+1, epochs, train_epoch_loss / len(train_dl), - # valid_epoch_loss / len(valid_dl))) - - # early_stopping needs the validation loss to check if it has decresed, - # and if it has, it will make a checkpoint of the current model - early_stopping(avg_valid_loss, net) - - if early_stopping.early_stop: - print("Early stopping") - break - - print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) - # load the last checkpoint with the best model - net.load_state_dict(tch.load(params["output_dir"] + '/checkpoint.pt')) - - return net, trainloss_list, validloss_list, validr2_list - - -def run(params): - frm.create_outdir(outdir=params["output_dir"]) - modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["output_dir"]) - train_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="train") - val_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="val") - #params = preprocess(params) - - # set parameters - #myutil.set_seed(params["seed_int"]) - ## set device - cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_env_visible is not None: - device = 'cuda:0' - params["CUDA_VISIBLE_DEVICES"] = cuda_env_visible - else: - device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) - #print("Using device: " + device) - learning_rate = params['learning_rate'] - epoch = params['epochs'] - batch_size = params['batch_size'] - val_batch = params['val_batch'] - opt_fn = tch.optim.Adam - - # ------------------------------------------------------ - # [PathDSP] Prepare dataloaders - # ------------------------------------------------------ - print('loadinig data') - train_df = pl.read_csv(params["input_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() - val_df = pl.read_csv(params["input_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() - Xtrain_arr = train_df.iloc[:, 0:-1].values - Xvalid_arr = val_df.iloc[:, 0:-1].values - ytrain_arr = train_df.iloc[:, -1].values - yvalid_arr = val_df.iloc[:, -1].values - Xtrain_arr = np.array(Xtrain_arr).astype('float32') - Xvalid_arr = np.array(Xvalid_arr).astype('float32') - ytrain_arr = 
np.array(ytrain_arr).astype('float32') - yvalid_arr = np.array(yvalid_arr).astype('float32') - # create mini-batch - train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) - valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) - train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - valid_dl = tchud.DataLoader(valid_dataset, batch_size=val_batch, shuffle=False) - - # ------------------------------------------------------ - # [PathDSP] Prepare model - # ------------------------------------------------------ - # initial weight - def init_weights(m): - if type(m) == tch.nn.Linear: - tch.nn.init.kaiming_uniform_(m.weight) - m.bias.data.fill_(0.01) - # load model - n_features = Xtrain_arr.shape[1] - net = mynet.FNN(n_features) - ## specify dropout rate - for module in net.modules(): - if isinstance(module, tch.nn.Dropout): - module.p = params['dropout'] - net.apply(init_weights) - - # ------------------------------------------------------ - # [PathDSP] Training - # ------------------------------------------------------ - print('start training process') - trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn, params) - - loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], - 'train loss':train_loss_list, - 'valid loss': valid_loss_list, - 'valid r2': valid_r2_list}) - loss_df.to_csv(params['output_dir'] + '/Val_Loss_orig.txt', header=True, index=False, sep="\t") - - # make train/valid loss plots - best_model = trained_net - tch.save(best_model.state_dict(), modelpath) - #best_model.eval() - # Compute predictions - #val_true, val_pred = predicting(best_model, device, valid_dl) # (groud truth), (predictions) - val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) - - #comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) - val_true = pd.Series(val_true) - val_pred = pd.Series(val_pred) - val_true_untrans = val_true.apply(lambda x: 10 ** (x) - 0.01) - val_pred_untrans = val_pred.apply(lambda x: 10 ** (x) - 0.01) - # ----------------------------- - # [Req] Save raw predictions in dataframe - # ----------------------------- - # import ipdb; ipdb.set_trace() - frm.store_predictions_df( - y_true=val_true_untrans, - y_pred=val_pred_untrans, - stage="val", - y_col_name=params["y_col_name"], - output_dir=params["output_dir"], - input_dir=params["input_dir"] - ) - - # ----------------------------- - # [Req] Compute performance scores - # ----------------------------- - # import ipdb; ipdb.set_trace() - val_scores = frm.compute_performance_scores( - y_true=val_true_untrans, - y_pred=val_pred_untrans, - stage="val", - metric_type=params["metric_type"], - output_dir=params["output_dir"] - ) - return val_scores - - -def main(args): - cfg = DRPTrainConfig() - params = cfg.initialize_parameters( - file_path, - default_config="PathDSP_params.txt", - additional_definitions=pathdsp_train_params) - # get node name - params["node_name"] = socket.gethostname() - val_scores = run(params) - df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) - df.to_csv(params["output_dir"] + '/params.txt',sep="\t") - - - -if __name__ == "__main__": - start = datetime.now() - main(sys.argv[1:]) - print("[Training finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/pathdsp_infer_improve.py b/pathdsp_infer_improve.py deleted file mode 
120000 index 90b57c2..0000000 --- a/pathdsp_infer_improve.py +++ /dev/null @@ -1 +0,0 @@ -PathDSP_infer_improve.py \ No newline at end of file diff --git a/pathdsp_preprocess_improve.py b/pathdsp_preprocess_improve.py deleted file mode 120000 index 0e1ec22..0000000 --- a/pathdsp_preprocess_improve.py +++ /dev/null @@ -1 +0,0 @@ -PathDSP_preprocess_improve.py \ No newline at end of file diff --git a/pathdsp_train_improve.py b/pathdsp_train_improve.py deleted file mode 120000 index c4d5809..0000000 --- a/pathdsp_train_improve.py +++ /dev/null @@ -1 +0,0 @@ -PathDSP_train_improve.py \ No newline at end of file From 5da39c9902fd9bd236dc111258a03573f46a11c8 Mon Sep 17 00:00:00 2001 From: Andreas Wilke Date: Tue, 3 Dec 2024 09:40:21 -0600 Subject: [PATCH 250/254] Added backups from original files --- PathDSP_infer_improve.py.bak | 85 ++++++ PathDSP_preprocess_improve.py.bak | 438 ++++++++++++++++++++++++++++++ PathDSP_train_improve.py.bak | 329 ++++++++++++++++++++++ 3 files changed, 852 insertions(+) create mode 100755 PathDSP_infer_improve.py.bak create mode 100644 PathDSP_preprocess_improve.py.bak create mode 100644 PathDSP_train_improve.py.bak diff --git a/PathDSP_infer_improve.py.bak b/PathDSP_infer_improve.py.bak new file mode 100755 index 0000000..97fc938 --- /dev/null +++ b/PathDSP_infer_improve.py.bak @@ -0,0 +1,85 @@ +import os +import sys +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil + +from PathDSP_preprocess_improve import mkdir, preprocess +from PathDSP_train_improve import ( + predicting, + cal_time, +) +from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK +import improvelib.utils as frm #NCK +from model_params_def import pathdsp_infer_params + +file_path = os.path.dirname(os.path.realpath(__file__)) + + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + #params = preprocess(params) + test_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="test") + test_df = pl.read_csv(params["input_data_dir"] + "/" + test_data_fname, separator = "\t").to_pandas() + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = mynet.FNN(Xtest_arr.shape[1]) + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["input_model_dir"]) + trained_net.load_state_dict(tch.load(modelpath)) + trained_net.eval() + #myutil.set_seed(params["seed_int"]) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['infer_batch'], shuffle=False) + start = datetime.now() + test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + + test_true = pd.Series(test_true) + test_pred = pd.Series(test_pred) + test_true_untrans = test_true.apply(lambda x: 10 ** (x) - 0.01) + test_pred_untrans = test_pred.apply(lambda x: 10 ** (x) - 0.01) + + frm.store_predictions_df( + y_true=test_true_untrans, + 
y_pred=test_pred_untrans, + stage="test", + y_col_name=params["y_col_name"], + output_dir=params["output_dir"], + input_dir=params["input_data_dir"] + ) + if params["calc_infer_scores"]: + test_scores = frm.compute_performance_scores( + y_true=test_true_untrans, + y_pred=test_pred_untrans, + stage="test", + metric_type=params["metric_type"], + output_dir=params["output_dir"] + ) + + print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + return True + +def main(args): + cfg = DRPInferConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_infer_params) + if_ran = run(params) + print("\nFinished inference of PathDSP model.") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/PathDSP_preprocess_improve.py.bak b/PathDSP_preprocess_improve.py.bak new file mode 100644 index 0000000..0cdc896 --- /dev/null +++ b/PathDSP_preprocess_improve.py.bak @@ -0,0 +1,438 @@ +import sys +import os +import polars as pl +import numpy as np +import pandas as pd +import copy +from functools import reduce +from pathlib import Path +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +import RWR as rwr +import NetPEA as pea +import gseapy as gp +import sklearn.model_selection as skms +from sklearn.preprocessing import StandardScaler +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig #NCK +from improvelib.utils import str2bool #NCK +import improvelib.utils as frm #NCK +import improvelib.applications.drug_response_prediction.drug_utils as drugs #NCK +import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK +import improvelib.applications.drug_response_prediction.drp_utils as drp #NCK + +from model_params_def import pathdsp_preprocess_params + +file_path = Path(__file__).resolve().parent + +req_preprocess_args = [ll["name"] for ll in pathdsp_preprocess_params] + +def mkdir(directory): + directories = directory.split("/") + folder = "" + for d in directories: + folder += d + "/" + if not os.path.exists(folder): + print("creating folder: %s" % folder) + os.mkdir(folder) + + +def preprocess(params): + for i in [ + "drug_bits_file", + "dgnet_file", + "mutnet_file", + "cnvnet_file", + "exp_file", + ]: + params[i] = params["output_dir"] + "/" + params[i] + return params + + +# set timer +def cal_time(end, start): + """return time spent""" + # end = datetime.now(), start = datetime.now() + datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" + spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( + str(start), datetimeFormat + ) + return spend + +def response_out(params, split_file): + response_df = drp.DrugResponseLoader(params, split_file=split_file, verbose=True) + return response_df.dfs["response.tsv"] + + +def smile2bits(params): + start = datetime.now() + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + smile_df = drugs.DrugsLoader(params) + smile_df = smile_df.dfs['drug_SMILES.tsv'] + smile_df = smile_df.reset_index() + smile_df.columns = ["drug", "smile"] + smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") + smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] + bit_int = params["bit_int"] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = 
idx + smile = row["smile"] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) + # drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append(tuple([drug] + mbit)) + if len(mbit) == bit_int: + n_drug += 1 + print("total {:} drugs with bits".format(n_drug)) + # convert dict to dataframe + colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + # drug_mbit_df.index.name = 'drug' + print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) + # save to file + drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") + print("[Finished in {:}]".format(cal_time(datetime.now(), start))) + + +def times_expression(rwr, exp): + """ + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + """ + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) + + if len(cell_list) == 0: + print("ERROR! no overlapping cell lines") + sys.exit(1) + if len(gene_list) == 0: + print("ERROR! no overlapping genes") + sys.exit(1) + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] + # concat with other gene + out_gene_list = list(set(rwr.columns) - set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params["input_supp_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" + pathway_path = ( + params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + log_transform = False + permutation_int = params["permutation_int"] + seed_int = params["seed_int"] + cpu_int = params["cpu_int"] + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + omics_data = omics.OmicsLoader(params) + if dtype == "DGnet": + drug_info = pd.read_csv(params["input_dir"] + "/x_data/drug_info.tsv", sep="\t") + drug_info["NAME"] = drug_info["NAME"].str.upper() + target_info = pd.read_csv( + params["input_supp_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" + ) + target_info = target_info.rename(columns={"drug": "NAME"}) + combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( + subset=["gene"] + ) + combined_df = combined_df.loc[ + combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), + ] + restart_path = params["output_dir"] + "/drug_target.txt" + combined_df.iloc[:, -2:].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["dgnet_file"] + elif dtype == "MUTnet": + mutation_data = omics_data.dfs['cancer_mutation_count.tsv'] + #mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] > 0 + ] + mutation_data = mutation_data.loc[ + 
mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["output_dir"] + "/mutation_data.txt" + mutation_data.iloc[:, 0:2].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["mutnet_file"] + else: + cnv_data = omics_data.dfs['cancer_discretized_copy_number.tsv'] + #cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] != 0 + ] + cnv_data = cnv_data.loc[ + cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["output_dir"] + "/cnv_data.txt" + cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) + outpath = params["cnvnet_file"] + # perform Random Walk + print(datetime.now(), "performing random walk with restart") + rwr_df = rwr.RWR( + ppi_path, + restart_path, + restartProbFloat=0.5, + convergenceFloat=0.00001, + normalize="l1", + weighted=True, + ).get_prob() + # multiply with gene expression + if multiply_expression: + print( + datetime.now(), + "multiplying gene expression with random walk probability for genes were expressed", + ) + # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + # exp_df = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + exp_df = omics_data.dfs['cancer_gene_expression.tsv'] + exp_df = exp_df.set_index(params['canc_col_name']) + rwr_df = times_expression(rwr_df, exp_df) + # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), "performing network-based pathway enrichment") + cell_pathway_df = pea.NetPEA( + rwr_df, + pathway_path, + log_transform=log_transform, + permutation=permutation_int, + seed=seed_int, + n_cpu=cpu_int, + out_path=outpath, + ) + print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) + + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) + DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) + DGnet = ( + DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) + ) + CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) + CNVnet = ( + CNVnet.add_suffix("_cnvnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) + MUTnet = ( + MUTnet.add_suffix("_mutnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) + EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + # Extract relevant IDs + common_drug_ids = reduce( + np.intersect1d, + (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), + ) + common_sample_ids = reduce( + np.intersect1d, + ( + CNVnet["sample_id"], + MUTnet["sample_id"], + EXP["sample_id"], + response_df["sample_id"], + ), + ) + response_df = response_df.loc[ + 
(response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + drug_mbit_df = ( + drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + DGnet = ( + DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + CNVnet = ( + CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + MUTnet = ( + MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + EXP = ( + EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + for i in ["train", "test", "val"]: + response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) + response_df = response_df.dfs['response.tsv'] + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + + comb_data_mtx = pd.DataFrame( + { + "drug_id": response_df["drug_id"].values, + "sample_id": response_df["sample_id"].values, + } + ) + comb_data_mtx = ( + comb_data_mtx.set_index(["drug_id", "sample_id"]) + .join(drug_data, on="drug_id") + .join(sample_data, on="sample_id") + ) + ss = StandardScaler() + comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]] = ss.fit_transform(comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]]) + ## add 0.01 to avoid possible inf values + comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + comb_data_mtx = comb_data_mtx.dropna() + + comb_data_mtx_to_save = comb_data_mtx['response'] + comb_data_mtx_to_save = comb_data_mtx_to_save.reset_index() + comb_data_mtx_to_save.rename(columns={'drug_id': 'improve_chem_id', 'sample_id': 'improve_sample_id'}, inplace=True) + #comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) + rsp = drp.DrugResponseLoader(params, + split_file=params[i+"_split_file"], + verbose=False).dfs["response.tsv"] + ydata = rsp.merge(comb_data_mtx_to_save, on=['improve_chem_id', 'improve_sample_id'], how='right') + print(comb_data_mtx_to_save) + print("YDATA") + print(ydata) + frm.save_stage_ydf(ydf=ydata, stage=i, output_dir=params["output_dir"]) + pl.from_pandas(comb_data_mtx).write_csv( + params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], stage=i) +, separator="\t", has_header=True + ) + + +def run_ssgsea(params): + # expMat = improve_utils.load_gene_expression_data(sep='\t') + # expMat = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + omics_data = omics.OmicsLoader(params) + expMat = omics_data.dfs['cancer_gene_expression.tsv'] + expMat = expMat.set_index(params['canc_col_name']) + + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, 
ignore_index=True) + expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = ( + params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + gmt = pathway_path + tmp_str = params["output_dir"] + "/tmpdir_ssgsea/" + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea( + data=gct, # gct: a matrix of gene by sample + gene_sets=gmt, # gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, # 1000 + no_plot=True, + processes=params["cpu_int"], + # min_size=0, + format="png", + ) + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") + + f = open(tmp_str + "ssGSEA.txt", "r") + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + params = preprocess(params) + print("convert drug to bits.") + smile2bits(params) + print("compute DGnet.") + run_netpea(params, dtype="DGnet", multiply_expression=False) + print("compute MUTnet.") + run_netpea(params, dtype="MUTnet", multiply_expression=True) + print("compute CNVnet.") + run_netpea(params, dtype="CNVnet", multiply_expression=True) + print("compute EXP.") + run_ssgsea(params) + print("prepare final input file.") + prep_input(params) + + +def main(args): + cfg = DRPPreprocessConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_preprocess_params) + run(params) + + +if __name__ == "__main__": + start = datetime.now() + main(sys.argv[1:]) + print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py.bak b/PathDSP_train_improve.py.bak new file mode 100644 index 0000000..b3eb9a6 --- /dev/null +++ b/PathDSP_train_improve.py.bak @@ -0,0 +1,329 @@ +import os +import sys +import numpy as np +import pandas as pd +from datetime import datetime +import socket +import torch as tch +import torch.utils.data as tchud +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil +import polars as pl + +from improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK +import improvelib.utils as frm #NCK + +from PathDSP_preprocess_improve import cal_time, preprocess +from model_params_def import pathdsp_train_params + +file_path = os.path.dirname(os.path.realpath(__file__)) + + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + +def predicting(model, device, data_loader): + """ Method to make predictions/inference. + This is used in *train.py and *infer.py + + Parameters + ---------- + model : pytorch model + Model to evaluate. + device : string + Identifier for hardware that will be used to evaluate model. + data_loader : pytorch data loader. + Object to load data to evaluate. 
+ + Returns + ------- + total_labels: numpy array + Array with ground truth. + total_preds: numpy array + Array with inferred outputs. + """ + model.to(device) + model.eval() + total_preds = tch.Tensor() + total_labels = tch.Tensor() + print("Make prediction for {} samples...".format(len(data_loader.dataset))) + with tch.no_grad(): + for i, (data_x, data_y) in enumerate(data_loader): + data_x, data_y = data_x.to(device), data_y.to(device) + data_y_pred = model(data_x) + # Is this computationally efficient? + total_preds = tch.cat((total_preds, data_y_pred.cpu()), 0) # preds to tensor + total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor + return total_labels.numpy().flatten(), total_preds.numpy().flatten() + + +def predict(net, device, test_dl): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + true_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + true_list.append(y_test.cpu().numpy()) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + true_list = np.vstack(true_list) + true_list = np.hstack(true_list).tolist() + # return + return true_list, prediction_list + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["output_dir"] + "/checkpoint.pt") # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + 
X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # train result + train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load(params["output_dir"] + '/checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["output_dir"]) + train_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="train") + val_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="val") + #params = preprocess(params) + + # set parameters + #myutil.set_seed(params["seed_int"]) + ## set device + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + params["CUDA_VISIBLE_DEVICES"] = cuda_env_visible + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + #print("Using device: " + device) + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + val_batch = params['val_batch'] + opt_fn = tch.optim.Adam + + # ------------------------------------------------------ + # [PathDSP] Prepare dataloaders + # ------------------------------------------------------ + print('loadinig 
data') + train_df = pl.read_csv(params["input_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() + val_df = pl.read_csv(params["input_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=val_batch, shuffle=False) + + # ------------------------------------------------------ + # [PathDSP] Prepare model + # ------------------------------------------------------ + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + ## specify dropout rate + for module in net.modules(): + if isinstance(module, tch.nn.Dropout): + module.p = params['dropout'] + net.apply(init_weights) + + # ------------------------------------------------------ + # [PathDSP] Training + # ------------------------------------------------------ + print('start training process') + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn, params) + + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + loss_df.to_csv(params['output_dir'] + '/Val_Loss_orig.txt', header=True, index=False, sep="\t") + + # make train/valid loss plots + best_model = trained_net + tch.save(best_model.state_dict(), modelpath) + #best_model.eval() + # Compute predictions + #val_true, val_pred = predicting(best_model, device, valid_dl) # (groud truth), (predictions) + val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) + + #comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + val_true = pd.Series(val_true) + val_pred = pd.Series(val_pred) + val_true_untrans = val_true.apply(lambda x: 10 ** (x) - 0.01) + val_pred_untrans = val_pred.apply(lambda x: 10 ** (x) - 0.01) + # ----------------------------- + # [Req] Save raw predictions in dataframe + # ----------------------------- + # import ipdb; ipdb.set_trace() + frm.store_predictions_df( + y_true=val_true_untrans, + y_pred=val_pred_untrans, + stage="val", + y_col_name=params["y_col_name"], + output_dir=params["output_dir"], + input_dir=params["input_dir"] + ) + + # ----------------------------- + # [Req] Compute performance scores + # ----------------------------- + # import ipdb; ipdb.set_trace() + val_scores = frm.compute_performance_scores( + y_true=val_true_untrans, + y_pred=val_pred_untrans, + stage="val", + metric_type=params["metric_type"], + output_dir=params["output_dir"] + ) + return val_scores + + +def main(args): + cfg = DRPTrainConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + 
additional_definitions=pathdsp_train_params) + # get node name + params["node_name"] = socket.gethostname() + val_scores = run(params) + df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) + df.to_csv(params["output_dir"] + '/params.txt',sep="\t") + + + +if __name__ == "__main__": + start = datetime.now() + main(sys.argv[1:]) + print("[Training finished in {:}]".format(cal_time(datetime.now(), start))) From c8279d8a22327f9f2a494d114967c38360d54dea Mon Sep 17 00:00:00 2001 From: Andreas Wilke Date: Tue, 3 Dec 2024 09:42:26 -0600 Subject: [PATCH 251/254] Renamed files, back to original name --- PathDSP_infer_improve.py.bak => PathDSP_infer_improve.py | 0 ...DSP_preprocess_improve.py.bak => PathDSP_preprocess_improve.py | 0 PathDSP_train_improve.py.bak => PathDSP_train_improve.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename PathDSP_infer_improve.py.bak => PathDSP_infer_improve.py (100%) rename PathDSP_preprocess_improve.py.bak => PathDSP_preprocess_improve.py (100%) rename PathDSP_train_improve.py.bak => PathDSP_train_improve.py (100%) diff --git a/PathDSP_infer_improve.py.bak b/PathDSP_infer_improve.py similarity index 100% rename from PathDSP_infer_improve.py.bak rename to PathDSP_infer_improve.py diff --git a/PathDSP_preprocess_improve.py.bak b/PathDSP_preprocess_improve.py similarity index 100% rename from PathDSP_preprocess_improve.py.bak rename to PathDSP_preprocess_improve.py diff --git a/PathDSP_train_improve.py.bak b/PathDSP_train_improve.py similarity index 100% rename from PathDSP_train_improve.py.bak rename to PathDSP_train_improve.py From 5729886075bc0d74cabe5b3e7b69f911a76bd7fa Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 9 Dec 2024 12:00:57 -0500 Subject: [PATCH 252/254] added exp_id --- PathDSP_params.txt | 2 +- PathDSP_preprocess_improve.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PathDSP_params.txt b/PathDSP_params.txt index 32bc09b..91f33f6 100644 --- a/PathDSP_params.txt +++ b/PathDSP_params.txt @@ -1,6 +1,6 @@ [Preprocess] data_format = .txt -input_supp_data_dir = /nfs/ml_lab/projects/improve/data/experiments/src/PathDSP/author_data +input_supp_data_dir = ./author_data train_split_file = CCLE_split_0_train.txt val_split_file = CCLE_split_0_val.txt test_split_file = CCLE_split_0_test.txt diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py index 0cdc896..e5ce173 100644 --- a/PathDSP_preprocess_improve.py +++ b/PathDSP_preprocess_improve.py @@ -305,6 +305,7 @@ def prep_input(params): for i in ["train", "test", "val"]: response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) response_df = response_df.dfs['response.tsv'] + response_df['exp_id'] = list(range(0,response_df.shape[0])) response_df = response_df.rename( columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} ) @@ -318,10 +319,11 @@ def prep_input(params): { "drug_id": response_df["drug_id"].values, "sample_id": response_df["sample_id"].values, + "exp_id": response_df["exp_id"].values } ) comb_data_mtx = ( - comb_data_mtx.set_index(["drug_id", "sample_id"]) + comb_data_mtx.set_index(["drug_id", "sample_id", "exp_id"]) .join(drug_data, on="drug_id") .join(sample_data, on="sample_id") ) @@ -338,7 +340,8 @@ def prep_input(params): rsp = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=False).dfs["response.tsv"] - ydata = rsp.merge(comb_data_mtx_to_save, on=['improve_chem_id', 'improve_sample_id'], 
how='right') + rsp['exp_id'] = list(range(0,rsp.shape[0])) + ydata = rsp.merge(comb_data_mtx_to_save, on=['exp_id'], how='right') print(comb_data_mtx_to_save) print("YDATA") print(ydata) From 8da37bdd7ea4ae9e1aac9c538f721d94c3bba280 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Jun 2025 16:07:22 -0400 Subject: [PATCH 253/254] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2ff089b..6bfbad5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # PathDSP -This repository demonstrates how to use the [IMPROVE library v0.1.0-2024-09-27](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). +This repository demonstrates how to use the [IMPROVE library v0.1.0](https://jdacs4c-improve.github.io/docs/v0.1.0/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). -This version, tagged as `v0.1.0-2024-09-27`, introduces a new API which is designed to encourage broader adoption of IMPROVE and its curated models by the research community. +This version, tagged as `v0.1.0`, introduces a new API which is designed to encourage broader adoption of IMPROVE and its curated models by the research community. ## Dependencies @@ -14,7 +14,7 @@ ML framework: + [Torch](https://pytorch.org/) -- deep learning framework for building the prediction model IMPROVE dependencies: -+ [IMPROVE v0.1.0-2024-09-27](https://jdacs4c-improve.github.io/docs/v0.1.0-alpha/) ++ [IMPROVE v0.1.0](https://jdacs4c-improve.github.io/docs/v0.1.0) @@ -70,7 +70,7 @@ csa_data/raw_data/ ``` git clone https://github.com/JDACS4C-IMPROVE/PathDSP cd PathDSP -git checkout v0.1.0-2024-09-27 +git checkout v0.1.0 ``` @@ -89,8 +89,8 @@ source setup_improve.sh This will: 1. Download cross-study analysis (CSA) benchmark data into `./csa_data/`. -2. Clone IMPROVE repo (checkout tag `v0.1.0-2024-09-27`) outside the PathDSP model repo -3. Set up env variables: `IMPROVE_DATA_DIR` (to `./csa_data/`) and `PYTHONPATH` (adds IMPROVE repo). +2. Clone IMPROVE repo (checkout tag `v0.1.0`) outside the PathDSP model repo +3. Set up `PYTHONPATH` (adds IMPROVE repo). 4. Download the model-specific supplemental data (aka author data) and set up the env variable `AUTHOR_DATA_DIR`. 
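For orientation, a minimal end-to-end sketch of how these pieces might be invoked after the setup step — illustrative only, assuming the defaults in `PathDSP_params.txt` are used and that improvelib resolves input/output locations from that config; none of these exact invocations are mandated by the patches above:

```bash
# Hypothetical workflow sketch (assumes defaults from PathDSP_params.txt; paths/flags may differ).
source ./setup_improve.sh               # fetch CSA + author data, add the IMPROVE repo to PYTHONPATH
python PathDSP_preprocess_improve.py    # drug bits, DGnet/MUTnet/CNVnet, ssGSEA, train/val/test ML data files
python PathDSP_train_improve.py         # train the FNN, save checkpoint plus val scores/predictions
python PathDSP_infer_improve.py         # inference script (exists per the rename patch above); usage assumed analogous
```
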
From b524636aba98f09cd0fd4c5577711fc16d6dd3e2 Mon Sep 17 00:00:00 2001 From: nkoussa <156325369+nkoussa@users.noreply.github.com> Date: Mon, 16 Jun 2025 16:07:49 -0400 Subject: [PATCH 254/254] Update setup_improve.sh --- setup_improve.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup_improve.sh b/setup_improve.sh index d566948..8fad5bc 100644 --- a/setup_improve.sh +++ b/setup_improve.sh @@ -39,7 +39,7 @@ export AUTHOR_DATA_DIR="./$author_dir/" cd ../ improve_lib_path=$PWD/IMPROVE # improve_branch="develop" -improve_branch="v0.1.0-2024-09-27" +improve_branch="v0.1.0" if [ -d $improve_lib_path ]; then echo "IMPROVE repo exists in ${improve_lib_path}" else @@ -55,4 +55,4 @@ export PYTHONPATH=$PYTHONPATH:$improve_lib_path echo echo "IMPROVE_DATA_DIR: $IMPROVE_DATA_DIR" echo "AUTHOR_DATA_DIR: $AUTHOR_DATA_DIR" -echo "PYTHONPATH: $PYTHONPATH" \ No newline at end of file +echo "PYTHONPATH: $PYTHONPATH"
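
To close the loop, a small sanity check along these lines can confirm what the script sets up — a sketch under the assumption that it is sourced from the model repo root and that the pinned IMPROVE checkout exposes an importable `improvelib` package:

```bash
# Hypothetical post-setup check (not part of the repository).
source ./setup_improve.sh
python -c "import improvelib; print('improvelib importable via PYTHONPATH')"
ls "$AUTHOR_DATA_DIR"    # supplemental author data fetched by the setup script
ls ./csa_data            # CSA benchmark data downloaded during setup
```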