diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5d7b436 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +.ipynb_checkpoints/ +PathDSP/__pycache__/ +__pycache__/ +EDA.ipynib +ml_data/ +dh_hpo_improve/ +dh_hpo_logs/ + +## gpu utilization +PathDSP_gpu_util_model.txt +gpu_log_strip.txt +gpu_logs.txt +out_models/ +train_gpu_util.sh + +## image +PathDSP.sif + +## log files +dh_hpo_scale_test.* diff --git a/NetPEA.py b/NetPEA.py new file mode 100644 index 0000000..d7006ef --- /dev/null +++ b/NetPEA.py @@ -0,0 +1,212 @@ +""" +Implementation of NetPEA: pathway enrichment with networks (Liu, 2017) + +Ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5664096/ +zscore >1.65, equivalent to p-value=0.05 +""" + +import os +import sys +import argparse +import numpy as np +import pandas as pd +import multiprocessing as mp +import scipy.stats as scistat +from datetime import datetime + +class NetPEA: + """ + :param rwrDf: dataframe with cell by PPI genes + :param pathwayGMT: pathway database in gmt format + :param permutation: + :param seed: + :param threshold: + """ + def __init__(self, rwrPath, pathwayGMT, log_transform=False, permutation=1000, seed=42, n_cpu=5, out_path='./'): + # load data + self.rwr_path = rwrPath #pd.read_csv(rwrDf, header=0, index_col=0, sep="\t") + self.pathway_gmt = pathwayGMT + self.permutation = int(permutation) + self.seed = int(seed) + self.out_path = out_path + + # settings + np.random.seed(self.seed) + self.n_cpu = int(n_cpu) + if len(self.rwr_path) < self.n_cpu: + self.n_cpu = len(self.rwr_path) + + # prepare pathway genes to save time + print('{:}: collect pathway genes'.format(datetime.now())) + pathway_geneList_dict = self._get_pathway_genes(pathwayGMT) # {pathway: geneList} + # obtain shared genes for calculating score of pathway genes + self.rwrDf = self.rwr_path#pd.read_csv(rwrPath, header=0, index_col=0, sep="\t") + if log_transform == True: + print('log transform input data') + self.rwrDf = np.log(self.rwrDf) + pathway_shareGeneList_dict = self._find_overlaps(self.rwrDf, pathway_geneList_dict) # {pathway: shareGeneList} + # generate random gene list for calculating score of random pathway genes + pathway_randomGeneListList_dict = {} + bg_gene_list = self.rwrDf.columns.tolist() # ppi genes + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + pathway_randomGeneListList_dict.update({pathway:[]}) + for p in range(self.permutation): + gene_list = np.random.choice(bg_gene_list, len(shareGeneList)).tolist() + pathway_randomGeneListList_dict[pathway].append(gene_list) + self.pathwayDictList = [pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict] + + # call function + self.netpea_parallel(self.rwrDf, self.pathwayDictList, self.n_cpu, self.out_path) + + def netpea_parallel(self, rwrDf, pathwayDictList, n_cpu, out_path): + # split dataframe + n_partitions = int(n_cpu) + split_list = np.array_split(rwrDf, n_partitions) + # parallel computing + pool = mp.Pool(int(n_cpu)) + df_list = pool.starmap(self.netpea, [(df, pathwayDictList) for df in split_list]) + pool.close() + pool.join() + print('{:}: comple {:} dfs'.format(datetime.now(), len(df_list))) + print(df_list[0]) + + # merge result of all cells and save to file + print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(df_list, axis=0) + zscore_fname = self.out_path + all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + #print(all_cell_zscore_df) + + + def netpea(self, 
rwrDf, pathwayDictList): + """return dataframe with cell by pathway""" + pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict = pathwayDictList + # convert to dataframe with headers=[pathway, #pathway genes, overlap genes] + pathway_df = self._merge_pathway_dict(pathway_geneList_dict, pathway_shareGeneList_dict) + # collect score of random gene list + print('{:}: collect score of random gene list'.format(datetime.now())) + cell_pathway_bgScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_bgScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, randomGeneListList in pathway_randomGeneListList_dict.items(): + bgScoreList = [rwr_df.loc[randomGeneList].mean() for randomGeneList in randomGeneListList] + cell_pathway_bgScoreList_dict[cell].update({pathway:bgScoreList}) + + # collect score of share gene list + print('{:}: collect score of share gene list'.format(datetime.now())) + cell_pathway_ScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_ScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + score = rwr_df.loc[shareGeneList].mean() + cell_pathway_ScoreList_dict[cell].update({pathway:score}) + # ztest to determin significance + print('{:}: ztest to determin significance'.format(datetime.now())) + zscore_dfs = [] + cell_pathway_zscore_dict = {} # collect zscore for each pathway + cell_pathway_ztest_dict = {} # collect zscore and pvalue for each pathway + for cell in rwrDf.index: + cell_pathway_zscore_dict.update({cell:{}}) + cell_pathway_ztest_dict.update({cell:{}}) + pathway_score_dict = cell_pathway_ScoreList_dict[cell] + pathway_bgList_dict = cell_pathway_bgScoreList_dict[cell] + for pathway in pathway_geneList_dict.keys(): + score = pathway_score_dict[pathway] + bgList = pathway_bgList_dict[pathway] + [zscore, pvalue] = self._cal_zscore(score, bgList) + cell_pathway_ztest_dict[cell].update({pathway: [zscore, pvalue]}) + cell_pathway_zscore_dict[cell].update({pathway:zscore}) + # save per-cell zscore + cell_zscore_df = pd.DataFrame(cell_pathway_zscore_dict[cell], index=[cell]) + zscore_dfs.append(cell_zscore_df) + # save per-cell ztest results + cell_bgtest_df = pd.DataFrame(cell_pathway_ztest_dict[cell], index=['zscore', 'pvalue']).T + cell_bgtest_df.index.name = 'pathway' + cell_bgtest_df = cell_bgtest_df.join(pathway_df) + #percell_fname = self.out_path + '.' 
+ cell + '.NetPEA.background_result.txt' + #cell_bgtest_df.to_csv(percell_fname, header=True, index=True, sep="\t") + # merge result of all cells and save to file + #print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(zscore_dfs, axis=0) + #zscore_fname = self.out_path + '.NetPEA.zscore.txt' + #all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + + # clear space + pathwayDictList = [] + return all_cell_zscore_df + + def _merge_pathway_dict(self, pathway_geneList_dict, pathway_shareGeneList_dict): + """return dataframe with headers = [pathway, #pathway genes, overlap genes]""" + pathway_lenG_dict = {pathway: len(geneList) for pathway, geneList in pathway_geneList_dict.items()} + pathway_strG_dict = {pathway: ",".join(geneList) for pathway, geneList in pathway_shareGeneList_dict.items()} + df1 = pd.DataFrame(pathway_lenG_dict.items(), columns=['pathway', '#pathway genes']) + df2 = pd.DataFrame(pathway_strG_dict.items(), columns=['pathway', 'overlap genes']) + return df1.set_index('pathway').join(df2.set_index('pathway')) + + def _find_overlaps(self, rwrDf, pathway_dict): + """return diction with pathway:geneList""" + # create result dictionary + result_dict = {} #pathway:sharedGeneList + # get ppiGenes + ppi_gene_list = rwrDf.columns.tolist() + # find overlaps + for pathway, geneList in pathway_dict.items(): + shareGene_list = sorted(list(set(geneList) & set(ppi_gene_list))) + result_dict.update({pathway:shareGene_list}) + return result_dict + + def _cal_zscore(self, score, scoreList): + """return zscore and pvalue by lookup table""" + if np.std(scoreList) != 0: + zscore = (score - np.mean(scoreList) ) / np.std(scoreList) + pvalue = scistat.norm.sf(abs(zscore)) # not pdf + #print('score={:}, scoreList={:}, zscore={:}, pvalue={:}'.format( + # score, scoreList[:10], zscore, pvalue)) + else: + zscore, pvalue = np.nan, np.nan + return [zscore, pvalue] + + def _cal_similarity_score(self, rwrDf, geneList): + """return similarity score by taking average of rwr for given geneList""" + return rwrDf.loc[geneList].mean() + + def _get_pathway_genes(self, gmt): + """ + Return pathwayStr_geneList_dict + + :param fin: file name to pathway in gmt format + :return pathway_dict: dictionary of pathway as key, genelist as values + """ + pathwayStr_geneList_dict = {} + with open(gmt, 'r') as f: + for line in f: + # extract fields + line = line.strip('\n').split('\t') + pathway_str = line[0] + gene_list = line[2:] + # update to dict + pathwayStr_geneList_dict.update({pathway_str:gene_list}) + return pathwayStr_geneList_dict + + def _df2dict(self, df): + """return 1 by N dataframe to dictionary of N keys""" + return df.to_dict('records')[0] # keys are column names = gene nams + + +if __name__ == "__main__": + # timer + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + start_time = datetime.now() + rwr_df = 'test.txt' #'/repo4/ytang4/PHD/db/GDSC/processed/GDSC.MUTCNV.STRING.RWR.txt' + pathway_gmt = '/repo4/ytang4/PHD/db/MSigdb/c2.cp.pid.v7.1.symbols.gmt' + # initiate + cell_pathway_df = NetPEA(rwr_df, pathway_gmt, permutation=3, seed=42, n_cpu=5, out_path='./test_netpea/GDSC') + spend = datetime.strptime(str(datetime.now()), datetimeFormat) - datetime.strptime(str(start_time),datetimeFormat) + print( '[Finished in {:}]'.format(spend) ) + diff --git a/PathDSP.def b/PathDSP.def new file mode 100644 index 0000000..b754001 --- /dev/null +++ b/PathDSP.def @@ -0,0 +1,59 @@ +Bootstrap: docker +From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + 
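+# Illustrative usage sketch (host data path and command are assumptions; the
+# image name matches PathDSP.sif in .gitignore). A container from this recipe
+# is typically built and entered along these lines:
+#   singularity build PathDSP.sif PathDSP.def
+#   singularity exec --nv -B /path/to/candle_data_dir:/candle_data_dir PathDSP.sif <command>
+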
+%labels + MANTAINER Yuanhang Liu + +%setup + cp ./src/Singularity_gpu_fix.sh $SINGULARITY_ROOTFS + # add local url of this repository for testing + + +%environment + export PATH=$PATH:/usr/local/PathDSP + export IMPROVE_MODEL_DIR=/usr/local/PathDSP + export CANDLE_DATA_DIR=/candle_data_dir + export AUTHOR_DATA_DIR=/candle_data_dir + export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/:/usr/local/PathDSP/PathDSP/ + +%post + apt-get update -y + apt-get install wget -y + apt-get install -y gnupg + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC + + apt-get install build-essential -y + apt-get install git -y + apt-get install vim -y + apt-get install subversion -y + + # install gpu fix and clean up + cd / + chmod +x Singularity_gpu_fix.sh + ./Singularity_gpu_fix.sh + rm Singularity_gpu_fix.sh + + # these three need to be compiled and linked to the cuda libs. + # at the moment, what works for me is to build these in a + # singularity shell in a sandbox with the --nv flag to singularity set. + + + # create default internal candle_data_dir, map external candle_data_dir here + mkdir /candle_data_dir + + #install python modules and model prerequites + cd /usr/local + git clone https://github.com/JDACS4C-IMPROVE/IMPROVE.git + git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git + cd PathDSP + + # download conda + + /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ + #/opt/conda/bin/conda activate PathDSP_env + /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop + + cp *.sh /usr/local/bin + chmod a+x /usr/local/bin/*.sh + chmod a+x /usr/local/PathDSP/*.sh \ No newline at end of file diff --git a/PathDSP_env_conda.yml b/PathDSP_env_conda.yml new file mode 100644 index 0000000..0f91f17 --- /dev/null +++ b/PathDSP_env_conda.yml @@ -0,0 +1,218 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - abseil-cpp=20211102.0 + - arrow-cpp=11.0.0 + - asttokens=2.2.1 + - aws-c-common=0.6.8 + - aws-c-event-stream=0.1.6 + - aws-checksums=0.1.11 + - aws-sdk-cpp=1.8.185 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - c-ares=1.19.0 + - ca-certificates=2023.05.30 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - comm=0.1.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - executing=1.2.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - gflags=2.2.2 + - giflib=5.2.1 + - glib=2.69.1 + - glog=0.5.0 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - grpc-cpp=1.48.2 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - importlib-metadata=6.8.0 + - importlib_metadata=6.8.0 + - intel-openmp=2023.1.0 + - ipykernel=6.25.1 + - ipython=8.14.0 + - jbig=2.1 + - jedi=0.19.0 + - jinja2=3.1.2 + - jpeg=9e + - jupyter_client=8.3.0 + - jupyter_core=4.12.0 + - kiwisolver=1.4.4 + - krb5=1.20.1 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + 
- libbrotlienc=1.0.9 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcurl=8.1.1 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libedit=3.1.20221030 + - libev=4.33 + - libevent=2.1.12 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnghttp2=1.52.0 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libprotobuf=3.20.3 + - libsodium=1.0.18 + - libsqlite=3.42.0 + - libssh2=1.10.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libthrift=0.15.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - matplotlib-inline=0.1.6 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - orc=1.7.4 + - packaging=23.1 + - pandas=1.5.3 + - parso=0.8.3 + - pcre=8.45 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - polars=0.18.15 + - prompt-toolkit=3.0.39 + - prompt_toolkit=3.0.39 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pyarrow=11.0.0 + - pycairo=1.24.0 + - pycparser=2.21 + - pygments=2.16.1 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyzmq=25.1.0 + - rdkit=2022.03.2 + - re2=2022.04.01 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - seaborn=0.12.2 + - setuptools=68.0.0 + - six=1.16.0 + - snappy=1.1.9 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - stack_data=0.6.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - traitlets=5.9.0 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - utf8proc=2.6.1 + - wcwidth=0.2.6 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zeromq=4.3.4 + - zipp=3.16.2 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - patsy==0.5.3 + - protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py new file mode 100755 index 0000000..97fc938 --- /dev/null +++ b/PathDSP_infer_improve.py @@ -0,0 +1,85 @@ +import os +import sys +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil + +from PathDSP_preprocess_improve import mkdir, preprocess +from PathDSP_train_improve import ( + predicting, + cal_time, +) +from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK +import improvelib.utils as frm #NCK +from model_params_def import pathdsp_infer_params + +file_path = 
os.path.dirname(os.path.realpath(__file__)) + + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + #params = preprocess(params) + test_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="test") + test_df = pl.read_csv(params["input_data_dir"] + "/" + test_data_fname, separator = "\t").to_pandas() + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = mynet.FNN(Xtest_arr.shape[1]) + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["input_model_dir"]) + trained_net.load_state_dict(tch.load(modelpath)) + trained_net.eval() + #myutil.set_seed(params["seed_int"]) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['infer_batch'], shuffle=False) + start = datetime.now() + test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + + test_true = pd.Series(test_true) + test_pred = pd.Series(test_pred) + test_true_untrans = test_true.apply(lambda x: 10 ** (x) - 0.01) + test_pred_untrans = test_pred.apply(lambda x: 10 ** (x) - 0.01) + + frm.store_predictions_df( + y_true=test_true_untrans, + y_pred=test_pred_untrans, + stage="test", + y_col_name=params["y_col_name"], + output_dir=params["output_dir"], + input_dir=params["input_data_dir"] + ) + if params["calc_infer_scores"]: + test_scores = frm.compute_performance_scores( + y_true=test_true_untrans, + y_pred=test_pred_untrans, + stage="test", + metric_type=params["metric_type"], + output_dir=params["output_dir"] + ) + + print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + return True + +def main(args): + cfg = DRPInferConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_infer_params) + if_ran = run(params) + print("\nFinished inference of PathDSP model.") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/PathDSP_params.txt b/PathDSP_params.txt new file mode 100644 index 0000000..91f33f6 --- /dev/null +++ b/PathDSP_params.txt @@ -0,0 +1,63 @@ +[Preprocess] +data_format = .txt +input_supp_data_dir = ./author_data +train_split_file = CCLE_split_0_train.txt +val_split_file = CCLE_split_0_val.txt +test_split_file = CCLE_split_0_test.txt +y_data_files = [["response.tsv"]] +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_col_name = auc +bit_int = 128 +permutation_int = 3 +seed_int = 42 +cpu_int = 20 +drug_bits_file = drug_mbit_df.txt +dgnet_file = DGnet.txt +mutnet_file = MUTnet.txt +cnvnet_file = CNVnet.txt +exp_file = EXP.txt + +[Train] +data_format = .txt +model_file_name = model +model_file_format = .pt +#epochs = 20 +epochs = 800 +learning_rate = 0.0004 +batch_size = 12 +val_batch = 12 +loss = mse +patience = 30 +y_col_name = auc +cuda_name = cuda:0 +dropout = 0.1 + +[Infer] +y_col_name = auc +infer_batch = 256 +model_file_name = model +model_file_format = .pt +data_format 
= .txt +cuda_name = cuda:0 + +#gpu_int=0 +#gene_set = 'MSigdb.zip' +#ppi_data = 'STRING.zip' +#drug_target = 'raw_data.zip' +#raw_data_dir = "raw_data" +#train_data = 'PathDSP_train.txt' +#test_data = 'PathDSP_test.txt' +#val_data = 'PathDSP_val.txt' +#y_col_name = 'auc' +#metric='auc' +#data_type='CTRPv2' +#split=0 +#eps=0.00001 +#drug_hiddens='100,50,6' +#final_hiddens=6 +#optimizer = 'adam' +#improve_analysis='no' + + + diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py new file mode 100644 index 0000000..e5ce173 --- /dev/null +++ b/PathDSP_preprocess_improve.py @@ -0,0 +1,441 @@ +import sys +import os +import polars as pl +import numpy as np +import pandas as pd +import copy +from functools import reduce +from pathlib import Path +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +import RWR as rwr +import NetPEA as pea +import gseapy as gp +import sklearn.model_selection as skms +from sklearn.preprocessing import StandardScaler +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig #NCK +from improvelib.utils import str2bool #NCK +import improvelib.utils as frm #NCK +import improvelib.applications.drug_response_prediction.drug_utils as drugs #NCK +import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK +import improvelib.applications.drug_response_prediction.drp_utils as drp #NCK + +from model_params_def import pathdsp_preprocess_params + +file_path = Path(__file__).resolve().parent + +req_preprocess_args = [ll["name"] for ll in pathdsp_preprocess_params] + +def mkdir(directory): + directories = directory.split("/") + folder = "" + for d in directories: + folder += d + "/" + if not os.path.exists(folder): + print("creating folder: %s" % folder) + os.mkdir(folder) + + +def preprocess(params): + for i in [ + "drug_bits_file", + "dgnet_file", + "mutnet_file", + "cnvnet_file", + "exp_file", + ]: + params[i] = params["output_dir"] + "/" + params[i] + return params + + +# set timer +def cal_time(end, start): + """return time spent""" + # end = datetime.now(), start = datetime.now() + datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" + spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( + str(start), datetimeFormat + ) + return spend + +def response_out(params, split_file): + response_df = drp.DrugResponseLoader(params, split_file=split_file, verbose=True) + return response_df.dfs["response.tsv"] + + +def smile2bits(params): + start = datetime.now() + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + smile_df = drugs.DrugsLoader(params) + smile_df = smile_df.dfs['drug_SMILES.tsv'] + smile_df = smile_df.reset_index() + smile_df.columns = ["drug", "smile"] + smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") + smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] + bit_int = params["bit_int"] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row["smile"] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) + # drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append(tuple([drug] + mbit)) + if len(mbit) == bit_int: + n_drug += 1 + print("total {:} drugs with bits".format(n_drug)) + # 
convert dict to dataframe + colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + # drug_mbit_df.index.name = 'drug' + print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) + # save to file + drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") + print("[Finished in {:}]".format(cal_time(datetime.now(), start))) + + +def times_expression(rwr, exp): + """ + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + """ + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) + + if len(cell_list) == 0: + print("ERROR! no overlapping cell lines") + sys.exit(1) + if len(gene_list) == 0: + print("ERROR! no overlapping genes") + sys.exit(1) + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] + # concat with other gene + out_gene_list = list(set(rwr.columns) - set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params["input_supp_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" + pathway_path = ( + params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + log_transform = False + permutation_int = params["permutation_int"] + seed_int = params["seed_int"] + cpu_int = params["cpu_int"] + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + omics_data = omics.OmicsLoader(params) + if dtype == "DGnet": + drug_info = pd.read_csv(params["input_dir"] + "/x_data/drug_info.tsv", sep="\t") + drug_info["NAME"] = drug_info["NAME"].str.upper() + target_info = pd.read_csv( + params["input_supp_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" + ) + target_info = target_info.rename(columns={"drug": "NAME"}) + combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( + subset=["gene"] + ) + combined_df = combined_df.loc[ + combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), + ] + restart_path = params["output_dir"] + "/drug_target.txt" + combined_df.iloc[:, -2:].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["dgnet_file"] + elif dtype == "MUTnet": + mutation_data = omics_data.dfs['cancer_mutation_count.tsv'] + #mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] > 0 + ] + mutation_data = mutation_data.loc[ + mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["output_dir"] + "/mutation_data.txt" + mutation_data.iloc[:, 0:2].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["mutnet_file"] + else: + cnv_data = omics_data.dfs['cancer_discretized_copy_number.tsv'] + #cnv_data = cnv_data.reset_index() + 
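+        # Reshape the wide (improve_sample_id x gene) discretized copy-number
+        # matrix into long (sample, gene, value) rows and keep only non-neutral
+        # calls (value != 0), mirroring the mutation branch above; the first two
+        # columns are then written out as the restart sets consumed by RWR.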
cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] != 0 + ] + cnv_data = cnv_data.loc[ + cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["output_dir"] + "/cnv_data.txt" + cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) + outpath = params["cnvnet_file"] + # perform Random Walk + print(datetime.now(), "performing random walk with restart") + rwr_df = rwr.RWR( + ppi_path, + restart_path, + restartProbFloat=0.5, + convergenceFloat=0.00001, + normalize="l1", + weighted=True, + ).get_prob() + # multiply with gene expression + if multiply_expression: + print( + datetime.now(), + "multiplying gene expression with random walk probability for genes were expressed", + ) + # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + # exp_df = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + exp_df = omics_data.dfs['cancer_gene_expression.tsv'] + exp_df = exp_df.set_index(params['canc_col_name']) + rwr_df = times_expression(rwr_df, exp_df) + # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), "performing network-based pathway enrichment") + cell_pathway_df = pea.NetPEA( + rwr_df, + pathway_path, + log_transform=log_transform, + permutation=permutation_int, + seed=seed_int, + n_cpu=cpu_int, + out_path=outpath, + ) + print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) + + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) + DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) + DGnet = ( + DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) + ) + CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) + CNVnet = ( + CNVnet.add_suffix("_cnvnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) + MUTnet = ( + MUTnet.add_suffix("_mutnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) + EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + # Extract relevant IDs + common_drug_ids = reduce( + np.intersect1d, + (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), + ) + common_sample_ids = reduce( + np.intersect1d, + ( + CNVnet["sample_id"], + MUTnet["sample_id"], + EXP["sample_id"], + response_df["sample_id"], + ), + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + drug_mbit_df = ( + drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + DGnet = ( + DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + CNVnet = ( + 
CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + MUTnet = ( + MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + EXP = ( + EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + for i in ["train", "test", "val"]: + response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) + response_df = response_df.dfs['response.tsv'] + response_df['exp_id'] = list(range(0,response_df.shape[0])) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + + comb_data_mtx = pd.DataFrame( + { + "drug_id": response_df["drug_id"].values, + "sample_id": response_df["sample_id"].values, + "exp_id": response_df["exp_id"].values + } + ) + comb_data_mtx = ( + comb_data_mtx.set_index(["drug_id", "sample_id", "exp_id"]) + .join(drug_data, on="drug_id") + .join(sample_data, on="sample_id") + ) + ss = StandardScaler() + comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]] = ss.fit_transform(comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]]) + ## add 0.01 to avoid possible inf values + comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + comb_data_mtx = comb_data_mtx.dropna() + + comb_data_mtx_to_save = comb_data_mtx['response'] + comb_data_mtx_to_save = comb_data_mtx_to_save.reset_index() + comb_data_mtx_to_save.rename(columns={'drug_id': 'improve_chem_id', 'sample_id': 'improve_sample_id'}, inplace=True) + #comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) + rsp = drp.DrugResponseLoader(params, + split_file=params[i+"_split_file"], + verbose=False).dfs["response.tsv"] + rsp['exp_id'] = list(range(0,rsp.shape[0])) + ydata = rsp.merge(comb_data_mtx_to_save, on=['exp_id'], how='right') + print(comb_data_mtx_to_save) + print("YDATA") + print(ydata) + frm.save_stage_ydf(ydf=ydata, stage=i, output_dir=params["output_dir"]) + pl.from_pandas(comb_data_mtx).write_csv( + params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], stage=i) +, separator="\t", has_header=True + ) + + +def run_ssgsea(params): + # expMat = improve_utils.load_gene_expression_data(sep='\t') + # expMat = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + omics_data = omics.OmicsLoader(params) + expMat = omics_data.dfs['cancer_gene_expression.tsv'] + expMat = expMat.set_index(params['canc_col_name']) + + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = ( + params["input_supp_data_dir"] + 
"/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + gmt = pathway_path + tmp_str = params["output_dir"] + "/tmpdir_ssgsea/" + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea( + data=gct, # gct: a matrix of gene by sample + gene_sets=gmt, # gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, # 1000 + no_plot=True, + processes=params["cpu_int"], + # min_size=0, + format="png", + ) + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") + + f = open(tmp_str + "ssGSEA.txt", "r") + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + params = preprocess(params) + print("convert drug to bits.") + smile2bits(params) + print("compute DGnet.") + run_netpea(params, dtype="DGnet", multiply_expression=False) + print("compute MUTnet.") + run_netpea(params, dtype="MUTnet", multiply_expression=True) + print("compute CNVnet.") + run_netpea(params, dtype="CNVnet", multiply_expression=True) + print("compute EXP.") + run_ssgsea(params) + print("prepare final input file.") + prep_input(params) + + +def main(args): + cfg = DRPPreprocessConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_preprocess_params) + run(params) + + +if __name__ == "__main__": + start = datetime.now() + main(sys.argv[1:]) + print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py new file mode 100644 index 0000000..b3eb9a6 --- /dev/null +++ b/PathDSP_train_improve.py @@ -0,0 +1,329 @@ +import os +import sys +import numpy as np +import pandas as pd +from datetime import datetime +import socket +import torch as tch +import torch.utils.data as tchud +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil +import polars as pl + +from improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK +import improvelib.utils as frm #NCK + +from PathDSP_preprocess_improve import cal_time, preprocess +from model_params_def import pathdsp_train_params + +file_path = os.path.dirname(os.path.realpath(__file__)) + + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + +def predicting(model, device, data_loader): + """ Method to make predictions/inference. + This is used in *train.py and *infer.py + + Parameters + ---------- + model : pytorch model + Model to evaluate. + device : string + Identifier for hardware that will be used to evaluate model. + data_loader : pytorch data loader. + Object to load data to evaluate. + + Returns + ------- + total_labels: numpy array + Array with ground truth. + total_preds: numpy array + Array with inferred outputs. 
+ """ + model.to(device) + model.eval() + total_preds = tch.Tensor() + total_labels = tch.Tensor() + print("Make prediction for {} samples...".format(len(data_loader.dataset))) + with tch.no_grad(): + for i, (data_x, data_y) in enumerate(data_loader): + data_x, data_y = data_x.to(device), data_y.to(device) + data_y_pred = model(data_x) + # Is this computationally efficient? + total_preds = tch.cat((total_preds, data_y_pred.cpu()), 0) # preds to tensor + total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor + return total_labels.numpy().flatten(), total_preds.numpy().flatten() + + +def predict(net, device, test_dl): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + true_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + true_list.append(y_test.cpu().numpy()) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + true_list = np.vstack(true_list) + true_list = np.hstack(true_list).tolist() + # return + return true_list, prediction_list + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["output_dir"] + "/checkpoint.pt") # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # train result + 
train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load(params["output_dir"] + '/checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["output_dir"]) + train_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="train") + val_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="val") + #params = preprocess(params) + + # set parameters + #myutil.set_seed(params["seed_int"]) + ## set device + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + params["CUDA_VISIBLE_DEVICES"] = cuda_env_visible + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + #print("Using device: " + device) + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + val_batch = params['val_batch'] + opt_fn = tch.optim.Adam + + # ------------------------------------------------------ + # [PathDSP] Prepare dataloaders + # ------------------------------------------------------ + print('loadinig data') + train_df = pl.read_csv(params["input_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() + val_df = 
pl.read_csv(params["input_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=val_batch, shuffle=False) + + # ------------------------------------------------------ + # [PathDSP] Prepare model + # ------------------------------------------------------ + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + ## specify dropout rate + for module in net.modules(): + if isinstance(module, tch.nn.Dropout): + module.p = params['dropout'] + net.apply(init_weights) + + # ------------------------------------------------------ + # [PathDSP] Training + # ------------------------------------------------------ + print('start training process') + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn, params) + + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + loss_df.to_csv(params['output_dir'] + '/Val_Loss_orig.txt', header=True, index=False, sep="\t") + + # make train/valid loss plots + best_model = trained_net + tch.save(best_model.state_dict(), modelpath) + #best_model.eval() + # Compute predictions + #val_true, val_pred = predicting(best_model, device, valid_dl) # (groud truth), (predictions) + val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) + + #comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + val_true = pd.Series(val_true) + val_pred = pd.Series(val_pred) + val_true_untrans = val_true.apply(lambda x: 10 ** (x) - 0.01) + val_pred_untrans = val_pred.apply(lambda x: 10 ** (x) - 0.01) + # ----------------------------- + # [Req] Save raw predictions in dataframe + # ----------------------------- + # import ipdb; ipdb.set_trace() + frm.store_predictions_df( + y_true=val_true_untrans, + y_pred=val_pred_untrans, + stage="val", + y_col_name=params["y_col_name"], + output_dir=params["output_dir"], + input_dir=params["input_dir"] + ) + + # ----------------------------- + # [Req] Compute performance scores + # ----------------------------- + # import ipdb; ipdb.set_trace() + val_scores = frm.compute_performance_scores( + y_true=val_true_untrans, + y_pred=val_pred_untrans, + stage="val", + metric_type=params["metric_type"], + output_dir=params["output_dir"] + ) + return val_scores + + +def main(args): + cfg = DRPTrainConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_train_params) + # get node name + params["node_name"] = socket.gethostname() + val_scores = 
run(params) + df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) + df.to_csv(params["output_dir"] + '/params.txt',sep="\t") + + + +if __name__ == "__main__": + start = datetime.now() + main(sys.argv[1:]) + print("[Training finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/README.md b/README.md index 6c89fee..6bfbad5 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,160 @@ # PathDSP -Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores -# Requirments +This repository demonstrates how to use the [IMPROVE library v0.1.0](https://jdacs4c-improve.github.io/docs/v0.1.0/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). -# Input format +This version, tagged as `v0.1.0`, introduces a new API which is designed to encourage broader adoption of IMPROVE and its curated models by the research community. -|drug|cell|feature_1|....|feature_n|drug_response| -|----|----|--------|----|--------|----| -|5-FU|03|0|....|0.02|-2.3| -|5-FU|23|1|....|0.04|-3.4| -Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing -# Usage: -```python -# run FNN -python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix +## Dependencies +Installation instuctions are detailed below in [Step-by-step instructions](#step-by-step-instructions). -Where input.txt should be in the input format shown above. -Example input file can be found at https://zenodo.org/record/7532963 +Conda `yml` file [PathDSP_env_conda](./PathDSP_env_conda.yml) + +ML framework: ++ [Torch](https://pytorch.org/) -- deep learning framework for building the prediction model + +IMPROVE dependencies: ++ [IMPROVE v0.1.0](https://jdacs4c-improve.github.io/docs/v0.1.0) + + + +## Dataset +Benchmark data for cross-study analysis (CSA) can be downloaded from this [site](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). + +The data tree is shown below: +``` +csa_data/raw_data/ +├── splits +│   ├── CCLE_all.txt +│   ├── CCLE_split_0_test.txt +│   ├── CCLE_split_0_train.txt +│   ├── CCLE_split_0_val.txt +│   ├── CCLE_split_1_test.txt +│   ├── CCLE_split_1_train.txt +│   ├── CCLE_split_1_val.txt +│   ├── ... 
+│   ├── GDSCv2_split_9_test.txt +│   ├── GDSCv2_split_9_train.txt +│   └── GDSCv2_split_9_val.txt +├── x_data +│   ├── cancer_copy_number.tsv +│   ├── cancer_discretized_copy_number.tsv +│   ├── cancer_DNA_methylation.tsv +│   ├── cancer_gene_expression.tsv +│   ├── cancer_miRNA_expression.tsv +│   ├── cancer_mutation_count.tsv +│   ├── cancer_mutation_long_format.tsv +│   ├── cancer_mutation.parquet +│   ├── cancer_RPPA.tsv +│   ├── drug_ecfp4_nbits512.tsv +│   ├── drug_info.tsv +│   ├── drug_mordred_descriptor.tsv +│   └── drug_SMILES.tsv +└── y_data + └── response.tsv +``` + + +## Model scripts and parameter file ++ `PathDSP_preprocess_improve.py` - takes benchmark data files and transforms into files for training and inference ++ `PathDSP_train_improve.py` - trains the PathDSP model ++ `PathDSP_infer_improve.py` - runs inference with the trained PathDSP model ++ `model_params_def.py` - definitions of parameters that are specific to the model ++ `PathDSP_params.txt` - default parameter file + + + +# Step-by-step instructions + +### 1. Clone the model repository +``` +git clone https://github.com/JDACS4C-IMPROVE/PathDSP +cd PathDSP +git checkout v0.1.0 +``` + + +### 2. Set computational environment +Create conda env using `yml` +``` +conda env create -f PathDSP_env_conda.yml -n PathDSP_env +conda activate PathDSP_env +``` + + +### 3. Run `setup_improve.sh`. +```bash +source setup_improve.sh ``` -# Data preprocessing -Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby +This will: +1. Download cross-study analysis (CSA) benchmark data into `./csa_data/`. +2. Clone IMPROVE repo (checkout tag `v0.1.0`) outside the PathDSP model repo +3. Set up `PYTHONPATH` (adds IMPROVE repo). +4. Download the model-specific supplemental data (aka author data) and set up the env variable `AUTHOR_DATA_DIR`. -# Reference -Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 + +### 4. Preprocess CSA benchmark data (_raw data_) to construct model input data (_ML data_) +```bash +python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir exp_result +``` + +Preprocesses the CSA data and creates train, validation (val), and test datasets. + +Generates: +* three model input data files: `train_data.txt`, `val_data.txt`, `test_data.txt` + +``` +exp_result +├── tmpdir_ssgsea +├── EXP.txt +├── cnv_data.txt +├── CNVnet.txt +├── DGnet.txt +├── MUTnet.txt +├── drug_mbit_df.txt +├── drug_target.txt +├── mutation_data.txt +├── test_data.txt +├── train_data.txt +├── val_data.txt +└── x_data_gene_expression_scaler.gz +``` + + +### 5. Train PathDSP model +```bash +python PathDSP_train_improve.py --input_dir exp_result --output_dir exp_result +``` + +Trains PathDSP using the model input data: `train_data.txt` (training), `val_data.txt` (for early stopping). 
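+
+Default hyperparameters are read from `PathDSP_params.txt`. Assuming the standard IMPROVE CLI behavior, individual parameters such as `epochs`, `learning_rate`, and `batch_size` can also be overridden on the command line, e.g.:
+```bash
+python PathDSP_train_improve.py --input_dir exp_result --output_dir exp_result --epochs 20
+```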
+ +Generates: +* trained model: `model.pt` +* predictions on val data (tabular data): `val_y_data_predicted.csv` +* prediction performance scores on val data: `val_scores.json` +``` +exp_result +├── model.pt +├── checkpoint.pt +├── Val_Loss_orig.txt +├── val_scores.json +└── val_y_data_predicted.csv +``` + + +### 6. Run inference on test data with the trained model +```bash +python PathDSP_infer_improve.py --input_data_dir exp_result --input_model_dir exp_result --output_dir exp_result --calc_infer_score True +``` + +Evaluates the performance on a test dataset with the trained model. + +Generates: +* predictions on test data (tabular data): `test_y_data_predicted.csv` +* prediction performance scores on test data: `test_scores.json` +``` +exp_result +├── test_scores.json +└── test_y_data_predicted.csv +``` diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md new file mode 100644 index 0000000..de026cf --- /dev/null +++ b/README_deephyper_alpha.md @@ -0,0 +1,60 @@ +# Run HPO using DeepHyper on Lambda with conda + +## Install conda environment for the curated model (PathDSP) +Install PathDSP: +``` +cd +git clone https://github.com/JDACS4C-IMPROVE/PathDSP +``` + +Install IMPROVE and download data: +``` +cd PathDSP +source setup_improve.sh +``` + +Install PathDSP environment: +``` +cd PathDSP +export PathDSP_env=./PathDSP_env/ +conda env create -f PathDSP_env_conda.yml -p $PathDSP_env +``` + +## Perform preprocessing +Run the preprocess script. This script taks around 40 mins to complete. +The workflow assumes that your preprocessed data is at: "ml_data/{source}-{source}/split_{split}" + +``` +cd PathDSP +conda activate $PathDSP_env +python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir ./ml_data/CCLE-CCLE/split_0 +conda deactivate +``` + +## Install conda environment for DeepHyper +``` +module load openmpi +conda create -n dh python=3.9 -y +conda activate dh +conda install gxx_linux-64 gcc_linux-64 +pip install "deephyper[default]" +pip install mpi4py +``` + +## Perform HPO +If necesssary, activate environment: +``` +module load openmpi +conda activate dh +export PYTHONPATH=../IMPROVE +``` + +Run HPO: +``` +mpirun -np 10 python hpo_deephyper_subprocess.py +``` + +# Run HPO using DeepHyper on Polaris with conda + +# Run HPO using DeepHyper on Polaris with singularity + diff --git a/RWR.py b/RWR.py new file mode 100644 index 0000000..5bc82b9 --- /dev/null +++ b/RWR.py @@ -0,0 +1,162 @@ +""" +Return cell by gene probability dataframe + +""" + + +import argparse +import numpy as np +import pandas as pd +import scipy.sparse as scisp +import sklearn.preprocessing as skprc +from datetime import datetime + + +class RWR: + """ + Return probability matrix where columns are PPI genes + + :param ppiPathStr: string representing path to ppi file (with three columns) + :param restartPathStr: string representing path to restart file (i.e., input gene sets) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + :param normalize: string representing normalization method (choices=['l1', 'l2']) + :param weighted: boolean indicating weither to use weighted graph or not (if False, will set weight of all edges to 1) + :param outPathStr: string representing output path + """ + def __init__(self, ppiPathStr, restartPathStr, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True, outPathStr='./'): + # initiating + self.ppiPathStr = ppiPathStr + 
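+        # restartPathStr is expected to point at a tab-separated, two-column
+        # table with a header row (group, gene); each group's gene list becomes
+        # one restart set in get_prob().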
self.restartPathStr = restartPathStr + self.restartProbFloat = float(restartProbFloat) + self.convergenceFloat = float(convergenceFloat) + self.normalize = normalize + self.weighted = weighted + self.outPathStr = outPathStr + + + + def get_prob(self): + # load PPI graph + print('loading protein-protein interaction network.....') + self.adj_mat, self.name_idx_dict = self.load_graph(self.ppiPathStr, normalize=True, weighted=True) + # mapping dictionary of node index number: node name string + self.idx_name_dict = { idx:name for name, idx in self.name_idx_dict.items() } + + # load restart list (i.e., input gene sets) + print('collecting restart list') + df = pd.read_csv(self.restartPathStr, header=0, sep="\t") + df.columns = ['group', 'gene'] + # collect gene sets by group + grps = df.groupby('group') + grps_dict = {} + for grp in df['group'].unique(): + seed_list = grps.get_group(grp)['gene'].values.tolist() #input gene set + # check if input gene set in ppi and convert name to index number + seed_idx_list = [self.name_idx_dict[i] for i in seed_list if i in self.name_idx_dict.keys()] + # update to dictionary + grps_dict.update({ grp: {'gList':seed_list, 'ppiList':seed_idx_list} }) + + # perform random walk + print('performing random walk.....') + n_grps = len(grps_dict) + grp_list = list(grps_dict.keys()) + grp_prob_dict = {} + n_grp_has_no_ppiList = 0 # number of group has restart list not found on PPI network + for i in range(n_grps): + grp = grp_list[i] + if len(grps_dict[grp]['ppiList']) > 0: # has restart list on PPI network + prob_list = self.run_single_rwr(self.adj_mat, grps_dict[grp]['ppiList'], restartProbFloat=self.restartProbFloat, convergenceFloat=self.convergenceFloat) + + else: + n_grp_has_no_ppiList += 1 + prob_list = [0.0] * len(self.name_idx_dict) + + # update to result + grp_prob_dict.update( {grp:prob_list} ) + + # reformat result: dict2fataframe + print('finalizing result of probability matrix.....') + result_df = pd.DataFrame(grp_prob_dict) + result_df = result_df.T + result_df.columns = list(self.name_idx_dict.keys()) + return result_df # probability matrix grp by ppi genes + + + def load_graph(self, ppiPathStr, normalize=True, weighted=True): + """ + Return a graph in adjacency matrix format and its name string and correspoing index number mapping dictionary + + :param ppiPathStr: string representing file name of a graph in edge list format + :param name2index: boolean indicating whether to convert name string to index number or not + :param normalize: boolean indicating whether to perform column-wised normalization + """ + # load data + df = pd.read_pickle(ppiPathStr) + df.columns = ['source', 'target', 'weight'] + + # convert name to index + all_nodes = sorted(list(set( df['source'] ) | set( df['target'] ))) # retrieve name strings of all nodes + + # create name:index mapping dictionary + gnm_gid_dict = { all_nodes[i]:i for i in range(len(all_nodes)) } + + # replace name string with index number + df['source'].update(df['source'].map(gnm_gid_dict)) + df['target'].update(df['target'].map(gnm_gid_dict)) + + # use weighted graph or unweighted graph + if weighted == False: + df['weight'] = 1 # unweighted graph + + # create adjancency matrix + network_matrix = scisp.csr_matrix((df['weight'].values, (df['source'].values, df['target'].values)), + shape=(len(all_nodes), len(all_nodes)), dtype=float) # Create sparse matrix + network_matrix = (network_matrix + network_matrix.T) # Make the ajdacency matrix symmetric + network_matrix.setdiag(0) # Set diagnoals to zero + + # 
normalization: Normalize the rows of network_matrix because we are multiplying vector by matrix (from left) + if normalize == True: + network_matrix = skprc.normalize(network_matrix, norm='l1', axis=1) + + # return + return network_matrix, gnm_gid_dict + + def run_single_rwr(self, ppiAdjMat, restartList, restartProbFloat=0.5, convergenceFloat=0.00001): + """ + Return + + :param ppiAdjMat: adjacency matrix of protein-protein interaction network + :param restartList: list of restart nodes (i.e., gene list) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + """ + # settings + convergence_criterion_float = float(convergenceFloat) # stops when vector L1 norm drops below 10^(-5) + restartProbFloat = float(restartProbFloat) + residual_float = 1.0 # difference between p^(t + 1) and p^(t) + max_iter = 1000 + + # initialze probability vector for restart nodes + prob_seed_list = [0] * ppiAdjMat.shape[0] + for idx in restartList: + prob_seed_list[idx] = 1.0 #1/float(len(restartList)) + prob_seed_arr = np.array(prob_seed_list) + steady_prob_old = prob_seed_arr + + # RWR + iter_int = 0 + #print('updating probability array.....') + while (residual_float > convergence_criterion_float): + # update vector + steady_prob_new = scisp.csr_matrix.dot(steady_prob_old, ppiAdjMat) + steady_prob_new *= (1 - restartProbFloat) + steady_prob_new += (prob_seed_arr * restartProbFloat) + + # Calculate the residual -- the sum of the absolute + # differences of the new node probability vector minus the old + # diff_norm = np.linalg.norm(np.subtract(p_t_1, p_t), 1) + residual_float = abs(steady_prob_new - steady_prob_old).sum() + steady_prob_old = steady_prob_new.copy() + return steady_prob_old + diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini new file mode 100644 index 0000000..d8644d1 --- /dev/null +++ b/csa_bruteforce_params.ini @@ -0,0 +1,51 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +csa_outdir = ./bruteforce_output +y_col_name = auc +source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +split_nums = [] +model_name = PathDSP +only_cross_study = False +epochs = 800 +uses_cuda_name = True + +### Source and target data sources +## Set 1 - full analysis +#source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +#target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["CCLE", "gCSI", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +# source_datasets = ["CCLE", "GDSCv1"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + + +## Splits +#split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +# split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 9] + + + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git 
a/csa_bruteforce_params_def.py b/csa_bruteforce_params_def.py new file mode 100644 index 0000000..6b5d7c9 --- /dev/null +++ b/csa_bruteforce_params_def.py @@ -0,0 +1,53 @@ +from improvelib.utils import str2bool + +csa_bruteforce_params = [ + {"name": "cuda_name", + "type": str, + "default": "cuda:0", + "help": "Cuda device name.", + }, + {"name": "csa_outdir", + "type": str, + "default": "./run.csa.full", + "help": "Outdir for workflow.", + }, + {"name": "source_datasets", + "nargs" : "+", + "type": str, + "default": ['CCLE'], + "help": "source_datasets for cross study analysis" + }, + {"name": "target_datasets", + "nargs" : "+", + "type": str, + "default": ["CCLE", "gCSI"], + "help": "target_datasets for cross study analysis" + }, + {"name": "split_nums", + "nargs" : "+", + "type": str, + "default": ['0'], + "help": "Split of the source datasets for CSA" + }, + {"name": "only_cross_study", + "type": str2bool, + "default": False, + "help": "If only cross study analysis is needed" + }, + {"name": "model_name", + "type": str, + "default": 'graphdrp', ## Change the default to LGBM?? + "help": "Name of the deep learning model" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + {"name": "uses_cuda_name", + "type": str2bool, + "default": True, + "help": "Change to false if the model doesn't have a cuda_name parameter." + }, + +] \ No newline at end of file diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py new file mode 100644 index 0000000..62f998f --- /dev/null +++ b/csa_bruteforce_wf.py @@ -0,0 +1,250 @@ +""" Python implementation of cross-study analysis workflow """ + + +import os +import subprocess +import warnings +from time import time +from pathlib import Path + +import pandas as pd + + +# IMPROVE imports +# from improvelib.initializer.config import Config +# from improvelib.initializer.stage_config import PreprocessConfig, TrainConfig, InferConfig +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig +# from improvelib.applications.drug_response_prediction.config import DRPTrainConfig +# from improvelib.applications.drug_response_prediction.config import DRPInferConfig +import improvelib.utils as frm +from csa_bruteforce_params_def import csa_bruteforce_params + + +def build_split_fname(source: str, split: int, phase: str): + """ Build split file name. If file does not exist continue """ + return f"{source_data_name}_split_{split}_{phase}.txt" + +def save_captured_output(result, process, MAIN_LOG_DIR, source_data_name, target_data_name, split): + result_file_name_stdout = MAIN_LOG_DIR / f"{source_data_name}-{target_data_name}-{split}-{process}-log.txt" + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + + + +class Timer: + """ Measure time. """ + def __init__(self): + self.start = time() + + def timer_end(self): + self.end = time() + return self.end - self.start + + def display_timer(self, print_fn=print): + time_diff = self.timer_end() + if (time_diff) // 3600 > 0: + print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) ) + else: + print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) ) + + +filepath = Path(__file__).resolve().parent + +print_fn = print +print_fn(f"File path: {filepath}") + +# =============================================================== +### CSA settings +# =============================================================== + + +cfg = DRPPreprocessConfig() # TODO submit github issue; too many logs printed; is it necessary? 
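# Helper behavior (inferred from the definitions above):
# - build_split_fname() returns names of the form "<SOURCE>_split_<N>_<phase>.txt",
#   e.g. "CCLE_split_0_train.txt"; note that it reads the global source_data_name set in
#   the loops below rather than its own `source` argument, so it is only valid inside them.
# - save_captured_output() writes each subprocess log to
#   MAIN_LOG_DIR/"<source>-<target>-<split>-<process>-log.txt".
# - Timer.display_timer() reports elapsed time in minutes, or in hours for runs longer
#   than one hour.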
+params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_bruteforce_params.ini", + additional_definitions=csa_bruteforce_params, + required=None +) +print("Loaded params") + +# Model scripts +model_name = params["model_name"] +preprocess_python_script = f'{model_name}_preprocess_improve.py' +train_python_script = f'{model_name}_train_improve.py' +infer_python_script = f'{model_name}_infer_improve.py' +print("Created script names") +# Specify dirs + +y_col_name = params['y_col_name'] +# maindir = Path(f"./{y_col_name}") +# maindir = Path(f"./0_{y_col_name}_improvelib") # main output dir +MAIN_CSA_OUTDIR = Path(params["csa_outdir"]) # main output dir +# Note! ML data and trained model should be saved to the same dir for inference script +MAIN_ML_DATA_DIR = MAIN_CSA_OUTDIR / 'ml_data' # output_dir_pp, input_dir_train, input_dir_infer +MAIN_MODEL_DIR = MAIN_CSA_OUTDIR / 'models' # output_dir_train, input_dir_infer +MAIN_INFER_DIR = MAIN_CSA_OUTDIR / 'infer' # output_dir infer +MAIN_LOG_DIR = MAIN_CSA_OUTDIR / 'logs' +frm.create_outdir(MAIN_LOG_DIR) +print("Created directory names") +print("MAIN_CSA_OUTDIR: ", MAIN_CSA_OUTDIR) +print("MAIN_ML_DATA_DIR: ", MAIN_ML_DATA_DIR) +print("MAIN_MODEL_DIR: ", MAIN_MODEL_DIR) +print("MAIN_INFER_DIR: ", MAIN_INFER_DIR) +print("MAIN_LOG_DIR: ", MAIN_LOG_DIR) +# Note! Here input_dir is the location of benchmark data +# TODO Should we set input_dir (and output_dir) for each models scrit? +splits_dir = Path(params['input_dir']) / params['splits_dir'] +print("Created splits path") +print("splits_dir: ", splits_dir) + +source_datasets = params["source_datasets"] +target_datasets = params["target_datasets"] +only_cross_study = params["only_cross_study"] +split_nums = params["split_nums"] +epochs = params["epochs"] +cuda_name = params["cuda_name"] +print("internal params") + + +# =============================================================== +### Generate CSA results (within- and cross-study) +# =============================================================== + +timer = Timer() +# Iterate over source datasets +# Note! The "source_data_name" iterations are independent of each other +print_fn(f"\nsource_datasets: {source_datasets}") +print_fn(f"target_datasets: {target_datasets}") +print_fn(f"split_nums: {split_nums}") +# breakpoint() +for source_data_name in source_datasets: + + # Get the split file paths + # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt + if len(split_nums) == 0: + # Get all splits + split_files = list((splits_dir).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + # num_splits = 1 + else: + # Use the specified splits + split_files = [] + for s in split_nums: + split_files.extend(list((splits_dir).glob(f"{source_data_name}_split_{s}_*.txt"))) + + files_joined = [str(s) for s in split_files] + + # -------------------- + # Preprocess and Train + # -------------------- + for split in split_nums: + print_fn(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. Otherwise, continue to the next split. 
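        # Worked example (illustrative): split ids are parsed from file names such as
        # "CCLE_split_3_train.txt", where split("split_")[1].split("_")[0] yields "3".
        # For each (source, target, split) triple, the steps below write to:
        #   <csa_outdir>/ml_data/<source>-<target>/split_<N>   (preprocessed data)
        #   <csa_outdir>/models/<source>/split_<N>             (trained model)
        #   <csa_outdir>/infer/<source>-<target>/split_<N>     (inference results)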
+ for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + for target_data_name in target_datasets: + if only_cross_study and (source_data_name == target_data_name): + continue # only cross-study + print_fn(f"\nSource data: {source_data_name}") + print_fn(f"Target data: {target_data_name}") + + ml_data_dir = MAIN_ML_DATA_DIR / f"{source_data_name}-{target_data_name}" / \ + f"split_{split}" + model_dir = MAIN_MODEL_DIR / f"{source_data_name}" / f"split_{split}" + infer_dir = MAIN_INFER_DIR / f"{source_data_name}-{target_data_name}" / \ + f"split_{split}" # AP + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # p1 (none): Preprocess train data + # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # placeholder for LC + timer_preprocess = Timer() + print_fn("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + print_fn(f"train_split_file: {train_split_file}") + print_fn(f"val_split_file: {val_split_file}") + print_fn(f"test_split_file: {test_split_file}") + preprocess_run = ["python", preprocess_python_script, + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], # str("./csa_data/raw_data"), + "--output_dir", str(ml_data_dir), + "--y_col_name", str(y_col_name) + ] + result = subprocess.run(preprocess_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) + print(f"returncode = {result.returncode}") + save_captured_output(result, "preprocess", MAIN_LOG_DIR, source_data_name, target_data_name, split) + timer_preprocess.display_timer(print_fn) + + # p2 (p1): Train model + # Train a single model for a given [source, split] pair + # Train using train samples and early stop using val samples + if model_dir.exists() is False: + timer_train = Timer() + print_fn("\nTrain") + print_fn(f"ml_data_dir: {ml_data_dir}") + print_fn(f"model_dir: {model_dir}") + if params["uses_cuda_name"]: + train_run = ["python", train_python_script, + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(epochs), # DL-specific + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name + ] + else: + train_run = ["python", train_python_script, + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(epochs), # DL-specific + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) + print(f"returncode = {result.returncode}") + save_captured_output(result, "train", MAIN_LOG_DIR, source_data_name, "none", split) + timer_train.display_timer(print_fn) + + # Infer + # p3 (p1, p2): Inference + timer_infer = Timer() + print_fn("\nInfer") + if params["uses_cuda_name"]: + infer_run = ["python", infer_python_script, + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + 
"--output_dir", str(infer_dir), + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name, + "--calc_infer_scores", "true" + ] + else: + infer_run = ["python", infer_python_script, + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--y_col_name", y_col_name, + "--calc_infer_scores", "true" + ] + result = subprocess.run(infer_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) + print(f"returncode = {result.returncode}") + save_captured_output(result, "infer", MAIN_LOG_DIR, source_data_name, target_data_name, split) + timer_infer.display_timer(print_fn) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +timer.display_timer(print_fn) +print_fn('Finished full cross-study run.') \ No newline at end of file diff --git a/csa_params.ini b/csa_params.ini new file mode 100644 index 0000000..f9261e3 --- /dev/null +++ b/csa_params.ini @@ -0,0 +1,30 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +y_col_name = auc +use_singularity = False +hyperparameters_file = ./hyperparameters_default.json +model_name = PathDSP +only_cross_study = False +epochs = 800 +model_environment = PathDSP_env + + +# Full-scale CSA +# output_dir = ./parsl_csa_exp +# source_datasets = ["CCLE","CTRPv2","gCSI","GDSCv1","GDSCv2"] +# target_datasets = ["CCLE","CTRPv2","gCSI","GDSCv1","GDSCv2"] +# split = ["0","1","2","3","4","5","6","7","8","9"] +# available_accelerators = ["0","1","2","3","4","5","6","7"] + +# Exp 3 +output_dir = ./parsl_csa_exp3 +source_datasets = ["CCLE","GDSCv2","gCSI"] +target_datasets = ["CCLE","GDSCv2","gCSI"] +split = ["0","1"] +available_accelerators = ["4","5","6","7"] + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/csa_params.test.ini b/csa_params.test.ini new file mode 100644 index 0000000..2fad301 --- /dev/null +++ b/csa_params.test.ini @@ -0,0 +1,20 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +output_dir=./improve_output +y_col_name = auc +use_singularity = False +hyperparameters_file = ./hyperparameters_default.json +source_datasets = ["gCSI"] +target_datasets = ["gCSI", "CCLE"] +split = ["0", "1"] +model_name = PathDSP +only_cross_study = False +epochs = 3 +available_accelerators=["4","5"] +model_environment = PathDSP_env + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/csa_params_def.py b/csa_params_def.py new file mode 100644 index 0000000..75daeb5 --- /dev/null +++ b/csa_params_def.py @@ -0,0 +1,62 @@ + +additional_definitions = [ + {"name": "source_datasets", + "nargs" : "+", + "type": str, + "default": ['CCLE'], + "help": "source_datasets for cross study analysis" + }, + {"name": "target_datasets", + "nargs" : "+", + "type": str, + "default": ["CCLE", "gCSI"], + "help": "target_datasets for cross study analysis" + }, + {"name": "split", + "nargs" : "+", + "type": str, + "default": ['0'], + "help": "Split of the source datasets for CSA" + }, + {"name": "only_cross_study", + "type": bool, + "default": False, + "help": "If only cross study analysis is needed" + }, + {"name": "model_name", + "type": str, + "default": 'graphdrp', ## Change the default to LGBM?? + "help": "Name of the deep learning model" + }, + {"name": "model_environment", + "type": str, + "default": '', ## Change the default to LGBM?? 
+ "help": "Name of your model conda environment" + }, + {"name": "hyperparameters_file", + "type": str, + "default": 'hyperparameters_default.json', + "help": "json file containing optimized hyperparameters per dataset" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + {"name": "available_accelerators", + "nargs" : "+", + "type": str, + "default": ["0", "1"], + "help": "GPU IDs to assign jobs" + }, + {"name": "use_singularity", + "type": bool, + "default": True, + "help": "Do you want to use singularity image for running the model?" + }, + {"name": "singularity_image", + "type": str, + "default": '', + "help": "Singularity image file of the model" + } + ] \ No newline at end of file diff --git a/csa_wf_v3.py b/csa_wf_v3.py new file mode 100644 index 0000000..066b726 --- /dev/null +++ b/csa_wf_v3.py @@ -0,0 +1,292 @@ +""" Python implementation of cross-study analysis workflow """ +# cuda_name = "cuda:6" +cuda_name = "cuda:7" + +import os +import subprocess +import warnings +from time import time +from pathlib import Path + +import pandas as pd + +# IMPROVE imports +from improve import framework as frm +# import improve_utils +# from improve_utils import improve_globals as ig + +# GraphDRP imports +# TODO: change this for your model +import PathDSP_preprocess_improve +import PathDSP_train_improve +import PathDSP_preprocess_improve + +# from ap_utils.classlogger import Logger +# from ap_utils.utils import get_print_func, Timer + + +class Timer: + """ Measure time. """ + def __init__(self): + self.start = time() + + def timer_end(self): + self.end = time() + return self.end - self.start + + def display_timer(self, print_fn=print): + time_diff = self.timer_end() + if time_diff // 3600 > 0: + print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) ) + else: + print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) ) + + +fdir = Path(__file__).resolve().parent + +y_col_name = "auc" +# y_col_name = "auc1" + +maindir = Path(f"./{y_col_name}") +MAIN_ML_DATA_DIR = Path(f"./{maindir}/ml.data") +MAIN_MODEL_DIR = Path(f"./{maindir}/models") +MAIN_INFER_OUTDIR = Path(f"./{maindir}/infer") + +# Check that environment variable "IMPROVE_DATA_DIR" has been specified +if os.getenv("IMPROVE_DATA_DIR") is None: + raise Exception("ERROR ! Required system variable not specified. \ + You must define IMPROVE_DATA_DIR ... 
Exiting.\n") +os.environ["CANDLE_DATA_DIR"] = os.environ["IMPROVE_DATA_DIR"] + +params = frm.initialize_parameters( + fdir, + default_model="csa_workflow_params.txt", +) + +main_datadir = Path(os.environ["IMPROVE_DATA_DIR"]) +raw_datadir = main_datadir / params["raw_data_dir"] +x_datadir = raw_datadir / params["x_data_dir"] +y_datadir = raw_datadir / params["y_data_dir"] +splits_dir = raw_datadir / params["splits_dir"] + +# lg = Logger(main_datadir/"csa.log") +print_fn = print +# print_fn = get_print_func(lg.logger) +print_fn(f"File path: {fdir}") + +### Source and target data sources +## Set 1 - full analysis +# source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["GDSCv1", "CTRPv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +source_datasets = ["GDSCv1"] +target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + +# only_cross_study = False +only_cross_study = True + + +## Splits +# split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 9] + +## Parameters of the experiment/run/workflow +# TODO: this should be stored as the experiment metadata that we can go back check +# epochs = 2 +# epochs = 30 +# epochs = 50 +epochs = 70 +# epochs = 100 +# epochs = 150 +# config_file_name = "csa_params.txt" +# config_file_path = fdir/config_file_name + +def build_split_fname(source, split, phasea): + """ Build split file name. If file does not exist continue """ + return f"{source_data_name}_split_{split}_{phase}.txt" + +# =============================================================== +### Generate CSA results (within- and cross-study) +# =============================================================== + +timer = Timer() +# Iterate over source datasets +# Note! The "source_data_name" iterations are independent of each other +print_fn(f"\nsource_datasets: {source_datasets}") +print_fn(f"target_datasets: {target_datasets}") +print_fn(f"split_nums: {split_nums}") +# import pdb; pdb.set_trace() +for source_data_name in source_datasets: + + # Get the split file paths + # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt + if len(split_nums) == 0: + # Get all splits + split_files = list((splits_dir).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + # num_splits = 1 + else: + # Use the specified splits + split_files = [] + for s in split_nums: + split_files.extend(list((splits_dir).glob(f"{source_data_name}_split_{s}_*.txt"))) + + files_joined = [str(s) for s in split_files] + + # -------------------- + # Preprocess and Train + # -------------------- + # import pdb; pdb.set_trace() + for split in split_nums: + print_fn(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. 
Otherwise, continue to the next split. + # split = 11 + # files_joined = [str(s) for s in split_files] + # TODO: check this! + for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + # print(f"{phase}: {fname}") + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + # import pdb; pdb.set_trace() + for target_data_name in target_datasets: + if only_cross_study and (source_data_name == target_data_name): + continue # only cross-study + print_fn(f"\nSource data: {source_data_name}") + print_fn(f"Target data: {target_data_name}") + + # EXP_ML_DATA_DIR = ig.ml_data_dir/f"{source_data_name}-{target_data_name}"/f"split_{split}" + ml_data_outdir = MAIN_ML_DATA_DIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # p1 (none): Preprocess train data + # import pdb; pdb.set_trace() + # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # TODO: placeholder for lc analysis + timer_preprocess = Timer() + # ml_data_path = graphdrp_preprocess_improve.main([ + # "--train_split_file", f"{source_data_name}_split_{split}_train.txt", + # "--val_split_file", f"{source_data_name}_split_{split}_val.txt", + # "--test_split_file", str(test_split_file_name), + # "--ml_data_outdir", str(ml_data_outdir), + # "--y_col_name", y_col_name + # ]) + print_fn("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + # test_split_file = f"{source_data_name}_split_{split}_test.txt" + print_fn(f"train_split_file: {train_split_file}") + print_fn(f"val_split_file: {val_split_file}") + print_fn(f"test_split_file: {test_split_file}") + print_fn(f"ml_data_outdir: {ml_data_outdir}") + # import pdb; pdb.set_trace() + preprocess_run = ["python", + "PathDSP_preprocess_improve.py", + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--ml_data_outdir", str(ml_data_outdir), + "--y_col_name", str(y_col_name) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_preprocess.display_timer(print_fn) + + # p2 (p1): Train model + # Train a single model for a given [source, split] pair + # Train using train samples and early stop using val samples + # import pdb; pdb.set_trace() + model_outdir = MAIN_MODEL_DIR/f"{source_data_name}"/f"split_{split}" + if model_outdir.exists() is False: + train_ml_data_dir = ml_data_outdir + val_ml_data_dir = ml_data_outdir + timer_train = Timer() + # graphdrp_train_improve.main([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir), + # "--epochs", str(epochs), # available in config_file + # # "--ckpt_directory", str(MODEL_OUTDIR), # TODO: we'll use candle known param ckpt_directory instead of model_outdir + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nTrain") + print_fn(f"train_ml_data_dir: {train_ml_data_dir}") + 
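                # Note: train_ml_data_dir and val_ml_data_dir both point at ml_data_outdir
                # because the preprocessing step writes the train/val/test ML data for a
                # (source, target, split) combination into a single folder. Also note that
                # subprocess.run(..., check=True) raises CalledProcessError on a non-zero
                # exit code, so a failed preprocess, train, or infer step stops the whole
                # cross-study loop rather than being skipped.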
print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"model_outdir: {model_outdir}") + # import pdb; pdb.set_trace() + train_run = ["python", + "PathDSP_train_improve.py", + "--train_ml_data_dir", str(train_ml_data_dir), + "--val_ml_data_dir", str(val_ml_data_dir), + "--model_outdir", str(model_outdir), + "--epochs", str(epochs), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_train.display_timer(print_fn) + + # Infer + # p3 (p1, p2): Inference + # import pdb; pdb.set_trace() + test_ml_data_dir = ml_data_outdir + model_dir = model_outdir + infer_outdir = MAIN_INFER_OUTDIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + timer_infer = Timer() + # graphdrp_infer_improve.main([ + # "--test_ml_data_dir", str(test_ml_data_dir), + # "--model_dir", str(model_dir), + # "--infer_outdir", str(infer_outdir), + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nInfer") + print_fn(f"test_ml_data_dir: {test_ml_data_dir}") + #print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"infer_outdir: {infer_outdir}") + # import pdb; pdb.set_trace() + infer_run = ["python", + "PathDSP_infer_improve.py", + "--test_ml_data_dir", str(test_ml_data_dir), + "--model_dir", str(model_dir), + "--infer_outdir", str(infer_outdir), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(infer_run, capture_output=True, + text=True, check=True) + timer_infer.display_timer(print_fn) + +timer.display_timer(print_fn) +print_fn("Finished a full cross-study run.") diff --git a/csa_workflow_params.txt b/csa_workflow_params.txt new file mode 100644 index 0000000..66a69f8 --- /dev/null +++ b/csa_workflow_params.txt @@ -0,0 +1,8 @@ +[Global_Params] +model_name = "CSA_workflow" + +[CSA_Workflow] +raw_data_dir = "raw_data" +x_data_dir = "x_data" +y_data_dir = "y_data" +splits_dir = "splits" diff --git a/download_author_data.sh b/download_author_data.sh new file mode 100644 index 0000000..999fc4c --- /dev/null +++ b/download_author_data.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# arg 1: output directory to download model-specific data +# If run via container, it needs to be downloaded under the csa_data folder + +OUTPUT_DIR=$1 + +# Check if the data is already downloaded +if [ -f "$OUTPUT_DIR/.downloaded" ]; then + echo "Data present, skipping download" +# Download data if no other download is in progress +elif [ ! -f "$OUTPUT_DIR/.downloading_author_data" ]; then + touch "$OUTPUT_DIR/.downloading_author_data" + # Download files + # Unzip files + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/MSigdb.zip + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/raw_data.zip + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/STRING.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/MSigdb.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/raw_data.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/STRING.zip + touch "$OUTPUT_DIR/.downloaded" + rm "$OUTPUT_DIR/.downloading_author_data" +else + # Wait for other download to finish + iteration=0 + echo "Waiting for external download" + while [ -f "$OUTPUT_DIR/.downloading_author_data" ]; do + iteration=$((iteration + 1)) + if [ "$iteration" -gt 10 ]; then + # Download takes too long, exit and warn user + echo "Check output directory, download still in progress after $iteration minutes." 
+ exit 1 + fi + sleep 60 + done +fi diff --git a/download_csa.sh b/download_csa.sh new file mode 100644 index 0000000..7bfc04c --- /dev/null +++ b/download_csa.sh @@ -0,0 +1,2 @@ +# wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +wget --cut-dirs=8 -P ./ -nH -np -m https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/ \ No newline at end of file diff --git a/execute_in_conda.sh b/execute_in_conda.sh new file mode 100644 index 0000000..303f81b --- /dev/null +++ b/execute_in_conda.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +model_env=$1 ; shift +script=$1 ; shift +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $model_env +CMD="python ${script} $@" + +echo "running command ${CMD}" + +$CMD diff --git a/get_hosts_polaris.py b/get_hosts_polaris.py new file mode 100644 index 0000000..196008a --- /dev/null +++ b/get_hosts_polaris.py @@ -0,0 +1,14 @@ +import sys + +if __name__ == "__main__": + ranks_per_node = 4 + fname = sys.argv[1] + output = "" + with open(fname, "r") as f: + for i, line in enumerate(f): + line = line.strip("\n") + if i == 0: + output += f"{line}" + for _ in range(ranks_per_node): + output += f",{line}" + print(output) \ No newline at end of file diff --git a/hpo_deephyper_hyperparameters.py b/hpo_deephyper_hyperparameters.py new file mode 100644 index 0000000..56c446b --- /dev/null +++ b/hpo_deephyper_hyperparameters.py @@ -0,0 +1,43 @@ +hyperparams = [ + {"name": "batch_size", + "type": int, + "min": 8, + "max": 512, + "default": 64, + "log_uniform": True + }, + {"name": "learning_rate", + "type": float, + "min": 1e-6, + "max": 1e-2, + "default": 0.001, + "log_uniform": True + }, + { + "name": "dropout", + "type": float, + "min": 0, + "max": 0.5, + "default": 0, + "log_uniform": False + } +] + + +''' +{ +"name": "dropout", +"type": float, +"min": 0, +"max": 0.5, +"default": 0, +"log_uniform": False +} + +{ +"name": "early_stopping", +"type": "categorical", +"choices": [True, False], +"default": False +} +''' \ No newline at end of file diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini new file mode 100644 index 0000000..2178f9c --- /dev/null +++ b/hpo_deephyper_params.ini @@ -0,0 +1,21 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +ml_data_dir = ./ml_data/CCLE-CCLE/split_0 +y_col_name = auc +model_name = PathDSP +model_scripts_dir = ./ +model_environment = ./PathDSP_env/ +epochs = 3 +output_dir = ./test +source = CCLE +split = 0 +max_evals = 5 +interactive_session = True + + + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py new file mode 100644 index 0000000..0588ea3 --- /dev/null +++ b/hpo_deephyper_params_def.py @@ -0,0 +1,62 @@ +additional_definitions = [ + {"name": "source", + "type": str, + "default": "GDSCv1", + "help": "source dataset for HPO" + }, + {"name": "split", + "type": str, + "default": "4", + "help": "Split of the source datasets for HPO" + }, + {"name": "model_name", + "type": str, + "default": 'PathDSP', + "help": "Name of the deep learning model" + }, + {"name": "model_scripts_dir", + "type": str, + "default": './', + "help": "Path to the model repository" + }, + {"name": "model_environment", + "type": str, + "default": '', + "help": "Name of your model conda environment" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + 
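    # For reference (inferred from hpo_deephyper_hyperparameters.py and the HpProblem loop in
    # hpo_deephyper_subprocess.py), the tunable search space is roughly:
    #   batch_size    in (8, 512)      log-uniform, default 64
    #   learning_rate in (1e-6, 1e-2)  log-uniform, default 1e-3
    #   dropout       in (0, 0.5)      uniform,     default 0
    # Each sampled value is forwarded to the train script by
    # hpo_deephyper_subprocess_train.sh as "--<name> <value>".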
{"name": "use_singularity", + "type": bool, + "default": True, + "help": "Do you want to use singularity image for running the model?" + }, + {"name": "singularity_image", + "type": str, + "default": '', + "help": "Singularity image file of the model" + }, + {"name": "val_loss", + "type": str, + "default": 'mse', + "help": "Type of loss for validation" + }, + {"name": "max_evals", + "type": int, + "default": 20, + "help": "Number of evaluations" + }, + {"name": "interactive_session", + "type": bool, + "default": True, + "help": "Are you using an interactive session?" + }, + {"name": "ml_data_dir", + "type": str, + "default": './', + "help": "Location of the preprocessed data." + } + ] \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py new file mode 100644 index 0000000..01c950e --- /dev/null +++ b/hpo_deephyper_subprocess.py @@ -0,0 +1,171 @@ +""" +Before running this script, first need to preprocess the data. +This can be done by running preprocess_example.sh + +and the env vars $PYTHONPATH is set: +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +It also assumes that your processed training and validation data is in ml_data_dir. +Model output files will be saved in output_dir/{source}/split_{split}. + +mpirun -np 10 python hpo_subprocess.py +""" +# import copy +import json +import subprocess +import pandas as pd +import os +import time +from pathlib import Path +import logging +import mpi4py +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +from mpi4py import MPI +import socket +import hpo_deephyper_params_def +from hpo_deephyper_hyperparameters import hyperparams +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# --------------------- +# Initialize parameters for DeepHyper HPO +# --------------------- +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +global params +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="hpo_deephyper_params.ini", + additional_definitions=hpo_deephyper_params_def.additional_definitions +) +output_dir = Path(params['output_dir']) +if output_dir.exists() is False: + os.makedirs(output_dir, exist_ok=True) +#params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" +params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" +params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() + +if params['interactive_session']: + num_gpus_per_node = 2 + os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) + cuda_name = "cuda:" + str(rank % num_gpus_per_node) +else: + # CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh + local_rank = os.environ["PMI_LOCAL_RANK"] + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - 
%(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +for hp in hyperparams: + if hp['type'] == "categorical": + print("not implemented yet") + else: + if hp['log_uniform']: + problem.add_hyperparameter((hp['min'], hp['max'], "log-uniform"), + hp['name'], default_value=hp['default']) + else: + problem.add_hyperparameter((hp['min'], hp['max']), + hp['name'], default_value=hp['default']) + +params['hyperparams'] = [d['name'] for d in hyperparams] + + +# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + + +@profile +def run(job, optuna_trial=None): + model_outdir_job_id = Path(params['model_outdir'] + f"/{job.id}") + #learning_rate = job.parameters["learning_rate"] + #batch_size = job.parameters["batch_size"] + + train_run = ["bash", "hpo_deephyper_subprocess_train.sh", + str(params['model_environment']), + str(params['script_name']), + str(params['ml_data_dir']), + str(model_outdir_job_id), + str(params['epochs']), + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ] + for hp in params['hyperparams']: + train_run = train_run + [str(hp)] + train_run = train_run + [str(job.parameters[hp])] + + print(f"Launching run: ") + print(train_run) + subprocess_res = subprocess.run(train_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + # Logger + print(f"returncode = {subprocess_res.returncode}") + result_file_name_stdout = model_outdir_job_id / 'logs.txt' + if model_outdir_job_id.exists() is False: # If subprocess fails, model_dir may not be created and we need to write the log files in model_dir + os.makedirs(model_outdir_job_id, exist_ok=True) + with open(result_file_name_stdout, 'w') as file: + file.write(subprocess_res.stdout) + + # Load val_scores and get val_loss + f = open(model_outdir_job_id / "val_scores.json") + val_scores = json.load(f) + objective = -val_scores[params['val_loss']] + + # Checkpoint the model weights + with open(f"{params['output_dir']}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + search = CBO( + problem, + evaluator, + log_dir=params['output_dir'], + verbose=1, + ) + results = search.search(max_evals=params['max_evals']) + results = results.sort_values(f"m:{params['val_loss']}", ascending=True) + results.to_csv(f"{params['output_dir']}/hpo_results.csv", index=False) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("Finished deephyper HPO.") diff --git a/hpo_deephyper_subprocess_train.sh b/hpo_deephyper_subprocess_train.sh new file mode 100644 index 0000000..c3f8984 --- /dev/null +++ b/hpo_deephyper_subprocess_train.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# 
https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +CONDA_ENV=$1 +echo "Activated conda commands in shell script" +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $CONDA_ENV +echo "Activated conda env $CONDA_ENV" + +# get mandatory arguments +SCRIPT=$2 +input_dir=$3 +output_dir=$4 +epochs=$5 +CUDA_VISIBLE_DEVICES=$6 + +command="python $SCRIPT --input_dir $input_dir --output_dir $output_dir --epochs $epochs " + + +# append hyperparameter arguments to python call +for i in $(seq 7 $#) +do + if [ $(($i % 2)) == 0 ]; then + command="${command} ${!i}" + else + command="${command} --${!i}" + fi +done + + +echo "command: $command" + +# run python script +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} $command + + +source $conda_path/bin/deactivate +echo "Deactivated conda env $CONDA_ENV" diff --git a/hpo_scale.sh b/hpo_scale.sh new file mode 100644 index 0000000..93d517f --- /dev/null +++ b/hpo_scale.sh @@ -0,0 +1,58 @@ +#!/bin/bash +#PBS -l select=2:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:60:00 +#PBS -q debug +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +# Assume conda is installed +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
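# Worked example (illustrative, assuming ranks are assigned to hosts in list order): with
# the 2-node debug allocation requested above, NNODES=2 and NRANKS_PER_NODE=4, so
# NTOTRANKS = 2*4 + 1 = 9. get_hosts_polaris.py expands a nodefile listing nodeA and nodeB
# into "nodeA,nodeA,nodeA,nodeA,nodeA,nodeB,nodeB,nodeB,nodeB", i.e. the first node appears
# one extra time so the additional rank (presumably the DeepHyper search process) lands on
# nodeA alongside its 4 workers, while every other node runs 4 workers.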
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall \ + ./set_affinity_gpu_polaris.sh python hpo_subprocess.py \ No newline at end of file diff --git a/hpo_scale_singularity_debug.sh b/hpo_scale_singularity_debug.sh new file mode 100644 index 0000000..f8aa1b6 --- /dev/null +++ b/hpo_scale_singularity_debug.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=2:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:60:00 +#PBS -q debug +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_scale_singularity_debug_scaling.sh b/hpo_scale_singularity_debug_scaling.sh new file mode 100644 index 0000000..722822f --- /dev/null +++ b/hpo_scale_singularity_debug_scaling.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=4:ncpus=64:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:30:00 +#PBS -q debug-scaling +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_scale_singularity_prod.sh b/hpo_scale_singularity_prod.sh new file mode 100644 index 0000000..99fd639 --- /dev/null +++ b/hpo_scale_singularity_prod.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=10:ncpus=64:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:10:00 +#PBS -q prod +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) +# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py new file mode 100644 index 0000000..abf7a9a --- /dev/null +++ b/hpo_subprocess_singularity.py @@ -0,0 +1,160 @@ +""" +Before running this script, first need to preprocess the data. 
+ +It is assumed that the csa benchmark data is downloaded via download_csa.sh +and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: +export IMPROVE_DATA_DIR="./csa_data/" +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +mpirun -np 10 python hpo_subprocess.py +""" +# import copy +import json +import subprocess +import pandas as pd +import os +import logging +import mpi4py +from mpi4py import MPI +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +import socket + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() +local_rank = os.environ["PMI_LOCAL_RANK"] + +# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh +# uncomment the below commands if running via interactive node +#num_gpus_per_node = 3 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=8) +problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), + "learning_rate", default_value=0.0004) +problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.1) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + +# --------------------- +# Some IMPROVE settings +# --------------------- +source = "GDSCv1" +split = 0 +train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +model_outdir = f"dh_hpo_improve/{source}/split_{split}" +log_dir = "dh_hpo_logs/" +subprocess_bashscript = "subprocess_train_singularity.sh" + + +@profile +def run(job, optuna_trial=None): + + # config = copy.deepcopy(job.parameters) + # params = { + # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, + # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, + # "verbose": False, + # } + # if len(config) > 0: + # remap_hyperparameters(config) + # params.update(config) + + model_outdir_job_id = model_outdir + f"/{job.id}" + learning_rate = job.parameters["learning_rate"] + batch_size = job.parameters["batch_size"] + dropout = job.parameters["dropout"] + # val_scores = main_train_grapdrp([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir_job_id), + # ]) + subprocess_res = subprocess.run( + [ + "bash", subprocess_bashscript, + str(train_ml_data_dir), + str(val_ml_data_dir), + str(model_outdir_job_id), + str(learning_rate), + str(batch_size), + str(dropout), + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ], + capture_output=True, text=True, check=True + ) + + # print(subprocess_res.stdout) + # print(subprocess_res.stderr) + + # Load val_scores and get val_loss + # f = open(model_outdir + "/val_scores.json") + f = open(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir_job_id, 
"val_scores.json")) + val_scores = json.load(f) + objective = -val_scores["val_loss"] + # print("objective:", objective) + + # Checkpoint the model weights + with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + + search = CBO( + problem, + evaluator, + log_dir=log_dir, + verbose=1, + ) + + # max_evals = 2 + # max_evals = 4 + # max_evals = 10 + # max_evals = 20 + max_evals = 100 + # max_evals = 100 + results = search.search(max_evals=max_evals) + results = results.sort_values("m:val_loss", ascending=True) + results.to_csv(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir, "hpo_results.csv"), index=False) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("Finished deephyper HPO.") diff --git a/hyperparameters_default.json b/hyperparameters_default.json new file mode 100644 index 0000000..a0ca25d --- /dev/null +++ b/hyperparameters_default.json @@ -0,0 +1,82 @@ +{ + "graphdrp": { + "CCLE": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "CTRPv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "gCSI": { + "batch_size": 256, + "learning_rate": 0.0001 + } + }, + "HiDRA": { + "CCLE": {}, + "CTRPv2": {}, + "GDSCv1": {}, + "GDSCv2": {}, + "gCSI": {} + }, + "IGTD": { + "CCLE": {}, + "CTRPv2": {}, + "GDSCv1": {}, + "GDSCv2": {}, + "gCSI": {} + }, + "Paccmann_MCA": { + "CCLE": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "CTRPv2": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "GDSCv2": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.0001 + } + }, + "PathDSP": { + "CCLE": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "CTRPv2": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "GDSCv1": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "GDSCv2": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "gCSI": { + "batch_size": 12, + "learning_rate": 0.0004 + } + } +} \ No newline at end of file diff --git a/hyperparameters_hpo.json b/hyperparameters_hpo.json new file mode 100644 index 0000000..f798500 --- /dev/null +++ b/hyperparameters_hpo.json @@ -0,0 +1,101 @@ +{ + "graphdrp": { + "CCLE": { + "batch_size": 64, + "learning_rate": 0.0012233565752066463 + }, + "CTRPv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 1024, + "learning_rate": 0.06408519376992045 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.0005245278010578158 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.00015145175483629418 + } + }, + + "HiDRA": { + "CCLE": { + "batch_size": 16, + "learning_rate": 0.0011825165353494742 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 64, + "learning_rate": 0.0033642852862458077 + }, + "GDSCv2": {}, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.00685103425830459 + } + }, + "IGTD": { + "CCLE": { + "batch_size": 16, + "learning_rate": 0.00044116909746320135 + }, + "CTRPv2": { + "batch_size": 512, + 
"learning_rate": 0.0005219211138261751 + }, + "GDSCv1": { + "batch_size": 32, + "learning_rate": 0.0006637139133709175 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.0005058417379141607 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.0008434053110179821 + } + }, + "Paccmann_MCA": { + "CCLE": { + "batch_size": 512, + "learning_rate": 0.00019306380321264862 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 128, + "learning_rate": 2.18431820219191e-05 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.00010468988102446662 + }, + "gCSI": { + "batch_size": 256, + "learning_rate": 7.092892399034623e-05 + } + }, + "PathDSP": { + "CCLE": { + "batch_size": 64, + "learning_rate": 9.043143432931976e-05 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 128, + "learning_rate": 7.565669627288258e-05 + }, + "GDSCv2": { + "batch_size": 16, + "learning_rate": 6.916681700876223e-05 + }, + "gCSI": { + "batch_size": 128, + "learning_rate": 0.00024319069525915603 + } + } +} \ No newline at end of file diff --git a/infer.sh b/infer.sh new file mode 100755 index 0000000..f1dcd94 --- /dev/null +++ b/infer.sh @@ -0,0 +1,7 @@ +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_infer_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/install_polaris.sh b/install_polaris.sh new file mode 100644 index 0000000..46695e8 --- /dev/null +++ b/install_polaris.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# install script is not needed as it's troublesome to install mpi4py libraray +# use conda base environment directly instead + +# From Romain Egele (this script is called install.sh) + +# Generic installation script for DeepHyper on ALCF's Polaris. +# This script is meant to be run on the login node of the machine. +# It will install DeepHyper and its dependencies in the current directory. +# A good practice is to create a `build` folder and launch the script from there, +# e.g. from the root of the DeepHyper repository: +# $ mkdir build && cd build && ../install/alcf/polaris.sh +# The script will also create a file named `activate-dhenv.sh` that will +# Setup the environment each time it is sourced `source activate-dhenv.sh`. 
+ +set -xe + +# Load modules available on the current system +module load PrgEnv-gnu +# conda is not avilable +# need to install conda locally +#module load conda/2023-10-04 + +# Copy the base conda environment +conda create -p dhenv python=3.9 pip -y +conda activate dhenv/ +pip install --upgrade pip + +# For mpi4py +#module swap PrgEnv-nvhpc PrgEnv-gnu +module load nvhpc-mixed +git clone https://github.com/mpi4py/mpi4py.git +cd mpi4py/ +MPICC=CC python setup.py install +cd ../ +#conda install mpi4py --yes + +# Install the DeepHyper's Python package +git clone -b develop git@github.com:deephyper/deephyper.git +pip install -e "deephyper/[hps,mpi]" + +# Create activation script +touch activate-dhenv.sh +echo "#!/bin/bash" >> activate-dhenv.sh + +# Append modules loading and conda activation +echo "" >> activate-dhenv.sh +echo "module load PrgEnv-gnu" >> activate-dhenv.sh +#echo "module load conda/2023-10-04" >> activate-dhenv.sh +conda_path=$(dirname $(dirname $(which conda))) +echo "source $conda_path/bin/activate $PWD/dhenv/" >> activate-dhenv.sh diff --git a/model_params_def.py b/model_params_def.py new file mode 100644 index 0000000..a9680db --- /dev/null +++ b/model_params_def.py @@ -0,0 +1,68 @@ +pathdsp_preprocess_params = [ + {"name": "bit_int", + "type": int, + "default": 128, + "help": "Number of bits for morgan fingerprints.", + }, + {"name": "permutation_int", + "type": int, + "default": 3, + "help": "Number of permutation for calculating enrichment scores.", + }, + {"name": "seed_int", + "type": int, + "default": 42, + "help": "Random seed for random walk algorithm.", + }, + {"name": "cpu_int", + "type": int, + "default": 20, + "help": "Number of cpus to use when calculating pathway enrichment scores.", + }, + {"name": "drug_bits_file", + "type": str, + "default": "drug_mbit_df.txt", + "help": "File name to save the drug bits file.", + }, + {"name": "dgnet_file", + "type": str, + "default": "DGnet.txt", + "help": "File name to save the drug target net file.", + }, + {"name": "mutnet_file", + "type": str, + "default": "MUTnet.txt", + "help": "File name to save the mutation net file.", + }, + {"name": "cnvnet_file", + "type": str, + "default": "CNVnet.txt", + "help": "File name to save the CNV net file.", + }, + {"name": "exp_file", + "type": str, + "default": "EXPnet.txt", + "help": "File name to save the EXP net file.", + }, +] + +pathdsp_train_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1." + }, + {"name": "dropout", + "type": float, + "default": 0.1, + "help": "Dropout rate for the optimizer." + }, +] + +pathdsp_infer_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1." 
+ }, +] \ No newline at end of file diff --git a/PathDSP/FNN.py b/model_utils/FNN.py similarity index 99% rename from PathDSP/FNN.py rename to model_utils/FNN.py index a05247b..01c9acc 100644 --- a/PathDSP/FNN.py +++ b/model_utils/FNN.py @@ -30,7 +30,7 @@ import myDataloader as mydl import myDatasplit as mysplit import myUtility as myutil -import myPlotter as myplot +#import myPlotter as myplot import myMetrics as mymts import shap as sp diff --git a/model_utils/FNN_new.py b/model_utils/FNN_new.py new file mode 100644 index 0000000..0f84e5d --- /dev/null +++ b/model_utils/FNN_new.py @@ -0,0 +1,299 @@ +""" +Train a neural network for regression task: + cv: 10 + batch size: 8 + initializer: He normal initializer + optimizer: AdamMax + learning rate: 0.0004 + loss: RMSE + +Calculate RMSE at once, Oct. 3, 2020 revised +""" + +import os +import argparse +import numpy as np +import pandas as pd +import scipy.stats as scistat +from datetime import datetime + +import sklearn.preprocessing as skpre +import sklearn.model_selection as skms +import sklearn.metrics as skmts +import sklearn.utils as skut + +import torch as tch +import torch.utils.data as tchud + +import myModel as mynet +import myFit as myfit +import myDataloader as mydl +import myDatasplit as mysplit +import myUtility as myutil +#import myPlotter as myplot +import myMetrics as mymts +import polars as pl + +#import shap as sp + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=30, verbose=True, path= os.environ.get("CANDLE_DATA_DIR") + "/Data/checkpoint.pt") # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # 
train result + train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load(os.environ.get('CANDLE_DATA_DIR') + '/Data/checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + +def predict(net, test_dl, device): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + # return + return prediction_list + + +def main(params): + start_time = datetime.now() + # load data + print('loadinig data') + # train_df = pd.read_csv(params['train_data'], header=0, index_col=[0,1], sep="\t") + # val_df = pd.read_csv(params['val_data'], header=0, index_col=[0,1], sep="\t") + # test_df = pd.read_csv(params['test_data'], header=0, index_col=[0,1], sep="\t") + train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() + val_df 
= pl.read_csv(params['val_data'], separator = "\t").to_pandas() + + # shuffle + #sdf = skut.shuffle(df, random_state=params["seed_int"]) + + # set parameters + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=params["gpu_int"]) + #kFold = params["cv_int"] + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + opt_fn = tch.optim.Adam + + # create result list + # loss_df_list = [] + # score_df_list = [] + # ytest_df_list = [] + # shap_df_list = [] + # # train with cross-validation + #kf = skms.KFold(n_splits=kFold, random_state=params['seed_int'], shuffle=True) + #X_df = train_df.iloc[:, 0:-1] + #y_df = train_df.iloc[:, -1] + # save best model with lowest RMSE +# best_rmse = 10000 +# best_model = None +# best_fold = 0 +# # for i, (train_index, test_index) in enumerate(kf.split(X_df, y_df)): + #n_fold = i+1 + #print('Fold={:}/{:}'.format(n_fold, params['cv_int'])) + # get train/test splits + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + + # get train/valid splits from train + #Xtrain_arr, Xvalid_arr, ytrain_arr, yvalid_arr = skms.train_test_split(Xtrain_arr, ytrain_arr, + # test_size=0.1, random_state=params['seed_int']) + #print(' train={:}, valid={:}, test={:}'.format(Xtrain_arr.shape, Xvalid_arr.shape, Xtest_arr.shape)) + # prepare dataframe for output + #ytest_df = test_df.iloc[:, -1].to_frame() + # convert to numpy array + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + net.apply(init_weights) + # fit data with model + print('start training process') + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) + # if rmse <= best_rmse: + # best_rmse = rmse + # best_fold = n_fold + # best_model = trained_net + # print('best model so far at fold={:}, rmse={:}'.format(best_fold, best_rmse)) + + + # if params['shap_bool'] == True: + # print('calculate shapely values') + # # random select 100 samples as baseline + # train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + # train_dl = tchud.DataLoader(train_dataset, batch_size=200, shuffle=True) + # background, lbl = next(iter(train_dl)) + # explainer = sp.DeepExplainer(trained_net, background[:100].to(device)) + # shap_arr = explainer.shap_values(tch.from_numpy(Xtest_arr)) + # shap_df = pd.DataFrame(shap_arr, index=ytest_df.index, columns=X_df.columns) + # # append to result + # shap_df_list.append(shap_df) + + # collect result + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + 
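+    # loss_df collects per-epoch train/valid RMSE and valid R2; it is written to params['data_dir'] + '/Loss.txt' below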
+ #loss_df_list.append(loss_df) + #ytest_df_list.append(ytest_df) + # end of fold + #trained_net = None + + # save to output + #all_ytest_df = pd.concat(ytest_df_list, axis=0) + #all_loss_df = pd.concat(loss_df_list, axis=0) + loss_df.to_csv(params['data_dir'] + '/Loss.txt', header=True, index=False, sep="\t") + # if params['shap_bool'] == True: + # all_shap_df = pd.concat(shap_df_list, axis=0) + # all_shap_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.SHAP.txt', header=True, index=True, sep="\t") + + # make train/valid loss plots + best_model = trained_net + tch.save(best_model.state_dict(), params['data_dir'] + '/model.pt') + print( '[Finished in {:}]'.format(myutil.cal_time(datetime.now(), start_time)) ) + # display evaluation metrics of all folds + #mse, rmse, r_square, pccy = mymts.eval_regressor_performance(all_ytest_df, 'response', 'prediction') + + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model_utils/infer.sh b/model_utils/infer.sh new file mode 100644 index 0000000..aa14ebe --- /dev/null +++ b/model_utils/infer.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=infer.py + +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! -f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi + +if [ $# -lt 2 ]; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + +if [ -d ${CANDLE_DATA_DIR} ]; then + if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then + echo "using data from ${CANDLE_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${CANDLE_DATA_DIR}" + fi +fi + +export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} +FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" +echo $FULL_DATA_DIR + +if [ -d ${FULL_DATA_DIR} ]; then + if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then + echo "using data from ${FULL_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" + fi +else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" +fi + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD diff --git a/PathDSP/leaveOneGroupOut_FNN.py b/model_utils/leaveOneGroupOut_FNN.py similarity index 100% rename from 
PathDSP/leaveOneGroupOut_FNN.py rename to model_utils/leaveOneGroupOut_FNN.py diff --git a/PathDSP/load_pretrained.py b/model_utils/load_pretrained.py similarity index 100% rename from PathDSP/load_pretrained.py rename to model_utils/load_pretrained.py diff --git a/PathDSP/myDataloader.py b/model_utils/myDataloader.py similarity index 93% rename from PathDSP/myDataloader.py rename to model_utils/myDataloader.py index e9a4dbf..77eb482 100644 --- a/PathDSP/myDataloader.py +++ b/model_utils/myDataloader.py @@ -9,7 +9,7 @@ import torch.utils.data as tchud import sklearn.model_selection as skms import sklearn.preprocessing as skpre -import myDatasplit as mysplit +import model_utils.myDatasplit as mysplit class NumpyDataset(tchud.Dataset): """ diff --git a/PathDSP/myDatasplit.py b/model_utils/myDatasplit.py similarity index 100% rename from PathDSP/myDatasplit.py rename to model_utils/myDatasplit.py diff --git a/PathDSP/myFit.py b/model_utils/myFit.py similarity index 100% rename from PathDSP/myFit.py rename to model_utils/myFit.py diff --git a/PathDSP/myMetrics.py b/model_utils/myMetrics.py similarity index 100% rename from PathDSP/myMetrics.py rename to model_utils/myMetrics.py diff --git a/PathDSP/myModel.py b/model_utils/myModel.py similarity index 100% rename from PathDSP/myModel.py rename to model_utils/myModel.py diff --git a/PathDSP/myUtility.py b/model_utils/myUtility.py similarity index 100% rename from PathDSP/myUtility.py rename to model_utils/myUtility.py diff --git a/PathDSP/nestedCV.py b/model_utils/nestedCV.py similarity index 100% rename from PathDSP/nestedCV.py rename to model_utils/nestedCV.py diff --git a/old_deephyper/README_deephyper.md b/old_deephyper/README_deephyper.md new file mode 100644 index 0000000..725ce02 --- /dev/null +++ b/old_deephyper/README_deephyper.md @@ -0,0 +1,83 @@ +# Run HPO using deephyper on Polaris + +## Install conda environment for the curated model (PathDSP) + +``` +## install PathDSP +git clone -b deephyper https://github.com/Liuy12/PathDSP.git +## install IMPROVE +git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git +## define where to install PathDSP env +export PathDSP_env=./PathDSP_env/ +conda env create -f ./PathDSP/environment_082223.yml -p $PathDSP_env +conda activate ${PathDSP_env} +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +## Download csa benchmark data + +``` +wget --cut-dirs=7 -P ./ -nH -np -m ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +## Download additional author data (PathDSP only) + +``` +mkdir author_data +bash ./PathDSP/download_author_data.sh author_data/ +``` + +## Define environment variables + +``` +### if necessary, request an interactive node from polaris to testing purposes +### qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug +### NEED to cd into your working directory again once the job started +improve_lib="$PWD/IMPROVE/" +pathdsp_lib="$PWD/PathDSP/" +# notice the extra PathDSP folder after pathdsp_lib +echo "export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/" >> IMPROVE_env +# IMPROVE_DATA_DIR +echo "export IMPROVE_DATA_DIR=$PWD/csa_data/" >> IMPROVE_env +# AUTHOR_DATA_DIR required for PathDSP +echo "export AUTHOR_DATA_DIR=$PWD/author_data/" >> IMPROVE_env +# PathDSP_env: conda environment path for the model +echo "export PathDSP_env=$PathDSP_env" >> IMPROVE_env +source $PWD/IMPROVE_env +``` + +## Perform preprocessing + +``` +conda activate $PathDSP_env 
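+## optional: confirm the variables defined in IMPROVE_env are set before preprocessing
+## echo $IMPROVE_DATA_DIR $AUTHOR_DATA_DIR $PathDSP_env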
+## You can copy the processed files for PathDSP +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ ./PathDSP/ +## Alternatively, run the preprocess script +## This script taks around 40 mins to complete +## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ +``` + +## Perform HPO using singularity container across two nodes + +``` +## copy processed to IMPROVE_DATA_DIR +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_DATA_DIR +## specify singularity image file for PathDSP +echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env +cd PathDSP +## submit to debug queue +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity_debug.sh +## to submit to debug-scaling or prod queue +## use hpo_scale_singularity_debug_scaling.sh +## or hpo_scale_singularity_prod.sh +## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py +``` + +## Alternatively, perform HPO across two nodes based on conda + +``` +cd PathDSP +# supply environment variables to qsub +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh +## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py +``` \ No newline at end of file diff --git a/old_deephyper/hpo_subprocess.py b/old_deephyper/hpo_subprocess.py new file mode 100644 index 0000000..b925ec1 --- /dev/null +++ b/old_deephyper/hpo_subprocess.py @@ -0,0 +1,165 @@ +""" +Before running this script, first need to preprocess the data. +This can be done by running preprocess_example.sh + +It is assumed that the csa benchmark data is downloaded via download_csa.sh +and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: +export IMPROVE_DATA_DIR="./csa_data/" +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +It also assumes that your processed training data is at: "ml_data/{source}-{source}/split_{split}" +validation data is at: "ml_data/{source}-{source}/split_{split}" +model output files will be saved at "dh_hpo_improve/{source}/split_{split}" + +mpirun -np 10 python hpo_subprocess.py +""" +# import copy +import json +import subprocess +import pandas as pd +import os +import logging +import mpi4py +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +from mpi4py import MPI +import socket + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() +local_rank = os.environ["PMI_LOCAL_RANK"] + +# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh +# uncomment the below commands if running via interactive node +#num_gpus_per_node = 4 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +#cuda_name = "cuda:" + str(rank % num_gpus_per_node) + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +problem.add_hyperparameter((8, 512, 
"log-uniform"), "batch_size", default_value=64) +problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), + "learning_rate", default_value=0.001) +# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + +# --------------------- +# Some IMPROVE settings +# --------------------- +source = "GDSCv1" +split = 4 +train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +model_outdir = f"dh_hpo_improve/{source}/split_{split}" +log_dir = "dh_hpo_logs/" +subprocess_bashscript = "subprocess_train.sh" + + +@profile +def run(job, optuna_trial=None): + + # config = copy.deepcopy(job.parameters) + # params = { + # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, + # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, + # "verbose": False, + # } + # if len(config) > 0: + # remap_hyperparameters(config) + # params.update(config) + + model_outdir_job_id = model_outdir + f"/{job.id}" + learning_rate = job.parameters["learning_rate"] + batch_size = job.parameters["batch_size"] + # val_scores = main_train_grapdrp([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir_job_id), + # ]) + subprocess_res = subprocess.run( + [ + "bash", subprocess_bashscript, + str(train_ml_data_dir), + str(val_ml_data_dir), + str(model_outdir_job_id), + str(learning_rate), + str(batch_size), + #str(cuda_name) + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ], + capture_output=True, text=True, check=True + ) + + # print(subprocess_res.stdout) + # print(subprocess_res.stderr) + + # Load val_scores and get val_loss + # f = open(model_outdir + "/val_scores.json") + f = open(model_outdir_job_id + "/val_scores.json") + val_scores = json.load(f) + objective = -val_scores["val_loss"] + # print("objective:", objective) + + # Checkpoint the model weights + with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + + search = CBO( + problem, + evaluator, + log_dir=log_dir, + verbose=1, + ) + + # max_evals = 2 + # max_evals = 4 + # max_evals = 10 + # max_evals = 20 + max_evals = 10 + # max_evals = 100 + results = search.search(max_evals=max_evals) + results = results.sort_values("m:val_loss", ascending=True) + results.to_csv(model_outdir + "/hpo_results.csv", index=False) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("Finished deephyper HPO.") diff --git a/old_deephyper/subprocess_train.sh b/old_deephyper/subprocess_train.sh new file mode 100755 index 0000000..a2b1541 --- /dev/null +++ b/old_deephyper/subprocess_train.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# 
https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +CONDA_ENV=$PathDSP_env +#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +#eval "$(conda shell.bash hook)" +echo "Activated conda commands in shell script" +#conda activate $CONDA_ENV +#source activate $CONDA_ENV +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $CONDA_ENV +#source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV +#source activate $CONDA_ENV +echo "Activated conda env $CONDA_ENV" + +train_ml_data_dir=$1 +val_ml_data_dir=$2 +model_outdir=$3 +learning_rate=$4 +batch_size=$5 +#cuda_name=$6 +CUDA_VISIBLE_DEVICES=$6 + +echo "train_ml_data_dir: $train_ml_data_dir" +echo "val_ml_data_dir: $val_ml_data_dir" +echo "model_outdir: $model_outdir" +echo "learning_rate: $learning_rate" +echo "batch_size: $batch_size" +#echo "cuda_name: $cuda_name" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" + +# epochs=10 +epochs=10 +# epochs=50 + +# All train outputs are saved in params["model_outdir"] +#CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=5 +#CUDA_VISIBLE_DEVICES=6,7 +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python PathDSP_train_improve.py \ + --train_ml_data_dir $train_ml_data_dir \ + --val_ml_data_dir $val_ml_data_dir \ + --model_outdir $model_outdir \ + --epochs $epochs \ + --learning_rate $learning_rate \ + --batch_size $batch_size +# --cuda_name $cuda_name + +#conda deactivate +source $conda_path/bin/deactivate +echo "Deactivated conda env $CONDA_ENV" diff --git a/old_legacy_scripts/PathDSP_cs_model.txt b/old_legacy_scripts/PathDSP_cs_model.txt new file mode 100644 index 0000000..44a4791 --- /dev/null +++ b/old_legacy_scripts/PathDSP_cs_model.txt @@ -0,0 +1,46 @@ +[Global_Params] +model_name='PathDSP' + +[Preprocess] +train_split_file = "GDSCv1_split_0_train.txt" +val_split_file = "GDSCv1_split_0_val.txt" +test_split_file = "GDSCv1_split_0_test.txt" +ml_data_outdir = "./ml_data/GDSCv1-GDSCv1/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] +data_format = ".txt" +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +bit_int=128 +permutation_int=3 +seed_int=42 +cpu_int=20 + +[Train] +train_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_outdir = "./out_models/GDSCv1/split_0" +model_file_name = "model" +model_file_format = ".pt" +epochs=800 +batch_size = 32 +val_batch = 32 +loss = "mse" +early_stop_metric = "mse" +patience = 30 +cuda_name = "cuda:2" +learning_rate = 0.001 + +[Infer] +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_batch = 256 +cuda_name = "cuda:3" \ No newline at end of file diff --git a/old_legacy_scripts/README_old.md b/old_legacy_scripts/README_old.md new file mode 100644 index 0000000..84ea104 --- 
/dev/null +++ b/old_legacy_scripts/README_old.md @@ -0,0 +1,134 @@ +# PathDSP +Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores + +# Example usage with singularity container +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing step using processed data from original paper + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1" +``` + +Train the model + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` + +# Example usage with Conda + +Download PathDSP + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +cd PathDSP +``` + +Create environment + +``` +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Intall CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Perform preprocessing step using processed data from original paper + +``` +export CUDA_VISIBLE_DEVICES=0 +export CANDLE_DATA_DIR=./Data/ +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1" +``` + +Train the model + +``` +bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` + +# Docs from original authors (below) + +# Requirments + +# Input format + +|drug|cell|feature_1|....|feature_n|drug_response| +|----|----|--------|----|--------|----| +|5-FU|03|0|....|0.02|-2.3| +|5-FU|23|1|....|0.04|-3.4| + +Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing +# Usage: +```python +# run FNN +python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix + +Where input.txt should be in the input format shown above. 
+Example input file can be found at https://zenodo.org/record/7532963 +``` +# Data preprocessing +Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby + + +# Reference +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/old_legacy_scripts/README_old2.md b/old_legacy_scripts/README_old2.md new file mode 100644 index 0000000..aca66fc --- /dev/null +++ b/old_legacy_scripts/README_old2.md @@ -0,0 +1,181 @@ +# PathDSP +Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores + +# Download benchmark data + +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ + +``` +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +Benchmark data will be downloaded under `process_dir/csa_data/` + +# Example usage with Conda + +Download PathDSP and IMPROVE + +``` +mkdir repo +cd repo +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git +``` + +# Download author data + +``` +cd ../ +mkdir author_data +bash repo/PathDSP/download_author_data.sh author_data/ +``` + +Author data will be downloaded under `process_dir/author_data/` +PathDSP will be installed at `process_dir/repo/PathDSP` +IMPROVE will be installed at `process_dir/repo/IMPROVE` + +Create environment + +``` +cd repo/PathDSP/ +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Install CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Define enviroment variabels + +``` +improve_lib="/path/to/IMPROVE/repo/" +pathdsp_lib="/path/to/pathdsp/repo/" +# notice the extra PathDSP folder after pathdsp_lib +export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ +export IMPROVE_DATA_DIR="/path/to/csa_data/" +export AUTHOR_DATA_DIR="/path/to/author_data/" +``` + +Perform preprocessing step + +``` +# go two upper level +cd ../../ +python repo/PathDSP/PathDSP_preprocess_improve.py +``` + +Train the model + +``` +python repo/PathDSP/PathDSP_train_improve.py +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. 
Parameter definitions can be found at `process_dir/repo/PathDSP/PathDSP_default_model.txt` + +Perform inference on the testing data + +``` +python repo/PathDSP/PathDSP_infer_improve.py +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + +# Example usage with singularity container + +# Download benchmark data + +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ + +``` +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +# Download author data + +Download model specific data under csa_data/ directory + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +bash PathDSP/download_author_data.sh csa_data/ +``` + +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing using csa benchmarking data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/ +``` + +Train the model + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. + +Perform inference on the testing data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + + +# Docs from original authors (below) + +# Requirments + +# Input format + +|drug|cell|feature_1|....|feature_n|drug_response| +|----|----|--------|----|--------|----| +|5-FU|03|0|....|0.02|-2.3| +|5-FU|23|1|....|0.04|-3.4| + +Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing +# Usage: +```python +# run FNN +python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix + +Where input.txt should be in the input format shown above. 
+Example input file can be found at https://zenodo.org/record/7532963 +``` +# Data preprocessing +Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby + + +# Reference +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/old_legacy_scripts/TODO.txt b/old_legacy_scripts/TODO.txt new file mode 100644 index 0000000..7f87e3c --- /dev/null +++ b/old_legacy_scripts/TODO.txt @@ -0,0 +1,6 @@ +1. Expose parameters - give up on private crap +2. Fix params file +3. Document on docs +4. Use R script to make preprocess.sh +5. make train script +6. make infer script diff --git a/old_legacy_scripts/environment.yml b/old_legacy_scripts/environment.yml new file mode 100644 index 0000000..ad9d1d7 --- /dev/null +++ b/old_legacy_scripts/environment.yml @@ -0,0 +1,231 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - anyio=3.6.2 + - appdirs=1.4.4 + - argon2-cffi=21.3.0 + - argon2-cffi-bindings=21.2.0 + - asttokens=2.2.1 + - async-lru=2.0.2 + - attrs=23.1.0 + - babel=2.12.1 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.4 + - beautifulsoup4=4.12.2 + - blas=1.0 + - bleach=6.0.0 + - boost=1.78.0 + - boost-cpp=1.78.0 + - bottleneck=1.3.5 + - brotli=1.0.9 + - brotli-bin=1.0.9 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.05.30 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cloudpickle=2.2.1 + - colorama=0.4.6 + - comm=0.1.3 + - contourpy=1.0.7 + - cryptography=39.0.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - defusedxml=0.7.1 + - entrypoints=0.4 + - executing=1.2.0 + - expat=2.5.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - flit-core=3.9.0 + - fontconfig=2.14.1 + - fonttools=4.39.4 + - freetype=2.12.1 + - gettext=0.21.1 + - giflib=5.2.1 + - glib=2.76.3 + - glib-tools=2.76.3 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.2 + - gseapy=1.0.5 + - icu=72.1 + - idna=3.4 + - importlib-metadata=6.6.0 + - importlib_metadata=6.6.0 + - importlib_resources=5.12.0 + - intel-openmp=2023.1.0 + - ipykernel=6.23.1 + - ipython=8.13.2 + - jedi=0.18.2 + - jinja2=3.1.2 + - joblib=1.2.0 + - jpeg=9e + - json5=0.9.5 + - jsonschema=4.17.3 + - jupyter-lsp=2.1.0 + - jupyter_client=8.2.0 + - jupyter_core=4.12.0 + - jupyter_events=0.6.3 + - jupyter_server=2.5.0 + - jupyter_server_terminals=0.4.4 + - jupyterlab=4.0.0 + - jupyterlab_pygments=0.2.2 + - jupyterlab_server=2.22.1 + - kiwisolver=1.4.4 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libblas=3.9.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcblas=3.9.0 + - libdeflate=1.17 + - libexpat=2.5.0 + - libffi=3.4.4 + - libgcc-ng=12.2.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libglib=2.76.3 + - libiconv=1.17 + - libidn2=2.3.4 + - liblapack=3.9.0 + - libllvm11=11.1.0 + - libpng=1.6.39 + - libsodium=1.0.18 + - 
libstdcxx-ng=12.2.0 + - libtasn1=4.19.0 + - libtiff=4.5.0 + - libunistring=0.9.10 + - libuuid=1.41.5 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.10.4 + - libzlib=1.2.13 + - llvm-openmp=16.0.4 + - llvmlite=0.39.1 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.7.1 + - matplotlib-inline=0.1.6 + - mistune=2.0.5 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - munkres=1.1.4 + - nbclient=0.8.0 + - nbconvert-core=7.4.0 + - nbformat=5.8.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=2.8.4 + - notebook-shim=0.2.3 + - numba=0.56.4 + - numexpr=2.8.4 + - numpy=1.21.6 + - openh264=2.1.1 + - openssl=1.1.1v + - packaging=23.0 + - pandas=1.5.3 + - pandocfilters=1.5.0 + - parso=0.8.3 + - patsy=0.5.3 + - pcre2=10.40 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.0.1 + - pixman=0.40.0 + - pkgutil-resolve-name=1.3.10 + - pooch=1.4.0 + - prometheus_client=0.16.0 + - prompt-toolkit=3.0.38 + - prompt_toolkit=3.0.38 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pycairo=1.23.0 + - pycparser=2.21 + - pygments=2.15.1 + - pyopenssl=23.0.0 + - pyparsing=3.0.9 + - pyrsistent=0.19.3 + - pysocks=1.7.1 + - python=3.10.11 + - python-dateutil=2.8.2 + - python-fastjsonschema=2.17.1 + - python-json-logger=2.0.7 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyyaml=6.0 + - pyzmq=25.0.2 + - rdkit=2023.03.1 + - readline=8.2 + - reportlab=3.6.13 + - requests=2.29.0 + - rfc3339-validator=0.1.4 + - rfc3986-validator=0.1.1 + - scikit-learn=1.0.2 + - scipy=1.10.1 + - seaborn=0.12.2 + - seaborn-base=0.12.2 + - send2trash=1.8.2 + - setuptools=66.0.0 + - shap=0.41.0 + - six=1.16.0 + - slicer=0.0.7 + - sniffio=1.3.0 + - soupsieve=2.3.2.post1 + - sqlalchemy=1.4.46 + - sqlite=3.41.2 + - stack_data=0.6.2 + - statsmodels=0.14.0 + - sympy=1.11.1 + - tbb=2021.8.0 + - terminado=0.17.1 + - threadpoolctl=3.1.0 + - tinycss2=1.2.1 + - tk=8.6.12 + - tomli=2.0.1 + - torchvision=0.15.2 + - tornado=6.3.2 + - tqdm=4.65.0 + - traitlets=5.9.0 + - typing_extensions=4.5.0 + - tzdata=2023c + - unicodedata2=15.0.0 + - urllib3=1.26.15 + - wcwidth=0.2.6 + - webencodings=0.5.1 + - websocket-client=1.5.2 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.4.2 + - yaml=0.2.5 + - zeromq=4.3.4 + - zipp=3.15.0 + - zlib=1.2.13 + - zstd=1.5.5 diff --git a/old_legacy_scripts/environment_081723.yml b/old_legacy_scripts/environment_081723.yml new file mode 100644 index 0000000..870884e --- /dev/null +++ b/old_legacy_scripts/environment_081723.yml @@ -0,0 +1,156 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.7.22 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - giflib=5.2.1 + - glib=2.69.1 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - intel-openmp=2023.1.0 + - jbig=2.1 + - jinja2=3.1.2 + - jpeg=9e + - kiwisolver=1.4.4 + - 
lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libsqlite=3.42.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - pandas=1.5.3 + - pcre=8.45 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - pthread-stubs=0.4 + - pycairo=1.24.0 + - pycparser=2.21 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - rdkit=2022.03.2 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - setuptools=68.0.0 + - six=1.16.0 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - packaging==23.1 + - patsy==0.5.3 + - protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 diff --git a/old_legacy_scripts/get_test_data.py b/old_legacy_scripts/get_test_data.py new file mode 100644 index 0000000..9de863d --- /dev/null +++ b/old_legacy_scripts/get_test_data.py @@ -0,0 +1,14 @@ +import candle +import os + +# Assumes CANDLE_DATA_DIR is an environment variable +os.environ['CANDLE_DATA_DIR'] = 'tmp/' + +fname='input_txt_Nick.txt' +origin='http://chia.team/IMPROVE_data/input_txt_Nick.txt' + +# Download and unpack the data in CANDLE_DATA_DIR +candle.file_utils.get_file(fname, origin) + +# Do it again to confirm it's not re-downloading +#candle.file_utils.get_file(fname, origin) diff --git a/old_legacy_scripts/improve_utils.py b/old_legacy_scripts/improve_utils.py new file mode 100644 index 0000000..1956ee1 --- /dev/null +++ b/old_legacy_scripts/improve_utils.py @@ -0,0 +1,735 @@ +import os +import numpy as np +import pandas as pd +from pathlib import Path, PosixPath +from math import sqrt +from scipy import stats +from typing import List, Union, Optional, Tuple + + +fdir = Path(__file__).resolve().parent + + +# ----------------------------------------------------------------------------- +# TODO +# Note! +# We need to decide how this utils file will be provided for each model. +# Meanwhile, place this .py file in the level as your data preprocessing script. 
+# For example: +# GraphDRP/ +# |_______ preprocess.py +# |_______ improve_utils.py +# | +# | +# ----------------------------------------------------------------------------- + + + +# ----------------------------------------------------------------------------- +# Global variables +# ---------------- +# These are globals for all models +import types +improve_globals = types.SimpleNamespace() + +# TODO: +# This is CANDLE_DATA_DIR (or something...). +# How this is going to be passed to the code? +improve_globals.main_data_dir = PosixPath(os.environ.get("CANDLE_DATA_DIR") + "/csa_data/") +# improve_globals.main_data_dir = fdir/"improve_data_dir" +# imp_globals.main_data_dir = fdir/"candle_data_dir" + +# Dir names corresponding to the primary input/output blocks in the pipeline +# {}: input/output +# []: process +# train path: {raw_data} --> [preprocess] --> {ml_data} --> [train] --> {models} +# inference path: {ml_data, models} --> [inference] --> {infer} +improve_globals.raw_data_dir_name = "raw_data" # benchmark data +improve_globals.ml_data_dir_name = "ml_data" # preprocessed data for a specific ML model +improve_globals.models_dir_name = "models" # output from model training +improve_globals.infer_dir_name = "infer" # output from model inference (testing) + +# Secondary dirs in raw_data +improve_globals.x_data_dir_name = "x_data" # feature data +improve_globals.y_data_dir_name = "y_data" # target data +improve_globals.splits_dir_name = "splits" # splits files + +# Column names in the raw data files +# imp_globals.canc_col_name = "CancID" +# imp_globals.drug_col_name = "DrugID" +improve_globals.canc_col_name = "improve_sample_id" # column name that contains the cancer sample ids TODO: rename to sample_col_name +improve_globals.drug_col_name = "improve_chem_id" # column name that contains the drug ids +improve_globals.source_col_name = "source" # column name that contains source/study names (CCLE, GDSCv1, etc.) 
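# Illustrative path layout (a sketch of how the derived globals below resolve,
# assuming CANDLE_DATA_DIR is set):
#   $CANDLE_DATA_DIR/csa_data/raw_data/y_data/response.tsv
#   $CANDLE_DATA_DIR/csa_data/raw_data/x_data/cancer_gene_expression.tsv
#   $CANDLE_DATA_DIR/csa_data/raw_data/splits/<source>_split_<k>_<train|val|test>.txt
# Every loader in this module reads from this fixed layout.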
+improve_globals.pred_col_name_suffix = "_pred" # suffix to predictions col name (example of final col name: auc_pred) + +# Response data file name +improve_globals.y_file_name = "response.tsv" # response data + +# Cancer sample features file names +improve_globals.copy_number_fname = "cancer_copy_number.tsv" # cancer feature +improve_globals.discretized_copy_number_fname = "cancer_discretized_copy_number.tsv" # cancer feature +improve_globals.dna_methylation_fname = "cancer_DNA_methylation.tsv" # cancer feature +improve_globals.gene_expression_fname = "cancer_gene_expression.tsv" # cancer feature +improve_globals.miRNA_expression_fname = "cancer_miRNA_expression.tsv" # cancer feature +improve_globals.mutation_count_fname = "cancer_mutation_count.tsv" # cancer feature +improve_globals.mutation_fname = "cancer_mutation.tsv" # cancer feature +improve_globals.rppa_fname = "cancer_RPPA.tsv" # cancer feature + +# Drug features file names +improve_globals.smiles_file_name = "drug_SMILES.tsv" # drug feature +improve_globals.mordred_file_name = "drug_mordred.tsv" # drug feature +improve_globals.ecfp4_512bit_file_name = "drug_ecfp4_512bit.tsv" # drug feature + +# Globals derived from the ones defined above +improve_globals.raw_data_dir = improve_globals.main_data_dir/improve_globals.raw_data_dir_name # raw_data +improve_globals.ml_data_dir = improve_globals.main_data_dir/improve_globals.ml_data_dir_name # ml_data +improve_globals.models_dir = improve_globals.main_data_dir/improve_globals.models_dir_name # models +improve_globals.infer_dir = improve_globals.main_data_dir/improve_globals.infer_dir_name # infer +# ----- +improve_globals.x_data_dir = improve_globals.raw_data_dir/improve_globals.x_data_dir_name # x_data +improve_globals.y_data_dir = improve_globals.raw_data_dir/improve_globals.y_data_dir_name # y_data +improve_globals.splits_dir = improve_globals.raw_data_dir/improve_globals.splits_dir_name # splits + +# Response +improve_globals.y_file_path = improve_globals.y_data_dir/improve_globals.y_file_name # response.txt + +# Cancers +improve_globals.copy_number_file_path = improve_globals.x_data_dir/improve_globals.copy_number_fname # cancer_copy_number.txt +improve_globals.discretized_copy_number_file_path = improve_globals.x_data_dir/improve_globals.discretized_copy_number_fname # cancer_discretized_copy_number.txt +improve_globals.dna_methylation_file_path = improve_globals.x_data_dir/improve_globals.dna_methylation_fname # cancer_DNA_methylation.txt +improve_globals.gene_expression_file_path = improve_globals.x_data_dir/improve_globals.gene_expression_fname # cancer_gene_expression.txt +improve_globals.mirna_expression_file_path = improve_globals.x_data_dir/improve_globals.miRNA_expression_fname # cancer_miRNA_expression.txt +improve_globals.mutation_count_file_path = improve_globals.x_data_dir/improve_globals.mutation_count_fname # cancer_mutation_count.txt +improve_globals.mutation_file_path = improve_globals.x_data_dir/improve_globals.mutation_fname # cancer_mutation.txt +improve_globals.rppa_file_path = improve_globals.x_data_dir/improve_globals.rppa_fname # cancer_RPPA.txt + +# Drugs +improve_globals.smiles_file_path = improve_globals.x_data_dir/improve_globals.smiles_file_name # +improve_globals.mordred_file_path = improve_globals.x_data_dir/improve_globals.mordred_file_name # +improve_globals.ecfp4_512bit_file_path = improve_globals.x_data_dir/improve_globals.ecfp4_512bit_file_name # +# ----------------------------------------------------------------------------- + + +# 
------------------------------------- +# Drug response loaders +# ------------------------------------- + +def load_single_drug_response_data( + # source: Union[str, List[str]], + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: at this point, this func implements the loading a single source + df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # import pdb; pdb.set_trace() + if isinstance(split, int): + # Get a subset of samples + ids = load_split_file(source, split, split_type) + df = df.loc[ids] + else: + # Get the full dataset for a given source + df = df[df[improve_globals.source_col_name].isin([source])] + + cols = [improve_globals.source_col_name, + improve_globals.drug_col_name, + improve_globals.canc_col_name, + y_col_name] + df = df[cols] # [source, drug id, cancer id, response] + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(df[[improve_globals.canc_col_name, improve_globals.drug_col_name]].nunique()) + return df + + +def load_single_drug_response_data_v2( + # source: Union[str, List[str]], + source: str, + # split: Union[int, None]=None, + # split_type: Union[str, List[str], None]=None, + split_file_name: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: currently, this func implements loading a single data source (CCLE or CTRPv2 or ...) 
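    # Usage sketch for the v1 loader defined above (illustrative; assumes
    # response.tsv and the CCLE split files exist under the y_data/splits layout):
    #
    #   rsp = load_single_drug_response_data(source="CCLE", split=0,
    #                                        split_type="train", y_col_name="auc")
    #   # -> columns [source, improve_chem_id, improve_sample_id, auc],
    #   #    restricted to the row ids listed in CCLE_split_0_train.txt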
+ df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # Get a subset of samples + if isinstance(split_file_name, list) and len(split_file_name) == 0: + raise ValueError("Empty list is passed via split_file_name.") + if isinstance(split_file_name, str): + split_file_name = [split_file_name] + ids = load_split_ids(split_file_name) + df = df.loc[ids] + # else: + # # Get the full dataset for a given source + # df = df[df[improve_globals.source_col_name].isin([source])] + + # # Get a subset of cols + # cols = [improve_globals.source_col_name, + # improve_globals.drug_col_name, + # improve_globals.canc_col_name, + # y_col_name] + # df = df[cols] # [source, drug id, cancer id, response] + + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(f"Unique cells: {df[improve_globals.canc_col_name].nunique()}") + print(f"Unique drugs: {df[improve_globals.drug_col_name].nunique()}") + return df + + +def load_split_ids(split_file_name: Union[str, List[str]]) -> List[int]: + """ Returns list of integers, representing the rows in the response dataset. + Args: + split_file_name (str or list of str): splits file name or list of file names + + Returns: + list: list of integers representing the ids + """ + ids = [] + for fname in split_file_name: + fpath = improve_globals.splits_dir/fname + assert fpath.exists(), f"split_file_name {fname} not found." + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +def load_split_file( + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None) -> List[int]: + """ + Args: + source (str): DRP source name (str) + + Returns: + ids (list): list of id integers + """ + # TODO: used in the old version of the rsp loader + if isinstance(split_type, str): + split_type = [split_type] + + # Check if the split file exists and load + ids = [] + for st in split_type: + fpath = improve_globals.splits_dir/f"{source}_split_{split}_{st}.txt" + assert fpath.exists(), f"Splits file not found: {fpath}" + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +# ------------------------------------- +# Omic feature loaders +# ------------------------------------- + +""" +Notes about omics data. + +Omics data files are multi-level tables with several column types (generally 3 +or 4), each contains gene names using a different gene identifier system: +Entrez ID, Gene Symbol, Ensembl ID, TSS + +The column levels are not organized in the same order across the different +omic files. + +The level_map dict, in each loader function, encodes the column level and the +corresponding identifier systems. + +For example, in the copy number file the level_map is: +level_map = {"Entrez":0, "Gene_Symbol": 1, "Ensembl": 2} +""" + +def set_col_names_in_multilevel_dataframe( + df: pd.DataFrame, + level_map: dict, + gene_system_identifier: Union[str, List[str]]="Gene_Symbol") -> pd.DataFrame: + """ Util function that supports loading of the omic data files. + Returns the input dataframe with the multi-level column names renamed as + specified by the gene_system_identifier arg. 
+ + Args: + df (pd.DataFrame): omics dataframe + level_map (dict): encodes the column level and the corresponding identifier systems + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: the input dataframe with the specified multi-level column names + """ + df = df.copy() + + level_names = list(level_map.keys()) + level_values = list(level_map.values()) + n_levels = len(level_names) + + if isinstance(gene_system_identifier, list) and len(gene_system_identifier) == 1: + gene_system_identifier = gene_system_identifier[0] + + # print(gene_system_identifier) + # import pdb; pdb.set_trace() + if isinstance(gene_system_identifier, str): + if gene_system_identifier == "all": + df.columns = df.columns.rename(level_names, level=level_values) # assign multi-level col names + else: + df.columns = df.columns.get_level_values(level_map[gene_system_identifier]) # retian specific column level + else: + assert len(gene_system_identifier) <= n_levels, f"'gene_system_identifier' can't contain more than {n_levels} items." + set_diff = list(set(gene_system_identifier).difference(set(level_names))) + assert len(set_diff) == 0, f"Passed unknown gene identifiers: {set_diff}" + kk = {i: level_map[i] for i in level_map if i in gene_system_identifier} + # print(list(kk.keys())) + # print(list(kk.values())) + df.columns = df.columns.rename(list(kk.keys()), level=kk.values()) # assign multi-level col names + drop_levels = list(set(level_map.values()).difference(set(kk.values()))) + df = df.droplevel(level=drop_levels, axis=1) + return df + + +def load_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns copy number data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.copy_number_file_path, sep=sep, index_col=0, header=header) + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + # Test the func + # d0 = set_col_names_in_multilevel_dataframe(df, "all") + # d1 = set_col_names_in_multilevel_dataframe(df, "Ensembl") + # d2 = set_col_names_in_multilevel_dataframe(df, ["Ensembl"]) + # d3 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Gene_Symbol", "Ensembl"]) + # d4 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Ensembl"]) + # d5 = set_col_names_in_multilevel_dataframe(df, ["Blah", "Ensembl"]) + if verbose: + print(f"Copy number data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_discretized_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns discretized copy number data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.discretized_copy_number_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Discretized copy number data: {df.shape}") + + return df + + +def load_dna_methylation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns methylation data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + level_map = {"Ensembl": 2, "Entrez": 1, "Gene_Symbol": 3, "TSS": 0} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.dna_methylation_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"DNA methylation data: {df.shape}") + # print(df.dtypes) # TODO: many column are of type 'object' + # print(df.dtypes.value_counts()) + return df + + +def load_gene_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns gene expression data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 0, "Entrez": 1, "Gene_Symbol": 2} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.gene_expression_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Gene expression data: {df.shape}") + return df + + +def load_mirna_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_mutation_count_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns mutation count data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.mutation_count_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Mutation count data: {df.shape}") + + return df + + +def load_mutation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_rppa_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + + + +# ------------------------------------- +# Drug feature loaders +# ------------------------------------- + +def load_smiles_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + df = pd.read_csv(improve_globals.smiles_file_path, sep=sep) + + # TODO: updated this after we update the data + df.columns = ["improve_chem_id", "smiles"] + + if verbose: + print(f"SMILES data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_mordred_descriptor_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Mordred descriptors data. + """ + df = pd.read_csv(improve_globals.mordred_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + if verbose: + print(f"Mordred descriptors data: {df.shape}") + return df + + +def load_morgan_fingerprint_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Morgan fingerprints data. + """ + df = pd.read_csv(improve_globals.ecfp4_512bit_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + return df + + +# ------------------------------------- +# Save data functions +# ------------------------------------- + +def save_preds(df: pd.DataFrame, y_col_name: str, + outpath: Union[str, PosixPath], round_decimals: int=4) -> None: + """ Save model predictions. + This function throws errors if the dataframe does not include the expected + columns: canc_col_name, drug_col_name, y_col_name, y_col_name + "_pred" + + Args: + df (pd.DataFrame): df with model predictions + y_col_name (str): drug response col name (e.g., IC50, AUC) + outpath (str or PosixPath): outdir to save the model predictions df + round (int): round response values + + Returns: + None + """ + # Check that the 4 columns exist + assert improve_globals.canc_col_name in df.columns, f"{improve_globals.canc_col_name} was not found in columns." + assert improve_globals.drug_col_name in df.columns, f"{improve_globals.drug_col_name} was not found in columns." + assert y_col_name in df.columns, f"{y_col_name} was not found in columns." 
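    # Illustrative: for y_col_name="auc" the dataframe must carry both the measured
    # column "auc" and the model output column "auc_pred"
    # (y_col_name + pred_col_name_suffix); the assert below enforces that naming.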
+ pred_col_name = y_col_name + f"{improve_globals.pred_col_name_suffix}" + assert pred_col_name in df.columns, f"{pred_col_name} was not found in columns." + + # Round + df = df.round({y_col_name: round_decimals, pred_col_name: round_decimals}) + + # Save preds df + df.to_csv(outpath, index=False) + return None + + + + + + +# ================================================================== +# Leftovers +# ================================================================== +def get_data_splits( + src_raw_data_dir: str, + splitdir_name: str, + split_file_name: str, + rsp_df: pd.DataFrame): + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + splitdir = src_raw_data_dir/splitdir_name + if len(split_file_name) == 1 and split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + + """ + # Method 1 + splitdir = Path(os.path.join(src_raw_data_dir))/"splits" + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + outdir_name = "full" + else: + # Check if the split file exists and load + ids = [] + split_id_str = [] # e.g. split_5 + split_type_str = [] # e.g. tr, vl, te + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + # Get the ids + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + # Get the name + fname_sep = fname.split("_") + split_id_str.append("_".join([s for s in fname_sep[:2]])) + split_type_str.append(fname_sep[2]) + assert len(set(split_id_str)) == 1, "Data splits must be from the same dataset source." + split_id_str = list(set(split_id_str))[0] + split_type_str = "_".join([x for x in split_type_str]) + outdir_name = f"{split_id_str}_{split_type_str}" + ML_DATADIR = main_data_dir/"ml_data" + root = ML_DATADIR/f"data.{args.source_data_name}"/outdir_name # ML data + os.makedirs(root, exist_ok=True) + """ + + """ + # Method 2 + splitdir = src_raw_data_dir/args.splitdir_name + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." 
+ with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + """ + return ids + + +def get_common_samples( + df1: pd.DataFrame, + df2: pd.DataFrame, + ref_col: str) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Args: + df1, df2 (pd.DataFrame): dataframes + ref_col (str): the ref column to find the common values + + Returns: + df1, df2 + + Example: + TODO + """ + # Retain (canc, drug) response samples for which we have omic data + common_ids = list(set(df1[ref_col]).intersection(df2[ref_col])) + # print(df1.shape) + df1 = df1[ df1[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df1.shape) + # print(df2.shape) + df2 = df2[ df2[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df2.shape) + return df1, df2 + + +def read_df(fpath: str, sep: str=","): + """ + IMPROVE-specific func. + Load a dataframe. Supports csv and parquet files. + sep : the sepator in the csv file + """ + # TODO: this func might be available in candle + assert Path(fpath).exists(), f"File {fpath} was not found." + if "parquet" in str(fpath): + df = pd.read_parquet(fpath) + else: + df = pd.read_csv(fpath, sep=sep) + return df + + +def get_subset_df(df: pd.DataFrame, ids: list) -> pd.DataFrame: + """ Get a subset of the input dataframe based on row ids.""" + df = df.loc[ids] + return df + + +def rmse(y, f): + rmse = sqrt(((y - f)**2).mean(axis=0)) + return rmse + + +def mse(y, f): + mse = ((y - f)**2).mean(axis=0) + return mse + + +def pearson(y, f): + rp = np.corrcoef(y, f)[0, 1] + return rp + + +def spearman(y, f): + rs = stats.spearmanr(y, f)[0] + return rs + + +def r_square(y_true, y_pred): + from sklearn.metrics import r2_score + return r2_score(y_true, y_pred) diff --git a/old_legacy_scripts/infer.py b/old_legacy_scripts/infer.py new file mode 100755 index 0000000..639c121 --- /dev/null +++ b/old_legacy_scripts/infer.py @@ -0,0 +1,85 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from preprocess_new import mkdir, preprocess +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import sklearn.metrics as skmts +#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() + "/PathDSP") +import FNN_new + + +file_path = os.path.dirname(os.path.realpath(__file__)) +required = None +additional_definitions = None + + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
+ ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def run(params): + test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) + trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) + trained_net.eval() + FNN_new.myutil.set_seed(params["seed_int"]) + device = FNN_new.myutil.get_device(uth=params["gpu_int"]) + test_dataset = FNN_new.mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False) + start = datetime.now() + prediction_list = FNN_new.predict(trained_net, test_dl, device) + print('Inference time :[Finished in {:}]'.format(FNN_new.cal_time(datetime.now(), start))) + # evaluation metrics + mse = skmts.mean_squared_error(ytest_arr, prediction_list) + rmse = np.sqrt(mse) + r2_pred = FNN_new.r2_score(ytest_arr, prediction_list) + loss_pred = pd.DataFrame({'metric': ['rmse', 'r2'], + 'value': [rmse, r2_pred]}) + loss_pred.to_csv(params['data_dir'] + '/Loss_pred.txt', header=True, index=False, sep="\t") + ytest_df = test_df.iloc[:, -1].to_frame() + ytest_df['prediction'] = prediction_list + ytest_df.to_csv(params['data_dir'] + '/Prediction.txt', header=True, index=True, sep="\t") + + +def candle_main(): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R b/old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R new file mode 100644 index 0000000..28976f0 --- /dev/null +++ b/old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R @@ -0,0 +1,72 @@ +rm(list = ls()) + +setwd("c:/Users/m092469/OneDrive - Mayo Clinic/temp_code/DOE_IMPROVE") + +options(stringsAsFactors = FALSE) + +rDrug <- read.delim( + "data/GDSCv2.Gao2015.Powell2020.Lee2021.GeoSearch.Ding2016.CHEM.256.MBits.txt") + +rGSEA <- read.delim( + "data/GDSCv2.Powell2020.EXP.ssGSEA.txt") + +rPNET <- read.delim( + "data/GDSCv2.Gao2015.Powell2020.Lee2021.GeoSearch.Ding2016.DGNet.NetPEA.txt") + +rReponse0 <- read.delim("data/GDSCv2.resp_PowellAUC.Alias.txt") + +Drug.ID.vec0 <- intersect(rDrug$drug, rReponse0$Therapy) +Cell.ID.vec <- intersect(rGSEA$X, rReponse0$Sample) + +Drug.ID.vec <- intersect(Drug.ID.vec0,rPNET$X) + +sel.Reponse.idx <- which(is.element(rReponse0$Therapy, Drug.ID.vec) & + is.element(rReponse0$Sample, Cell.ID.vec)) + +rReponse <- rReponse0[sel.Reponse.idx, ] + +N.cell <- length(Cell.ID.vec) +N.drug <- length(Drug.ID.vec) +N.comb <- nrow(rReponse) + +head(rDrug[,1:5]) +Drug.fmtx <- data.matrix(rDrug[match(Drug.ID.vec, rDrug$drug), 2: ncol(rDrug)]) +rownames(Drug.fmtx) <- Drug.ID.vec +head(Drug.fmtx[,1:5]) + +Drug.PNEA.fmtx <- data.matrix(rPNET[match(Drug.ID.vec, rPNET$X), 2: ncol(rPNET)]) +rownames(Drug.PNEA.fmtx) <- Drug.ID.vec +head(Drug.PNEA.fmtx[,1:5]) + 
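# The check below verifies that the fingerprint matrix (Drug.fmtx) and the
# drug-pathway matrix (Drug.PNEA.fmtx) are row-aligned by drug ID, since both
# were indexed with Drug.ID.vec; the loop further down concatenates their rows
# per (drug, cell) pair together with the cell ssGSEA features.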
+all(rownames(Drug.fmtx)==rownames(Drug.PNEA.fmtx)) + +Cell.GSEA.mtx <- data.matrix(rGSEA[match(Cell.ID.vec, rGSEA$X), 2: ncol(rGSEA)]) +rownames(Cell.GSEA.mtx) <- Cell.ID.vec +head(Cell.GSEA.mtx[,1:5]) + +N.col <- ncol(Drug.fmtx) + ncol(Drug.PNEA.fmtx) + + ncol(Cell.GSEA.mtx) + 3 # drug, cell & resp +comb.data.mtx <- mat.or.vec(N.comb, N.col) +colnames(comb.data.mtx) <- c("drug", "cell", + paste0("feature",1:(N.col-3)),"resp") + + +for(i in 1:N.comb){ + # i <- 1 + tmp.cell.ID <- rReponse$Sample[i] + tmp.drug.ID <- rReponse$Therapy[i] + comb.data.mtx[i, "drug"] <- tmp.drug.ID + comb.data.mtx[i, "cell"] <- tmp.cell.ID + comb.data.mtx[i, paste0("feature",1:(N.col-3))] <- + c(Drug.fmtx[tmp.drug.ID,], + Drug.PNEA.fmtx[tmp.drug.ID,], + Cell.GSEA.mtx[tmp.cell.ID,]) + response.idx <-which( + rReponse$Therapy==tmp.drug.ID & rReponse$Sample==tmp.cell.ID) + tmp.resp <- rReponse$Response[response.idx] + comb.data.mtx[i, "resp"] <- tmp.resp + +} + +write.table(x = comb.data.mtx, file = "input_txt_Nick.txt", + quote = FALSE, sep = "\t", row.names = FALSE) \ No newline at end of file diff --git a/old_legacy_scripts/preprocess.py b/old_legacy_scripts/preprocess.py new file mode 100644 index 0000000..378898f --- /dev/null +++ b/old_legacy_scripts/preprocess.py @@ -0,0 +1,36 @@ +def preprocess(params): + fname='input_for_Nick.txt' + origin=params['data_url'] + # Download and unpack the data in CANDLE_DATA_DIR + candle.file_utils.get_file(fname, origin) + params['train_data'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['train_data'] + #params['val_data'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['val_data'] + #params['gep_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['gep_filepath'] + #params['smi_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['smi_filepath'] + #params['gene_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['gene_filepath'] + #params['smiles_language_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['smiles_language_filepath'] + """ + params["train_data"] = candle.get_file(params['train_data'], origin, datadir=params['data_dir'], cache_subdir=None) + params["val_data"] = candle.get_file(params['val_data'], origin, datadir=params['data_dir'], cache_subdir=None) + params["gep_filepath"] = candle.get_file(params['gep_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["smi_filepath"] = candle.get_file(params['smi_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["gene_filepath"] = candle.get_file(params['gene_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["smiles_language_filepath"] = candle.get_file(params['smiles_language_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) """ + return params + +def run(params): + params['data_type'] = str(params['data_type']) + with open ((params['output_dir']+'/params.json'), 'w') as outfile: + json.dump(params, outfile) + scores = main(params) + with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: + json.dump(scores, f, ensure_ascii=False, indent=4) + print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) + +def candle_main(): + params = initialize_parameters() + params = preprocess(params) + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/old_legacy_scripts/preprocess_new.py b/old_legacy_scripts/preprocess_new.py new file mode 100644 index 0000000..cec9e6f --- /dev/null +++ 
b/old_legacy_scripts/preprocess_new.py @@ -0,0 +1,403 @@ +import sys +import os +import numpy as np +import polars as pl +#import torch +#import torch.utils.data as du +#from torch.autograd import Variable +#import torch.nn as nn +#import torch.nn.functional as F +#from code.drugcell_NN import * +import argparse +import numpy as np +import pandas as pd +import candle +#import time +#import logging +#import networkx as nx +#import networkx.algorithms.components.connected as nxacc +#import networkx.algorithms.dag as nxadag +#from pathlib import Path +from functools import reduce +import improve_utils +# import RDKit +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +# import NetPEA modules +import RWR as rwr +import NetPEA as pea +#import gsea module +import gseapy as gp +import sklearn.model_selection as skms + + +file_path = os.path.dirname(os.path.realpath(__file__)) +#fdir = Path('__file__').resolve().parent +#source = 'csa_data/raw_data/splits/' +required = None +additional_definitions = None + +# This should be set outside as a user environment variable +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. + ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def mkdir(directory): + directories = directory.split('/') + + folder = '' + for d in directories: + folder += d + '/' + if not os.path.exists(folder): + print('creating folder: %s'%folder) + os.mkdir(folder) + + +def preprocess(params, data_dir): + print(os.environ['CANDLE_DATA_DIR']) + #requirements go here + #keys_parsing = ['output_dir', 'hidden', 'result', 'metric', 'data_type'] + if not os.path.exists(data_dir): + mkdir(data_dir) + params['data_dir'] = data_dir + #args = candle.ArgumentStruct(**params) + for i in ['train_data', 'test_data', 'val_data', 'drug_bits_file', 'dgnet_file', + 'mutnet_file', 'cnvnet_file', 'exp_file']: + params[i] = params['data_dir'] + '/' + params[i] + return(params) + +def download_anl_data(params): + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') + splits_dir = os.path.join(csa_data_folder, 'splits') + x_data_dir = os.path.join(csa_data_folder, 'x_data') + y_data_dir = os.path.join(csa_data_folder, 'y_data') + + if not os.path.exists(csa_data_folder): + print('creating folder: %s'%csa_data_folder) + os.makedirs(csa_data_folder) + mkdir(splits_dir) + mkdir(x_data_dir) + mkdir(y_data_dir) + + for improve_file in ['CCLE_all.txt', + 'CCLE_split_' + str(params['split']) + '_test.txt', + 'CCLE_split_' + str(params['split']) + '_train.txt', + 'CCLE_split_' + str(params['split']) + '_val.txt', + 'CTRPv2_all.txt', + 'CTRPv2_split_' + str(params['split']) + '_test.txt', + 'CTRPv2_split_' + str(params['split']) + '_train.txt', + 'CTRPv2_split_' + str(params['split']) + '_val.txt', + 'gCSI_all.txt', + 
'GDSCv1_all.txt', + 'GDSCv2_all.txt' + ]: + url_dir = params['improve_data_url'] + '/splits/' + candle.file_utils.get_file(improve_file, url_dir + improve_file, + datadir=splits_dir, + cache_subdir=None) + + for improve_file in ['cancer_mutation_count.tsv', 'drug_SMILES.tsv', 'drug_info.tsv', 'cancer_discretized_copy_number.tsv', 'cancer_gene_expression.tsv']: + url_dir = params['improve_data_url'] + '/x_data/' + candle.file_utils.get_file(fname=improve_file, origin=url_dir + improve_file, + datadir=x_data_dir, + cache_subdir=None) + + url_dir = params['improve_data_url'] + '/y_data/' + response_file = 'response.tsv' + candle.file_utils.get_file(fname=response_file, origin=url_dir + response_file, + datadir=y_data_dir, + cache_subdir=None) + + ## get gene-set data and string data + for db_file in [params['gene_set'], params['ppi_data'], params['drug_target']]: + candle.file_utils.get_file(db_file, params['data_url'] + '/' +db_file, + datadir=params['data_dir'], + cache_subdir=None) + + + + +# set timer +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def download_author_data(params): + data_download_filepath = candle.file_utils.get_file(params['original_data'], params['original_data_url'] + '/' + params['original_data'], + datadir = params['data_dir'], + cache_subdir = None) + print('download_path: {}'.format(data_download_filepath)) + random_seed = 42 + df = pd.read_csv(params['data_dir'] + "/input.txt", sep='\t') # Modify the separator if needed + df = df.set_index(['drug', 'cell']) + train_data, temp_data = skms.train_test_split(df, test_size=0.2, random_state=random_seed) + val_data, test_data = skms.train_test_split(temp_data, test_size=0.5, random_state=random_seed) + pl.from_pandas(train_data).write_csv(params['train_data'], separator = '\t', has_header = True) + pl.from_pandas(val_data).write_csv(params['val_data'], separator = '\t', has_header = True) + pl.from_pandas(test_data).write_csv(params['test_data'], separator = '\t', has_header = True) + + +def smile2bits(params): + start = datetime.now() + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + smile_df = improve_utils.load_smiles_data() + smile_df.columns = ['drug', 'smile'] + smile_df = smile_df.drop_duplicates(subset=['drug'], keep='first').set_index('drug') + smile_df = smile_df.loc[smile_df.index.isin(rs_all['improve_chem_id']),] + bit_int = params['bit_int'] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row['smile'] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list( AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int) ) + #drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append( tuple([drug]+mbit) ) + if len(mbit) == bit_int: + n_drug+=1 + print('total {:} drugs with bits'.format(n_drug)) + # convert dict to dataframe + colname_list = ['drug'] + ['mBit_'+str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + #drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + #drug_mbit_df.index.name = 'drug' + print('unique 
drugs={:}'.format(len(drug_mbit_df['drug'].unique()))) + # save to file + drug_mbit_df.to_csv(params['drug_bits_file'], header=True, index=False, sep='\t') + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) + +def times_expression(rwr, exp): + ''' + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + ''' + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns)&set(exp.columns))) + + if len(cell_list) == 0: + print('ERROR! no overlapping cell lines') + sys.exit(1) + if len(gene_list) == 0: + print('ERROR! no overlapping genes') + sys.exit(1) + + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list]*exp.loc[cell_list, gene_list] + + # concat with other gene + out_gene_list = list(set(rwr.columns)-set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params['data_dir'] + '/STRING/9606.protein_name.links.v11.0.pkl' + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + log_transform = False + permutation_int = params['permutation_int'] + seed_int = params['seed_int'] + cpu_int = params['cpu_int'] + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + if dtype == 'DGnet': + drug_info = pd.read_csv(csa_data_folder + '/x_data/drug_info.tsv', sep='\t') + drug_info['NAME'] = drug_info['NAME'].str.upper() + target_info = pd.read_csv(params['data_dir'] + '/data/DB.Drug.Target.txt', sep = '\t') + target_info = target_info.rename(columns={'drug': 'NAME'}) + combined_df = pd.merge(drug_info, target_info, how = 'left', on = 'NAME').dropna(subset=['gene']) + combined_df = combined_df.loc[combined_df['improve_chem_id'].isin(rs_all['improve_chem_id']),] + restart_path = params['data_dir'] + '/drug_target.txt' + combined_df.iloc[:,-2:].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['dgnet_file'] + elif dtype == 'MUTnet': + mutation_data = improve_utils.load_mutation_count_data(gene_system_identifier='Gene_Symbol') + mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars='improve_sample_id').loc[lambda x: x['value'] > 0] + mutation_data = mutation_data.loc[mutation_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] + restart_path = params['data_dir'] + '/mutation_data.txt' + mutation_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['mutnet_file'] + else: + cnv_data = improve_utils.load_discretized_copy_number_data(gene_system_identifier='Gene_Symbol') + cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars='improve_sample_id').loc[lambda x: x['value'] != 0] + cnv_data = cnv_data.loc[cnv_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] + restart_path = params['data_dir'] + '/cnv_data.txt' + cnv_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + 
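        # The (id, gene) pairs written to restart_path above seed the random walk
        # with restart below: drug targets for DGnet, mutated genes for MUTnet,
        # and genes with a non-zero discretized copy number for CNVnet.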
outpath = params['cnvnet_file'] + # perform Random Walk + print(datetime.now(), 'performing random walk with restart') + rwr_df = rwr.RWR(ppi_path, restart_path, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True).get_prob() + # multiply with gene expression + if multiply_expression: + print(datetime.now(), 'multiplying gene expression with random walk probability for genes were expressed') + exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + rwr_df = times_expression(rwr_df, exp_df) + #rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), 'performing network-based pathway enrichment') + cell_pathway_df = pea.NetPEA(rwr_df, pathway_path, log_transform=log_transform, permutation=permutation_int, seed=seed_int, n_cpu=cpu_int, out_path=outpath) + print( '[Finished in {:}]'.format(cal_time(datetime.now(), start_time)) ) + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params['drug_bits_file'], sep = '\t', index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={'drug': 'drug_id'}) + DGnet = pd.read_csv(params['dgnet_file'], sep='\t', index_col=0) + DGnet = DGnet.add_suffix('_dgnet').reset_index().rename(columns={'index': 'drug_id'}) + CNVnet = pd.read_csv(params['cnvnet_file'], sep= '\t',index_col=0) + CNVnet = CNVnet.add_suffix('_cnvnet').reset_index().rename(columns={'index': 'sample_id'}) + MUTnet = pd.read_csv(params['mutnet_file'], sep='\t',index_col=0) + MUTnet = MUTnet.add_suffix('_mutnet').reset_index().rename(columns={'index': 'sample_id'}) + EXP = pd.read_csv(params['exp_file'], sep = '\t', index_col=0) + EXP = EXP.add_suffix('_exp').reset_index().rename(columns={'index': 'sample_id'}) + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + split_type=['train', 'test', 'val'], + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + # Extract relevant IDs + + common_drug_ids = reduce(np.intersect1d, (drug_mbit_df['drug_id'], DGnet['drug_id'], response_df['drug_id'])) + common_sample_ids = reduce(np.intersect1d, (CNVnet['sample_id'], MUTnet['sample_id'], EXP['sample_id'] , response_df['sample_id'])) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + (response_df['sample_id'].isin(common_sample_ids)), :] + drug_mbit_df = drug_mbit_df.loc[drug_mbit_df['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + DGnet = DGnet.loc[DGnet['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + CNVnet = CNVnet.loc[CNVnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + MUTnet = MUTnet.loc[MUTnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + EXP = EXP.loc[EXP['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + for i in ['train', 'test', 'val']: + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + split_type=i, + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + 
(response_df['sample_id'].isin(common_sample_ids)), :] + comb_data_mtx = pd.DataFrame({'drug_id': response_df['drug_id'].values, + 'sample_id': response_df['sample_id'].values}) + comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id') + comb_data_mtx['response'] = response_df[params['metric']].values + comb_data_mtx = comb_data_mtx.dropna() + pl.from_pandas(comb_data_mtx).write_csv(params[i + '_data'], separator = '\t', has_header = True) + + +def run_ssgsea(params): + expMat = improve_utils.load_gene_expression_data(sep='\t') + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + expMat = expMat.loc[expMat.index.isin(rs_all['improve_sample_id']),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + gmt = pathway_path + tmp_str = params['data_dir'] + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea(data=gct, #gct: a matrix of gene by sample + gene_sets=gmt, #gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, #1000 + no_plot=True, + processes=params['cpu_int'], + #min_size=0, + format='png') + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str+'ssGSEA.txt', header=True, index=True, sep="\t") + + f = open(tmp_str+'ssGSEA.txt', 'r') + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params['exp_file'], header=True, index=True, sep="\t") + + +def candle_main(anl): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + if params['improve_analysis'] == 'yes' or anl == 1: + download_anl_data(params) + print('convert drug to bits.') + smile2bits(params) + print('compute DGnet.') + run_netpea(params, dtype = 'DGnet', multiply_expression=False) + print('compute MUTnet.') + run_netpea(params, dtype = 'MUTnet', multiply_expression=True) + print('compute CNVnet.') + run_netpea(params, dtype = 'CNVnet', multiply_expression=True) + print('compute EXP.') + run_ssgsea(params) + print('prepare final input file.') + prep_input(params) + else: + download_author_data(params) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-a', dest='anl', type=int, default=0, help='''whether to perform preprocessing using anl data or directly use processed + data from the original paper, default to 0 to use processed data from original paper''') + args = parser.parse_args() + start = datetime.now() + candle_main(args.anl) + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) diff --git a/old_legacy_scripts/train.py b/old_legacy_scripts/train.py new file mode 100644 index 0000000..fb18f9d --- /dev/null +++ b/old_legacy_scripts/train.py @@ -0,0 +1,74 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from preprocess_new import mkdir, preprocess +#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() 
+ "/PathDSP") +import FNN_new + +file_path = os.path.dirname(os.path.realpath(__file__)) +# This should be set outside as a user environment variable +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' +required = None +additional_definitions = None + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. + ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + +# class CustomData: +# def __init__(self, name, value): +# self.name = name +# self.value = value + +# class CustomEncoder(json.JSONEncoder): +# def default(self, o): +# return o.__dict__ + + +# def run(params): +# params['data_type'] = str(params['data_type']) +# json_out = params['output_dir']+'/params.json' +# print(params) + +# with open (json_out, 'w') as fp: +# json.dump(params, fp, indent=4, cls=CustomEncoder) + +# scores = main(params) +# with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: +# json.dump(scores, f, ensure_ascii=False, indent=4) +# # print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) + + +def candle_main(): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + FNN_new.main(params) + + +if __name__ == "__main__": + candle_main() diff --git a/preprocess.sh b/preprocess.sh new file mode 100755 index 0000000..4da8d87 --- /dev/null +++ b/preprocess.sh @@ -0,0 +1,8 @@ +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_STAGE=preprocess +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_${IMPROVE_STAGE}_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/set_affinity_gpu_polaris.sh b/set_affinity_gpu_polaris.sh new file mode 100755 index 0000000..f3d4915 --- /dev/null +++ b/set_affinity_gpu_polaris.sh @@ -0,0 +1,6 @@ +#!/bin/bash +num_gpus=4 +gpu=$((${PMI_LOCAL_RANK} % ${num_gpus})) +export CUDA_VISIBLE_DEVICES=$gpu +echo “RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}” +exec "$@" diff --git a/setup_improve.sh b/setup_improve.sh new file mode 100644 index 0000000..8fad5bc --- /dev/null +++ b/setup_improve.sh @@ -0,0 +1,58 @@ +#!/bin/bash --login +# Navigate to the dir with the cloned model repo +# Run it like this: source ./setup_improve.sh + +# set -e + +# Get current dir and model dir +model_path=$PWD +echo "Model path: $model_path" +model_name=$(echo "$model_path" | awk -F '/' '{print $NF}') +echo "Model name: $model_name" + +# Download data (if needed) +data_dir="csa_data" +if [ ! -d $PWD/$data_dir/ ]; then + echo "Download CSA data" + source download_csa.sh +else + echo "CSA data folder already exists" +fi + +# Download author data (if needed) - PathDSP specific +author_dir="author_data" +if [ ! 
-d $PWD/$author_dir/ ]; then + echo "Download author data" + mkdir author_data + source download_author_data.sh author_data/ +else + echo "Author data folder already exists" +fi + +# Env var IMPROVE_DATA_DIR +export IMPROVE_DATA_DIR="./$data_dir/" + +# Env var AUTHOR_DATA_DIR - PathDSP specific +export AUTHOR_DATA_DIR="./$author_dir/" + +# Clone IMPROVE lib (if needed) and checkout the branch/tag +cd ../ +improve_lib_path=$PWD/IMPROVE +# improve_branch="develop" +improve_branch="v0.1.0" +if [ -d $improve_lib_path ]; then + echo "IMPROVE repo exists in ${improve_lib_path}" +else + git clone https://github.com/JDACS4C-IMPROVE/IMPROVE +fi +cd IMPROVE +git checkout -f $improve_branch +cd ../$model_name + +# Env var PYTHOPATH +export PYTHONPATH=$PYTHONPATH:$improve_lib_path + +echo +echo "IMPROVE_DATA_DIR: $IMPROVE_DATA_DIR" +echo "AUTHOR_DATA_DIR: $AUTHOR_DATA_DIR" +echo "PYTHONPATH: $PYTHONPATH" diff --git a/subprocess_train_singularity.sh b/subprocess_train_singularity.sh new file mode 100755 index 0000000..a700cb7 --- /dev/null +++ b/subprocess_train_singularity.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +#CONDA_ENV=$PathDSP_env +#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +#eval "$(conda shell.bash hook)" +#echo "Activated conda commands in shell script" +#conda activate $CONDA_ENV +#source activate $CONDA_ENV +#conda_path=${dirname `which conda`} +#source $conda_path/activate $CONDA_ENV +#source activate $CONDA_ENV +#echo "Activated conda env $CONDA_ENV" + +train_ml_data_dir=$1 +val_ml_data_dir=$2 +model_outdir=$3 +learning_rate=$4 +batch_size=$5 +dropout=$6 +CUDA_VISIBLE_DEVICES=$7 + +echo "train_ml_data_dir: $train_ml_data_dir" +echo "val_ml_data_dir: $val_ml_data_dir" +echo "model_outdir: $model_outdir" +echo "learning_rate: $learning_rate" +echo "batch_size: $batch_size" +echo "dropout: $dropout" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" + +# epochs=10 +epochs=50 +# epochs=50 + +# All train outputs are saved in params["model_outdir"] +#CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=5 +#CUDA_VISIBLE_DEVICES=6,7 +# python PathDSP_train_improve.py \ +# --train_ml_data_dir $train_ml_data_dir \ +# --val_ml_data_dir $val_ml_data_dir \ +# --model_outdir $model_outdir \ +# --epochs $epochs + +#conda deactivate +#source $conda_path/deactivate +#echo "Deactivated conda env $CONDA_ENV" + +echo "train using singularity container" +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir $PathDSP_sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs --learning_rate $learning_rate --batch_size $batch_size --dropout $dropout + diff --git a/train.sh b/train.sh new file mode 100755 index 0000000..a10d6e2 --- /dev/null +++ b/train.sh 
@@ -0,0 +1,7 @@ +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_train_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/workflow_csa.py b/workflow_csa.py new file mode 100644 index 0000000..7063101 --- /dev/null +++ b/workflow_csa.py @@ -0,0 +1,235 @@ +import json +import logging +import sys +from pathlib import Path +from typing import Sequence, Tuple, Union + +import parsl +from parsl import python_app +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import LocalProvider + +import csa_params_def as CSA +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# Initialize parameters for CSA +additional_definitions = CSA.additional_definitions +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_params.ini", + additional_definitions=additional_definitions +) + +##### CONFIG FOR LAMBDA ###### +#available_accelerators: Union[int, Sequence[str]] = 8 +worker_port_range: Tuple[int, int] = (10000, 20000) +retries: int = 1 + +config_lambda = Config( + retries=retries, + executors=[ + HighThroughputExecutor( + address='127.0.0.1', + label="htex", + cpu_affinity="block", + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? + worker_debug=True, + worker_port_range=worker_port_range, + provider=LocalProvider( + init_blocks=1, + max_blocks=1, + ), + available_accelerators=params['available_accelerators'], + ) + ], + strategy='simple', +) + +parsl.clear() +parsl.load(config_lambda) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +logger = logging.getLogger(f'Start workflow') + +############################################################################## +################################ PARSL APPS ################################## +############################################################################## + +@python_app +def train(params, hp_model, source_data_name, split): + """ parsl implementation of training stage using python_app. 
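+ Expects `params` to provide 'model_dir', 'ml_data_dir', 'target_datasets', 'epochs', 'y_col_name', 'use_singularity', and the singularity image or conda settings used below; `hp_model` maps each source dataset name to its tuned hyperparameters ('learning_rate', 'batch_size').
+ Returns a dict with the source dataset name and split id so the infer stage can reuse them.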
""" + import json + import subprocess + import time + from pathlib import Path + + hp = hp_model[source_data_name] + if hp.__len__() == 0: + raise Exception(str('Hyperparameters are not defined for ' + source_data_name)) + + model_dir = params['model_dir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{params['target_datasets'][0]}"/ f"split_{split}" + + if model_dir.exists() is False: + print("\nTrain") + print(f"ml_data_dir: {ml_data_dir}") + print(f"model_dir: {model_dir}") + start = time.time() + if params['use_singularity']: + train_run = ["singularity", "exec", "--nv", + params['singularity_image'], "train.sh", + str("--input_dir " + str(ml_data_dir)), + str("--output_dir " + str(model_dir)), + str("--epochs " + str(params['epochs'])), + str("--y_col_name " + str(params['y_col_name'])), + str("--learning_rate " + str(hp['learning_rate'])), + str("--batch_size " + str(hp['batch_size'])) + ] + else: + train_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['train_python_script'], + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(params['epochs']), # DL-specific + "--y_col_name", str(params['y_col_name']), + "--learning_rate", str(hp['learning_rate']), + "--batch_size", str(hp['batch_size']) + ] + + result = subprocess.run(train_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = model_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = model_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return {'source_data_name': source_data_name, 'split': split} + +@python_app +def infer(params, source_data_name, target_data_name, split): + """ parsl implementation of inferece stage using python_app. 
""" + import subprocess + import json + import time + from pathlib import Path + + model_dir = params['model_dir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + infer_dir = params['infer_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + + print("\nInfer") + start = time.time() + if params['use_singularity']: + infer_run = ["singularity", "exec", "--nv", + params['singularity_image'], "infer.sh", + str("--input_data_dir " + str(ml_data_dir)), + str("--input_model_dir " + str(model_dir)), + str("--output_dir " + str(infer_dir)), + str("--calc_infer_scores "+ "true"), + str("--y_col_name " + str(params['y_col_name'])) + ] + else: + infer_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['infer_python_script'], + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--calc_infer_scores", "true", + "--y_col_name", str(params['y_col_name']) + ] + + result = subprocess.run(infer_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = infer_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = infer_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return True + +############################### +####### CSA PARAMETERS ######## +############################### + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +y_col_name = params['y_col_name'] +logger = logging.getLogger(f"{params['model_name']}") + +#Output directories for preprocess, train and infer +params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' +params['model_dir'] = Path(params['output_dir']) / 'models' +params['infer_dir'] = Path(params['output_dir']) / 'infer' + +#Model scripts +params['train_python_script'] = f"{params['model_name']}_train_improve.py" +params['infer_python_script'] = f"{params['model_name']}_infer_improve.py" + +#Read Hyperparameters file +with open(params['hyperparameters_file']) as f: + hp = json.load(f) +hp_model = hp[params['model_name']] + +########################################################################## +##################### START PARSL PARALLEL EXECUTION ##################### +########################################################################## + +##Train execution with Parsl +train_futures = [] +for source_data_name in params['source_datasets']: + for split in params['split']: + train_futures.append(train(params, hp_model, source_data_name, split)) + +##Infer execution with Parsl +infer_futures = [] +for future_t in train_futures: + for target_data_name in params['target_datasets']: + infer_futures.append(infer(params, future_t.result()['source_data_name'], target_data_name, future_t.result()['split'])) + +for future_i in infer_futures: + print(future_i.result()) + +parsl.dfk().cleanup() \ No newline at end of file diff --git a/workflow_preprocess.py b/workflow_preprocess.py new file mode 100644 index 0000000..d0e14bd --- /dev/null 
+++ b/workflow_preprocess.py @@ -0,0 +1,211 @@ +import json +import logging +import sys +from pathlib import Path +from typing import Sequence, Tuple, Union + +import parsl +from parsl import python_app +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import LocalProvider + +import csa_params_def as CSA +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# Initialize parameters for CSA +additional_definitions = CSA.additional_definitions +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_params.ini", + additional_definitions=additional_definitions +) + +##### CONFIG FOR LAMBDA ###### +#available_accelerators: Union[int, Sequence[str]] = 8 +worker_port_range: Tuple[int, int] = (10000, 20000) +retries: int = 1 + +config_lambda = Config( + retries=retries, + executors=[ + HighThroughputExecutor( + address='127.0.0.1', + label="htex_preprocess", + cpu_affinity="alternating", + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? + worker_debug=True, + worker_port_range=worker_port_range, + provider=LocalProvider( + init_blocks=1, + max_blocks=1, + ), + ) + ], + strategy='simple', +) + +parsl.clear() +parsl.load(config_lambda) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +logger = logging.getLogger(f'Start workflow') + +############################################################################## +################################ PARSL APPS ################################## +############################################################################## + +@python_app +def preprocess(inputs=[]): + """ parsl implementation of preprocessing stage using python_app. """ + import json + import subprocess + import time + import warnings + from pathlib import Path + + def build_split_fname(source_data_name, split, phase): + """ Build split file name. If file does not exist continue """ + if split=='all': + return f"{source_data_name}_{split}.txt" + return f"{source_data_name}_split_{split}_{phase}.txt" + + # python_app inputs + params = inputs[0] + source_data_name = inputs[1] + split = inputs[2] + + split_nums = params['split'] + + # Get the split file paths + if len(split_nums) == 0: + # Get all splits + split_files = list((params['splits_path']).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + else: + split_files = [] + for s in split_nums: + split_files.extend(list((params['splits_path']).glob(f"{source_data_name}_split_{s}_*.txt"))) + files_joined = [str(s) for s in split_files] + + print(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. Otherwise, continue to the next split. 
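+ # files_joined lists the split files found on disk; a missing phase file only triggers a warning here, and preprocessing for this split still proceeds.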
+ for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing \ + (continue to next split)") + continue + + for target_data_name in params['target_datasets']: + + if params['only_cross_study'] and (source_data_name == target_data_name): + continue # only cross-study + print(f"\nSource data: {source_data_name}") + print(f"Target data: {target_data_name}") + + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + if ml_data_dir.exists() is True: + continue + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire + # target dataset + test_split_file = f"{target_data_name}_all.txt" + + # Preprocess data + print("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + print(f"train_split_file: {train_split_file}") + print(f"val_split_file: {val_split_file}") + print(f"test_split_file: {test_split_file}") + start = time.time() + if params['use_singularity']: + preprocess_run = ["singularity", "exec", "--nv", + params['singularity_image'], "preprocess.sh", + str("--train_split_file " + str(train_split_file)), + str("--val_split_file " + str(val_split_file)), + str("--test_split_file " + str(test_split_file)), + str("--input_dir " + params['input_dir']), + str("--output_dir " + str(ml_data_dir)), + str("--y_col_name " + str(params['y_col_name'])) + ] + else: + preprocess_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['preprocess_python_script'], + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], + "--output_dir", str(ml_data_dir), + "--y_col_name", str(params['y_col_name']) + ] + + result = subprocess.run(preprocess_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = ml_data_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = ml_data_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return {'source_data_name': source_data_name, 'split': split} + + +############################### +####### CSA PARAMETERS ######## +############################### + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +y_col_name = params['y_col_name'] +logger = logging.getLogger(f"{params['model_name']}") + +#Output directories for preprocess, train and infer +params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' + +#Model scripts +params['preprocess_python_script'] = f"{params['model_name']}_preprocess_improve.py" + +########################################################################## +##################### START PARSL PARALLEL 
EXECUTION ##################### +########################################################################## + +##Preprocess execution with Parsl +preprocess_futures = [] +for source_data_name in params['source_datasets']: + for split in params['split']: + preprocess_futures.append( + preprocess(inputs=[params, source_data_name, split]) + ) + +for future_p in preprocess_futures: + print(future_p.result()) + +parsl.dfk().cleanup() \ No newline at end of file