diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5d7b436 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +.ipynb_checkpoints/ +PathDSP/__pycache__/ +__pycache__/ +EDA.ipynib +ml_data/ +dh_hpo_improve/ +dh_hpo_logs/ + +## gpu utilization +PathDSP_gpu_util_model.txt +gpu_log_strip.txt +gpu_logs.txt +out_models/ +train_gpu_util.sh + +## image +PathDSP.sif + +## log files +dh_hpo_scale_test.* diff --git a/NetPEA.py b/NetPEA.py new file mode 100644 index 0000000..d7006ef --- /dev/null +++ b/NetPEA.py @@ -0,0 +1,212 @@ +""" +Implementation of NetPEA: pathway enrichment with networks (Liu, 2017) + +Ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5664096/ +zscore >1.65, equivalent to p-value=0.05 +""" + +import os +import sys +import argparse +import numpy as np +import pandas as pd +import multiprocessing as mp +import scipy.stats as scistat +from datetime import datetime + +class NetPEA: + """ + :param rwrDf: dataframe with cell by PPI genes + :param pathwayGMT: pathway database in gmt format + :param permutation: + :param seed: + :param threshold: + """ + def __init__(self, rwrPath, pathwayGMT, log_transform=False, permutation=1000, seed=42, n_cpu=5, out_path='./'): + # load data + self.rwr_path = rwrPath #pd.read_csv(rwrDf, header=0, index_col=0, sep="\t") + self.pathway_gmt = pathwayGMT + self.permutation = int(permutation) + self.seed = int(seed) + self.out_path = out_path + + # settings + np.random.seed(self.seed) + self.n_cpu = int(n_cpu) + if len(self.rwr_path) < self.n_cpu: + self.n_cpu = len(self.rwr_path) + + # prepare pathway genes to save time + print('{:}: collect pathway genes'.format(datetime.now())) + pathway_geneList_dict = self._get_pathway_genes(pathwayGMT) # {pathway: geneList} + # obtain shared genes for calculating score of pathway genes + self.rwrDf = self.rwr_path#pd.read_csv(rwrPath, header=0, index_col=0, sep="\t") + if log_transform == True: + print('log transform input data') + self.rwrDf = np.log(self.rwrDf) + pathway_shareGeneList_dict = self._find_overlaps(self.rwrDf, pathway_geneList_dict) # {pathway: shareGeneList} + # generate random gene list for calculating score of random pathway genes + pathway_randomGeneListList_dict = {} + bg_gene_list = self.rwrDf.columns.tolist() # ppi genes + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + pathway_randomGeneListList_dict.update({pathway:[]}) + for p in range(self.permutation): + gene_list = np.random.choice(bg_gene_list, len(shareGeneList)).tolist() + pathway_randomGeneListList_dict[pathway].append(gene_list) + self.pathwayDictList = [pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict] + + # call function + self.netpea_parallel(self.rwrDf, self.pathwayDictList, self.n_cpu, self.out_path) + + def netpea_parallel(self, rwrDf, pathwayDictList, n_cpu, out_path): + # split dataframe + n_partitions = int(n_cpu) + split_list = np.array_split(rwrDf, n_partitions) + # parallel computing + pool = mp.Pool(int(n_cpu)) + df_list = pool.starmap(self.netpea, [(df, pathwayDictList) for df in split_list]) + pool.close() + pool.join() + print('{:}: comple {:} dfs'.format(datetime.now(), len(df_list))) + print(df_list[0]) + + # merge result of all cells and save to file + print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(df_list, axis=0) + zscore_fname = self.out_path + all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + #print(all_cell_zscore_df) + + + def netpea(self, 
rwrDf, pathwayDictList): + """return dataframe with cell by pathway""" + pathway_geneList_dict, pathway_shareGeneList_dict, pathway_randomGeneListList_dict = pathwayDictList + # convert to dataframe with headers=[pathway, #pathway genes, overlap genes] + pathway_df = self._merge_pathway_dict(pathway_geneList_dict, pathway_shareGeneList_dict) + # collect score of random gene list + print('{:}: collect score of random gene list'.format(datetime.now())) + cell_pathway_bgScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_bgScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, randomGeneListList in pathway_randomGeneListList_dict.items(): + bgScoreList = [rwr_df.loc[randomGeneList].mean() for randomGeneList in randomGeneListList] + cell_pathway_bgScoreList_dict[cell].update({pathway:bgScoreList}) + + # collect score of share gene list + print('{:}: collect score of share gene list'.format(datetime.now())) + cell_pathway_ScoreList_dict = {} # dict of dict + for cell in rwrDf.index: + cell_pathway_ScoreList_dict.update({cell:{}}) + # prepare data + rwr_df = rwrDf.loc[cell] # 1 by ppiG dataframe + # append aggregate score for each randomgenelist for each pathway + for pathway, shareGeneList in pathway_shareGeneList_dict.items(): + score = rwr_df.loc[shareGeneList].mean() + cell_pathway_ScoreList_dict[cell].update({pathway:score}) + # ztest to determin significance + print('{:}: ztest to determin significance'.format(datetime.now())) + zscore_dfs = [] + cell_pathway_zscore_dict = {} # collect zscore for each pathway + cell_pathway_ztest_dict = {} # collect zscore and pvalue for each pathway + for cell in rwrDf.index: + cell_pathway_zscore_dict.update({cell:{}}) + cell_pathway_ztest_dict.update({cell:{}}) + pathway_score_dict = cell_pathway_ScoreList_dict[cell] + pathway_bgList_dict = cell_pathway_bgScoreList_dict[cell] + for pathway in pathway_geneList_dict.keys(): + score = pathway_score_dict[pathway] + bgList = pathway_bgList_dict[pathway] + [zscore, pvalue] = self._cal_zscore(score, bgList) + cell_pathway_ztest_dict[cell].update({pathway: [zscore, pvalue]}) + cell_pathway_zscore_dict[cell].update({pathway:zscore}) + # save per-cell zscore + cell_zscore_df = pd.DataFrame(cell_pathway_zscore_dict[cell], index=[cell]) + zscore_dfs.append(cell_zscore_df) + # save per-cell ztest results + cell_bgtest_df = pd.DataFrame(cell_pathway_ztest_dict[cell], index=['zscore', 'pvalue']).T + cell_bgtest_df.index.name = 'pathway' + cell_bgtest_df = cell_bgtest_df.join(pathway_df) + #percell_fname = self.out_path + '.' 
+ cell + '.NetPEA.background_result.txt' + #cell_bgtest_df.to_csv(percell_fname, header=True, index=True, sep="\t") + # merge result of all cells and save to file + #print('{:}: merge result of all cells and save to file'.format(datetime.now())) + all_cell_zscore_df = pd.concat(zscore_dfs, axis=0) + #zscore_fname = self.out_path + '.NetPEA.zscore.txt' + #all_cell_zscore_df.to_csv(zscore_fname, header=True, index=True, sep="\t") + + # clear space + pathwayDictList = [] + return all_cell_zscore_df + + def _merge_pathway_dict(self, pathway_geneList_dict, pathway_shareGeneList_dict): + """return dataframe with headers = [pathway, #pathway genes, overlap genes]""" + pathway_lenG_dict = {pathway: len(geneList) for pathway, geneList in pathway_geneList_dict.items()} + pathway_strG_dict = {pathway: ",".join(geneList) for pathway, geneList in pathway_shareGeneList_dict.items()} + df1 = pd.DataFrame(pathway_lenG_dict.items(), columns=['pathway', '#pathway genes']) + df2 = pd.DataFrame(pathway_strG_dict.items(), columns=['pathway', 'overlap genes']) + return df1.set_index('pathway').join(df2.set_index('pathway')) + + def _find_overlaps(self, rwrDf, pathway_dict): + """return diction with pathway:geneList""" + # create result dictionary + result_dict = {} #pathway:sharedGeneList + # get ppiGenes + ppi_gene_list = rwrDf.columns.tolist() + # find overlaps + for pathway, geneList in pathway_dict.items(): + shareGene_list = sorted(list(set(geneList) & set(ppi_gene_list))) + result_dict.update({pathway:shareGene_list}) + return result_dict + + def _cal_zscore(self, score, scoreList): + """return zscore and pvalue by lookup table""" + if np.std(scoreList) != 0: + zscore = (score - np.mean(scoreList) ) / np.std(scoreList) + pvalue = scistat.norm.sf(abs(zscore)) # not pdf + #print('score={:}, scoreList={:}, zscore={:}, pvalue={:}'.format( + # score, scoreList[:10], zscore, pvalue)) + else: + zscore, pvalue = np.nan, np.nan + return [zscore, pvalue] + + def _cal_similarity_score(self, rwrDf, geneList): + """return similarity score by taking average of rwr for given geneList""" + return rwrDf.loc[geneList].mean() + + def _get_pathway_genes(self, gmt): + """ + Return pathwayStr_geneList_dict + + :param fin: file name to pathway in gmt format + :return pathway_dict: dictionary of pathway as key, genelist as values + """ + pathwayStr_geneList_dict = {} + with open(gmt, 'r') as f: + for line in f: + # extract fields + line = line.strip('\n').split('\t') + pathway_str = line[0] + gene_list = line[2:] + # update to dict + pathwayStr_geneList_dict.update({pathway_str:gene_list}) + return pathwayStr_geneList_dict + + def _df2dict(self, df): + """return 1 by N dataframe to dictionary of N keys""" + return df.to_dict('records')[0] # keys are column names = gene nams + + +if __name__ == "__main__": + # timer + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + start_time = datetime.now() + rwr_df = 'test.txt' #'/repo4/ytang4/PHD/db/GDSC/processed/GDSC.MUTCNV.STRING.RWR.txt' + pathway_gmt = '/repo4/ytang4/PHD/db/MSigdb/c2.cp.pid.v7.1.symbols.gmt' + # initiate + cell_pathway_df = NetPEA(rwr_df, pathway_gmt, permutation=3, seed=42, n_cpu=5, out_path='./test_netpea/GDSC') + spend = datetime.strptime(str(datetime.now()), datetimeFormat) - datetime.strptime(str(start_time),datetimeFormat) + print( '[Finished in {:}]'.format(spend) ) + diff --git a/PathDSP.def b/PathDSP.def new file mode 100644 index 0000000..b754001 --- /dev/null +++ b/PathDSP.def @@ -0,0 +1,59 @@ +Bootstrap: docker +From: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + 
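+# Illustrative usage sketch (host data path and command are assumptions; the
+# image name matches PathDSP.sif in .gitignore). A container from this recipe
+# is typically built and entered along these lines:
+#   singularity build PathDSP.sif PathDSP.def
+#   singularity exec --nv -B /path/to/candle_data_dir:/candle_data_dir PathDSP.sif <command>
+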
+%labels + MANTAINER Yuanhang Liu + +%setup + cp ./src/Singularity_gpu_fix.sh $SINGULARITY_ROOTFS + # add local url of this repository for testing + + +%environment + export PATH=$PATH:/usr/local/PathDSP + export IMPROVE_MODEL_DIR=/usr/local/PathDSP + export CANDLE_DATA_DIR=/candle_data_dir + export AUTHOR_DATA_DIR=/candle_data_dir + export PYTHONPATH=$PYTHONPATH:/usr/local/IMPROVE/:/usr/local/PathDSP/PathDSP/ + +%post + apt-get update -y + apt-get install wget -y + apt-get install -y gnupg + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 + apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys A4B469963BF863CC + + apt-get install build-essential -y + apt-get install git -y + apt-get install vim -y + apt-get install subversion -y + + # install gpu fix and clean up + cd / + chmod +x Singularity_gpu_fix.sh + ./Singularity_gpu_fix.sh + rm Singularity_gpu_fix.sh + + # these three need to be compiled and linked to the cuda libs. + # at the moment, what works for me is to build these in a + # singularity shell in a sandbox with the --nv flag to singularity set. + + + # create default internal candle_data_dir, map external candle_data_dir here + mkdir /candle_data_dir + + #install python modules and model prerequites + cd /usr/local + git clone https://github.com/JDACS4C-IMPROVE/IMPROVE.git + git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git + cd PathDSP + + # download conda + + /opt/conda/bin/conda env create -f environment_082223.yml --prefix /usr/local/conda_envs/PathDSP_env/ + #/opt/conda/bin/conda activate PathDSP_env + /usr/local/conda_envs/PathDSP_env/bin/pip install git+https://github.com/ECP-CANDLE/candle_lib@develop + + cp *.sh /usr/local/bin + chmod a+x /usr/local/bin/*.sh + chmod a+x /usr/local/PathDSP/*.sh \ No newline at end of file diff --git a/PathDSP_env_conda.yml b/PathDSP_env_conda.yml new file mode 100644 index 0000000..0f91f17 --- /dev/null +++ b/PathDSP_env_conda.yml @@ -0,0 +1,218 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - abseil-cpp=20211102.0 + - arrow-cpp=11.0.0 + - asttokens=2.2.1 + - aws-c-common=0.6.8 + - aws-c-event-stream=0.1.6 + - aws-checksums=0.1.11 + - aws-sdk-cpp=1.8.185 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - c-ares=1.19.0 + - ca-certificates=2023.05.30 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - comm=0.1.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - executing=1.2.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - gflags=2.2.2 + - giflib=5.2.1 + - glib=2.69.1 + - glog=0.5.0 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - grpc-cpp=1.48.2 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - importlib-metadata=6.8.0 + - importlib_metadata=6.8.0 + - intel-openmp=2023.1.0 + - ipykernel=6.25.1 + - ipython=8.14.0 + - jbig=2.1 + - jedi=0.19.0 + - jinja2=3.1.2 + - jpeg=9e + - jupyter_client=8.3.0 + - jupyter_core=4.12.0 + - kiwisolver=1.4.4 + - krb5=1.20.1 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + 
- libbrotlienc=1.0.9 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcurl=8.1.1 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libedit=3.1.20221030 + - libev=4.33 + - libevent=2.1.12 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnghttp2=1.52.0 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libprotobuf=3.20.3 + - libsodium=1.0.18 + - libsqlite=3.42.0 + - libssh2=1.10.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libthrift=0.15.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - matplotlib-inline=0.1.6 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - orc=1.7.4 + - packaging=23.1 + - pandas=1.5.3 + - parso=0.8.3 + - pcre=8.45 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - polars=0.18.15 + - prompt-toolkit=3.0.39 + - prompt_toolkit=3.0.39 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pyarrow=11.0.0 + - pycairo=1.24.0 + - pycparser=2.21 + - pygments=2.16.1 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyzmq=25.1.0 + - rdkit=2022.03.2 + - re2=2022.04.01 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - seaborn=0.12.2 + - setuptools=68.0.0 + - six=1.16.0 + - snappy=1.1.9 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - stack_data=0.6.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - traitlets=5.9.0 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - utf8proc=2.6.1 + - wcwidth=0.2.6 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zeromq=4.3.4 + - zipp=3.16.2 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - patsy==0.5.3 + - protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 diff --git a/PathDSP_infer_improve.py b/PathDSP_infer_improve.py new file mode 100755 index 0000000..97fc938 --- /dev/null +++ b/PathDSP_infer_improve.py @@ -0,0 +1,85 @@ +import os +import sys +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil + +from PathDSP_preprocess_improve import mkdir, preprocess +from PathDSP_train_improve import ( + predicting, + cal_time, +) +from improvelib.applications.drug_response_prediction.config import DRPInferConfig #NCK +import improvelib.utils as frm #NCK +from model_params_def import pathdsp_infer_params + +file_path = 
os.path.dirname(os.path.realpath(__file__)) + + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + #params = preprocess(params) + test_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="test") + test_df = pl.read_csv(params["input_data_dir"] + "/" + test_data_fname, separator = "\t").to_pandas() + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = mynet.FNN(Xtest_arr.shape[1]) + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["input_model_dir"]) + trained_net.load_state_dict(tch.load(modelpath)) + trained_net.eval() + #myutil.set_seed(params["seed_int"]) + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + test_dataset = mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['infer_batch'], shuffle=False) + start = datetime.now() + test_true, test_pred = predicting(trained_net, device, data_loader=test_dl) + + test_true = pd.Series(test_true) + test_pred = pd.Series(test_pred) + test_true_untrans = test_true.apply(lambda x: 10 ** (x) - 0.01) + test_pred_untrans = test_pred.apply(lambda x: 10 ** (x) - 0.01) + + frm.store_predictions_df( + y_true=test_true_untrans, + y_pred=test_pred_untrans, + stage="test", + y_col_name=params["y_col_name"], + output_dir=params["output_dir"], + input_dir=params["input_data_dir"] + ) + if params["calc_infer_scores"]: + test_scores = frm.compute_performance_scores( + y_true=test_true_untrans, + y_pred=test_pred_untrans, + stage="test", + metric_type=params["metric_type"], + output_dir=params["output_dir"] + ) + + print('Inference time :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + return True + +def main(args): + cfg = DRPInferConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_infer_params) + if_ran = run(params) + print("\nFinished inference of PathDSP model.") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/PathDSP_params.txt b/PathDSP_params.txt new file mode 100644 index 0000000..91f33f6 --- /dev/null +++ b/PathDSP_params.txt @@ -0,0 +1,63 @@ +[Preprocess] +data_format = .txt +input_supp_data_dir = ./author_data +train_split_file = CCLE_split_0_train.txt +val_split_file = CCLE_split_0_val.txt +test_split_file = CCLE_split_0_test.txt +y_data_files = [["response.tsv"]] +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_col_name = auc +bit_int = 128 +permutation_int = 3 +seed_int = 42 +cpu_int = 20 +drug_bits_file = drug_mbit_df.txt +dgnet_file = DGnet.txt +mutnet_file = MUTnet.txt +cnvnet_file = CNVnet.txt +exp_file = EXP.txt + +[Train] +data_format = .txt +model_file_name = model +model_file_format = .pt +#epochs = 20 +epochs = 800 +learning_rate = 0.0004 +batch_size = 12 +val_batch = 12 +loss = mse +patience = 30 +y_col_name = auc +cuda_name = cuda:0 +dropout = 0.1 + +[Infer] +y_col_name = auc +infer_batch = 256 +model_file_name = model +model_file_format = .pt +data_format 
= .txt +cuda_name = cuda:0 + +#gpu_int=0 +#gene_set = 'MSigdb.zip' +#ppi_data = 'STRING.zip' +#drug_target = 'raw_data.zip' +#raw_data_dir = "raw_data" +#train_data = 'PathDSP_train.txt' +#test_data = 'PathDSP_test.txt' +#val_data = 'PathDSP_val.txt' +#y_col_name = 'auc' +#metric='auc' +#data_type='CTRPv2' +#split=0 +#eps=0.00001 +#drug_hiddens='100,50,6' +#final_hiddens=6 +#optimizer = 'adam' +#improve_analysis='no' + + + diff --git a/PathDSP_preprocess_improve.py b/PathDSP_preprocess_improve.py new file mode 100644 index 0000000..e5ce173 --- /dev/null +++ b/PathDSP_preprocess_improve.py @@ -0,0 +1,441 @@ +import sys +import os +import polars as pl +import numpy as np +import pandas as pd +import copy +from functools import reduce +from pathlib import Path +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +import RWR as rwr +import NetPEA as pea +import gseapy as gp +import sklearn.model_selection as skms +from sklearn.preprocessing import StandardScaler +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig #NCK +from improvelib.utils import str2bool #NCK +import improvelib.utils as frm #NCK +import improvelib.applications.drug_response_prediction.drug_utils as drugs #NCK +import improvelib.applications.drug_response_prediction.omics_utils as omics #NCK +import improvelib.applications.drug_response_prediction.drp_utils as drp #NCK + +from model_params_def import pathdsp_preprocess_params + +file_path = Path(__file__).resolve().parent + +req_preprocess_args = [ll["name"] for ll in pathdsp_preprocess_params] + +def mkdir(directory): + directories = directory.split("/") + folder = "" + for d in directories: + folder += d + "/" + if not os.path.exists(folder): + print("creating folder: %s" % folder) + os.mkdir(folder) + + +def preprocess(params): + for i in [ + "drug_bits_file", + "dgnet_file", + "mutnet_file", + "cnvnet_file", + "exp_file", + ]: + params[i] = params["output_dir"] + "/" + params[i] + return params + + +# set timer +def cal_time(end, start): + """return time spent""" + # end = datetime.now(), start = datetime.now() + datetimeFormat = "%Y-%m-%d %H:%M:%S.%f" + spend = datetime.strptime(str(end), datetimeFormat) - datetime.strptime( + str(start), datetimeFormat + ) + return spend + +def response_out(params, split_file): + response_df = drp.DrugResponseLoader(params, split_file=split_file, verbose=True) + return response_df.dfs["response.tsv"] + + +def smile2bits(params): + start = datetime.now() + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + smile_df = drugs.DrugsLoader(params) + smile_df = smile_df.dfs['drug_SMILES.tsv'] + smile_df = smile_df.reset_index() + smile_df.columns = ["drug", "smile"] + smile_df = smile_df.drop_duplicates(subset=["drug"], keep="first").set_index("drug") + smile_df = smile_df.loc[smile_df.index.isin(response_df["improve_chem_id"]),] + bit_int = params["bit_int"] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row["smile"] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int)) + # drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append(tuple([drug] + mbit)) + if len(mbit) == bit_int: + n_drug += 1 + print("total {:} drugs with bits".format(n_drug)) + # 
convert dict to dataframe + colname_list = ["drug"] + ["mBit_" + str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + # drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + # drug_mbit_df.index.name = 'drug' + print("unique drugs={:}".format(len(drug_mbit_df["drug"].unique()))) + # save to file + drug_mbit_df.to_csv(params["drug_bits_file"], header=True, index=False, sep="\t") + print("[Finished in {:}]".format(cal_time(datetime.now(), start))) + + +def times_expression(rwr, exp): + """ + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + """ + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns) & set(exp.columns))) + + if len(cell_list) == 0: + print("ERROR! no overlapping cell lines") + sys.exit(1) + if len(gene_list) == 0: + print("ERROR! no overlapping genes") + sys.exit(1) + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list] * exp.loc[cell_list, gene_list] + # concat with other gene + out_gene_list = list(set(rwr.columns) - set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params["input_supp_data_dir"] + "/STRING/9606.protein_name.links.v11.0.pkl" + pathway_path = ( + params["input_supp_data_dir"] + "/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + log_transform = False + permutation_int = params["permutation_int"] + seed_int = params["seed_int"] + cpu_int = params["cpu_int"] + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + omics_data = omics.OmicsLoader(params) + if dtype == "DGnet": + drug_info = pd.read_csv(params["input_dir"] + "/x_data/drug_info.tsv", sep="\t") + drug_info["NAME"] = drug_info["NAME"].str.upper() + target_info = pd.read_csv( + params["input_supp_data_dir"] + "/data/DB.Drug.Target.txt", sep="\t" + ) + target_info = target_info.rename(columns={"drug": "NAME"}) + combined_df = pd.merge(drug_info, target_info, how="left", on="NAME").dropna( + subset=["gene"] + ) + combined_df = combined_df.loc[ + combined_df["improve_chem_id"].isin(response_df["improve_chem_id"]), + ] + restart_path = params["output_dir"] + "/drug_target.txt" + combined_df.iloc[:, -2:].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["dgnet_file"] + elif dtype == "MUTnet": + mutation_data = omics_data.dfs['cancer_mutation_count.tsv'] + #mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] > 0 + ] + mutation_data = mutation_data.loc[ + mutation_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["output_dir"] + "/mutation_data.txt" + mutation_data.iloc[:, 0:2].to_csv( + restart_path, sep="\t", header=True, index=False + ) + outpath = params["mutnet_file"] + else: + cnv_data = omics_data.dfs['cancer_discretized_copy_number.tsv'] + #cnv_data = cnv_data.reset_index() + 
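+        # Reshape the wide (improve_sample_id x gene) discretized copy-number
+        # matrix into long (sample, gene, value) rows and keep only non-neutral
+        # calls (value != 0), mirroring the mutation branch above; the first two
+        # columns are then written out as the restart sets consumed by RWR.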
cnv_data = pd.melt(cnv_data, id_vars="improve_sample_id").loc[ + lambda x: x["value"] != 0 + ] + cnv_data = cnv_data.loc[ + cnv_data["improve_sample_id"].isin(response_df["improve_sample_id"]), + ] + restart_path = params["output_dir"] + "/cnv_data.txt" + cnv_data.iloc[:, 0:2].to_csv(restart_path, sep="\t", header=True, index=False) + outpath = params["cnvnet_file"] + # perform Random Walk + print(datetime.now(), "performing random walk with restart") + rwr_df = rwr.RWR( + ppi_path, + restart_path, + restartProbFloat=0.5, + convergenceFloat=0.00001, + normalize="l1", + weighted=True, + ).get_prob() + # multiply with gene expression + if multiply_expression: + print( + datetime.now(), + "multiplying gene expression with random walk probability for genes were expressed", + ) + # exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + # exp_df = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + exp_df = omics_data.dfs['cancer_gene_expression.tsv'] + exp_df = exp_df.set_index(params['canc_col_name']) + rwr_df = times_expression(rwr_df, exp_df) + # rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), "performing network-based pathway enrichment") + cell_pathway_df = pea.NetPEA( + rwr_df, + pathway_path, + log_transform=log_transform, + permutation=permutation_int, + seed=seed_int, + n_cpu=cpu_int, + out_path=outpath, + ) + print("[Finished in {:}]".format(cal_time(datetime.now(), start_time))) + + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params["drug_bits_file"], sep="\t", index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={"drug": "drug_id"}) + DGnet = pd.read_csv(params["dgnet_file"], sep="\t", index_col=0) + DGnet = ( + DGnet.add_suffix("_dgnet").reset_index().rename(columns={"index": "drug_id"}) + ) + CNVnet = pd.read_csv(params["cnvnet_file"], sep="\t", index_col=0) + CNVnet = ( + CNVnet.add_suffix("_cnvnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + MUTnet = pd.read_csv(params["mutnet_file"], sep="\t", index_col=0) + MUTnet = ( + MUTnet.add_suffix("_mutnet") + .reset_index() + .rename(columns={"index": "sample_id"}) + ) + EXP = pd.read_csv(params["exp_file"], sep="\t", index_col=0) + EXP = EXP.add_suffix("_exp").reset_index().rename(columns={"index": "sample_id"}) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + # Extract relevant IDs + common_drug_ids = reduce( + np.intersect1d, + (drug_mbit_df["drug_id"], DGnet["drug_id"], response_df["drug_id"]), + ) + common_sample_ids = reduce( + np.intersect1d, + ( + CNVnet["sample_id"], + MUTnet["sample_id"], + EXP["sample_id"], + response_df["sample_id"], + ), + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + drug_mbit_df = ( + drug_mbit_df.loc[drug_mbit_df["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + DGnet = ( + DGnet.loc[DGnet["drug_id"].isin(common_drug_ids), :] + .set_index("drug_id") + .sort_index() + ) + CNVnet = ( + 
CNVnet.loc[CNVnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + MUTnet = ( + MUTnet.loc[MUTnet["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + EXP = ( + EXP.loc[EXP["sample_id"].isin(common_sample_ids), :] + .set_index("sample_id") + .sort_index() + ) + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + for i in ["train", "test", "val"]: + response_df = drp.DrugResponseLoader(params, split_file=params[i+"_split_file"], verbose=True) + response_df = response_df.dfs['response.tsv'] + response_df['exp_id'] = list(range(0,response_df.shape[0])) + response_df = response_df.rename( + columns={"improve_chem_id": "drug_id", "improve_sample_id": "sample_id"} + ) + response_df = response_df.loc[ + (response_df["drug_id"].isin(common_drug_ids)) + & (response_df["sample_id"].isin(common_sample_ids)), + :, + ] + + comb_data_mtx = pd.DataFrame( + { + "drug_id": response_df["drug_id"].values, + "sample_id": response_df["sample_id"].values, + "exp_id": response_df["exp_id"].values + } + ) + comb_data_mtx = ( + comb_data_mtx.set_index(["drug_id", "sample_id", "exp_id"]) + .join(drug_data, on="drug_id") + .join(sample_data, on="sample_id") + ) + ss = StandardScaler() + comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]] = ss.fit_transform(comb_data_mtx.iloc[:,params["bit_int"]:comb_data_mtx.shape[1]]) + ## add 0.01 to avoid possible inf values + comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + comb_data_mtx = comb_data_mtx.dropna() + + comb_data_mtx_to_save = comb_data_mtx['response'] + comb_data_mtx_to_save = comb_data_mtx_to_save.reset_index() + comb_data_mtx_to_save.rename(columns={'drug_id': 'improve_chem_id', 'sample_id': 'improve_sample_id'}, inplace=True) + #comb_data_mtx_to_save[params["y_col_name"]] = comb_data_mtx_to_save["response"].apply(lambda x: 10 ** (x) - 0.01) + rsp = drp.DrugResponseLoader(params, + split_file=params[i+"_split_file"], + verbose=False).dfs["response.tsv"] + rsp['exp_id'] = list(range(0,rsp.shape[0])) + ydata = rsp.merge(comb_data_mtx_to_save, on=['exp_id'], how='right') + print(comb_data_mtx_to_save) + print("YDATA") + print(ydata) + frm.save_stage_ydf(ydf=ydata, stage=i, output_dir=params["output_dir"]) + pl.from_pandas(comb_data_mtx).write_csv( + params["output_dir"] + "/" + frm.build_ml_data_file_name(data_format=params["data_format"], stage=i) +, separator="\t", has_header=True + ) + + +def run_ssgsea(params): + # expMat = improve_utils.load_gene_expression_data(sep='\t') + # expMat = drp.load_omics_data( + # params, + # omics_type="gene_expression", + # canc_col_name="improve_sample_id", + # gene_system_identifier="Gene_Symbol", + # ) + omics_data = omics.OmicsLoader(params) + expMat = omics_data.dfs['cancer_gene_expression.tsv'] + expMat = expMat.set_index(params['canc_col_name']) + + # response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], + # split=params['split'], split_type=["train", "test", "val"], + # y_col_name=params['metric']) + response_df = [response_out(params, params[split_file]) for split_file in ["train_split_file", "test_split_file", "val_split_file"]] + response_df = pd.concat(response_df, ignore_index=True) + expMat = expMat.loc[expMat.index.isin(response_df["improve_sample_id"]),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = ( + params["input_supp_data_dir"] + 
"/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt" + ) + gmt = pathway_path + tmp_str = params["output_dir"] + "/tmpdir_ssgsea/" + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea( + data=gct, # gct: a matrix of gene by sample + gene_sets=gmt, # gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, # 1000 + no_plot=True, + processes=params["cpu_int"], + # min_size=0, + format="png", + ) + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str + "ssGSEA.txt", header=True, index=True, sep="\t") + + f = open(tmp_str + "ssGSEA.txt", "r") + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params["exp_file"], header=True, index=True, sep="\t") + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + params = preprocess(params) + print("convert drug to bits.") + smile2bits(params) + print("compute DGnet.") + run_netpea(params, dtype="DGnet", multiply_expression=False) + print("compute MUTnet.") + run_netpea(params, dtype="MUTnet", multiply_expression=True) + print("compute CNVnet.") + run_netpea(params, dtype="CNVnet", multiply_expression=True) + print("compute EXP.") + run_ssgsea(params) + print("prepare final input file.") + prep_input(params) + + +def main(args): + cfg = DRPPreprocessConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_preprocess_params) + run(params) + + +if __name__ == "__main__": + start = datetime.now() + main(sys.argv[1:]) + print("[Preprocessing finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/PathDSP_train_improve.py b/PathDSP_train_improve.py new file mode 100644 index 0000000..b3eb9a6 --- /dev/null +++ b/PathDSP_train_improve.py @@ -0,0 +1,329 @@ +import os +import sys +import numpy as np +import pandas as pd +from datetime import datetime +import socket +import torch as tch +import torch.utils.data as tchud +import model_utils.myModel as mynet +import model_utils.myDataloader as mydl +import model_utils.myUtility as myutil +import polars as pl + +from improvelib.applications.drug_response_prediction.config import DRPTrainConfig #NCK +import improvelib.utils as frm #NCK + +from PathDSP_preprocess_improve import cal_time, preprocess +from model_params_def import pathdsp_train_params + +file_path = os.path.dirname(os.path.realpath(__file__)) + + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + +def predicting(model, device, data_loader): + """ Method to make predictions/inference. + This is used in *train.py and *infer.py + + Parameters + ---------- + model : pytorch model + Model to evaluate. + device : string + Identifier for hardware that will be used to evaluate model. + data_loader : pytorch data loader. + Object to load data to evaluate. + + Returns + ------- + total_labels: numpy array + Array with ground truth. + total_preds: numpy array + Array with inferred outputs. 
+ """ + model.to(device) + model.eval() + total_preds = tch.Tensor() + total_labels = tch.Tensor() + print("Make prediction for {} samples...".format(len(data_loader.dataset))) + with tch.no_grad(): + for i, (data_x, data_y) in enumerate(data_loader): + data_x, data_y = data_x.to(device), data_y.to(device) + data_y_pred = model(data_x) + # Is this computationally efficient? + total_preds = tch.cat((total_preds, data_y_pred.cpu()), 0) # preds to tensor + total_labels = tch.cat((total_labels, data_y.view(-1, 1).cpu()), 0) # labels to tensor + return total_labels.numpy().flatten(), total_preds.numpy().flatten() + + +def predict(net, device, test_dl): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + true_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + true_list.append(y_test.cpu().numpy()) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + true_list = np.vstack(true_list) + true_list = np.hstack(true_list).tolist() + # return + return true_list, prediction_list + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn, params): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=params['patience'], verbose=True, path= params["output_dir"] + "/checkpoint.pt") # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # train result + 
train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load(params["output_dir"] + '/checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + + +def run(params): + frm.create_outdir(outdir=params["output_dir"]) + modelpath = frm.build_model_path(model_file_name=params["model_file_name"], model_file_format=params["model_file_format"], model_dir=params["output_dir"]) + train_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="train") + val_data_fname = frm.build_ml_data_file_name(data_format=params["data_format"], stage="val") + #params = preprocess(params) + + # set parameters + #myutil.set_seed(params["seed_int"]) + ## set device + cuda_env_visible = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_env_visible is not None: + device = 'cuda:0' + params["CUDA_VISIBLE_DEVICES"] = cuda_env_visible + else: + device = myutil.get_device(uth=int(params['cuda_name'].split(':')[1])) + #print("Using device: " + device) + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + val_batch = params['val_batch'] + opt_fn = tch.optim.Adam + + # ------------------------------------------------------ + # [PathDSP] Prepare dataloaders + # ------------------------------------------------------ + print('loadinig data') + train_df = pl.read_csv(params["input_dir"] + "/" + train_data_fname, separator = "\t").to_pandas() + val_df = 
pl.read_csv(params["input_dir"] + "/" + val_data_fname, separator = "\t").to_pandas() + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=val_batch, shuffle=False) + + # ------------------------------------------------------ + # [PathDSP] Prepare model + # ------------------------------------------------------ + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + ## specify dropout rate + for module in net.modules(): + if isinstance(module, tch.nn.Dropout): + module.p = params['dropout'] + net.apply(init_weights) + + # ------------------------------------------------------ + # [PathDSP] Training + # ------------------------------------------------------ + print('start training process') + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn, params) + + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + loss_df.to_csv(params['output_dir'] + '/Val_Loss_orig.txt', header=True, index=False, sep="\t") + + # make train/valid loss plots + best_model = trained_net + tch.save(best_model.state_dict(), modelpath) + #best_model.eval() + # Compute predictions + #val_true, val_pred = predicting(best_model, device, valid_dl) # (groud truth), (predictions) + val_true, val_pred = predict(best_model, device, valid_dl) # (groud truth), (predictions) + + #comb_data_mtx["response"] = np.log10(response_df[params["y_col_name"]].values + 0.01) + val_true = pd.Series(val_true) + val_pred = pd.Series(val_pred) + val_true_untrans = val_true.apply(lambda x: 10 ** (x) - 0.01) + val_pred_untrans = val_pred.apply(lambda x: 10 ** (x) - 0.01) + # ----------------------------- + # [Req] Save raw predictions in dataframe + # ----------------------------- + # import ipdb; ipdb.set_trace() + frm.store_predictions_df( + y_true=val_true_untrans, + y_pred=val_pred_untrans, + stage="val", + y_col_name=params["y_col_name"], + output_dir=params["output_dir"], + input_dir=params["input_dir"] + ) + + # ----------------------------- + # [Req] Compute performance scores + # ----------------------------- + # import ipdb; ipdb.set_trace() + val_scores = frm.compute_performance_scores( + y_true=val_true_untrans, + y_pred=val_pred_untrans, + stage="val", + metric_type=params["metric_type"], + output_dir=params["output_dir"] + ) + return val_scores + + +def main(args): + cfg = DRPTrainConfig() + params = cfg.initialize_parameters( + file_path, + default_config="PathDSP_params.txt", + additional_definitions=pathdsp_train_params) + # get node name + params["node_name"] = socket.gethostname() + val_scores = 
run(params) + df = pd.DataFrame.from_dict(params, orient='index', columns=['value']) + df.to_csv(params["output_dir"] + '/params.txt',sep="\t") + + + +if __name__ == "__main__": + start = datetime.now() + main(sys.argv[1:]) + print("[Training finished in {:}]".format(cal_time(datetime.now(), start))) diff --git a/README.md b/README.md index 6c89fee..6bfbad5 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,160 @@ # PathDSP -Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores -# Requirments +This repository demonstrates how to use the [IMPROVE library v0.1.0](https://jdacs4c-improve.github.io/docs/v0.1.0/) for building a drug response prediction (DRP) model using PathDSP, and provides examples with the benchmark [cross-study analysis (CSA) dataset](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). -# Input format +This version, tagged as `v0.1.0`, introduces a new API which is designed to encourage broader adoption of IMPROVE and its curated models by the research community. -|drug|cell|feature_1|....|feature_n|drug_response| -|----|----|--------|----|--------|----| -|5-FU|03|0|....|0.02|-2.3| -|5-FU|23|1|....|0.04|-3.4| -Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing -# Usage: -```python -# run FNN -python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix +## Dependencies +Installation instuctions are detailed below in [Step-by-step instructions](#step-by-step-instructions). -Where input.txt should be in the input format shown above. -Example input file can be found at https://zenodo.org/record/7532963 +Conda `yml` file [PathDSP_env_conda](./PathDSP_env_conda.yml) + +ML framework: ++ [Torch](https://pytorch.org/) -- deep learning framework for building the prediction model + +IMPROVE dependencies: ++ [IMPROVE v0.1.0](https://jdacs4c-improve.github.io/docs/v0.1.0) + + + +## Dataset +Benchmark data for cross-study analysis (CSA) can be downloaded from this [site](https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/). + +The data tree is shown below: +``` +csa_data/raw_data/ +├── splits +│   ├── CCLE_all.txt +│   ├── CCLE_split_0_test.txt +│   ├── CCLE_split_0_train.txt +│   ├── CCLE_split_0_val.txt +│   ├── CCLE_split_1_test.txt +│   ├── CCLE_split_1_train.txt +│   ├── CCLE_split_1_val.txt +│   ├── ... 
+│   ├── GDSCv2_split_9_test.txt +│   ├── GDSCv2_split_9_train.txt +│   └── GDSCv2_split_9_val.txt +├── x_data +│   ├── cancer_copy_number.tsv +│   ├── cancer_discretized_copy_number.tsv +│   ├── cancer_DNA_methylation.tsv +│   ├── cancer_gene_expression.tsv +│   ├── cancer_miRNA_expression.tsv +│   ├── cancer_mutation_count.tsv +│   ├── cancer_mutation_long_format.tsv +│   ├── cancer_mutation.parquet +│   ├── cancer_RPPA.tsv +│   ├── drug_ecfp4_nbits512.tsv +│   ├── drug_info.tsv +│   ├── drug_mordred_descriptor.tsv +│   └── drug_SMILES.tsv +└── y_data + └── response.tsv +``` + + +## Model scripts and parameter file ++ `PathDSP_preprocess_improve.py` - takes benchmark data files and transforms into files for training and inference ++ `PathDSP_train_improve.py` - trains the PathDSP model ++ `PathDSP_infer_improve.py` - runs inference with the trained PathDSP model ++ `model_params_def.py` - definitions of parameters that are specific to the model ++ `PathDSP_params.txt` - default parameter file + + + +# Step-by-step instructions + +### 1. Clone the model repository +``` +git clone https://github.com/JDACS4C-IMPROVE/PathDSP +cd PathDSP +git checkout v0.1.0 +``` + + +### 2. Set computational environment +Create conda env using `yml` +``` +conda env create -f PathDSP_env_conda.yml -n PathDSP_env +conda activate PathDSP_env +``` + + +### 3. Run `setup_improve.sh`. +```bash +source setup_improve.sh ``` -# Data preprocessing -Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby +This will: +1. Download cross-study analysis (CSA) benchmark data into `./csa_data/`. +2. Clone IMPROVE repo (checkout tag `v0.1.0`) outside the PathDSP model repo +3. Set up `PYTHONPATH` (adds IMPROVE repo). +4. Download the model-specific supplemental data (aka author data) and set up the env variable `AUTHOR_DATA_DIR`. -# Reference -Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 + +### 4. Preprocess CSA benchmark data (_raw data_) to construct model input data (_ML data_) +```bash +python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir exp_result +``` + +Preprocesses the CSA data and creates train, validation (val), and test datasets. + +Generates: +* three model input data files: `train_data.txt`, `val_data.txt`, `test_data.txt` + +``` +exp_result +├── tmpdir_ssgsea +├── EXP.txt +├── cnv_data.txt +├── CNVnet.txt +├── DGnet.txt +├── MUTnet.txt +├── drug_mbit_df.txt +├── drug_target.txt +├── mutation_data.txt +├── test_data.txt +├── train_data.txt +├── val_data.txt +└── x_data_gene_expression_scaler.gz +``` + + +### 5. Train PathDSP model +```bash +python PathDSP_train_improve.py --input_dir exp_result --output_dir exp_result +``` + +Trains PathDSP using the model input data: `train_data.txt` (training), `val_data.txt` (for early stopping). 
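+
+Default hyperparameters are read from `PathDSP_params.txt`. Assuming the standard IMPROVE CLI behavior, individual parameters such as `epochs`, `learning_rate`, and `batch_size` can also be overridden on the command line, e.g.:
+```bash
+python PathDSP_train_improve.py --input_dir exp_result --output_dir exp_result --epochs 20
+```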
+ +Generates: +* trained model: `model.pt` +* predictions on val data (tabular data): `val_y_data_predicted.csv` +* prediction performance scores on val data: `val_scores.json` +``` +exp_result +├── model.pt +├── checkpoint.pt +├── Val_Loss_orig.txt +├── val_scores.json +└── val_y_data_predicted.csv +``` + + +### 6. Run inference on test data with the trained model +```bash +python PathDSP_infer_improve.py --input_data_dir exp_result --input_model_dir exp_result --output_dir exp_result --calc_infer_score True +``` + +Evaluates the performance on a test dataset with the trained model. + +Generates: +* predictions on test data (tabular data): `test_y_data_predicted.csv` +* prediction performance scores on test data: `test_scores.json` +``` +exp_result +├── test_scores.json +└── test_y_data_predicted.csv +``` diff --git a/README_deephyper_alpha.md b/README_deephyper_alpha.md new file mode 100644 index 0000000..de026cf --- /dev/null +++ b/README_deephyper_alpha.md @@ -0,0 +1,60 @@ +# Run HPO using DeepHyper on Lambda with conda + +## Install conda environment for the curated model (PathDSP) +Install PathDSP: +``` +cd +git clone https://github.com/JDACS4C-IMPROVE/PathDSP +``` + +Install IMPROVE and download data: +``` +cd PathDSP +source setup_improve.sh +``` + +Install PathDSP environment: +``` +cd PathDSP +export PathDSP_env=./PathDSP_env/ +conda env create -f PathDSP_env_conda.yml -p $PathDSP_env +``` + +## Perform preprocessing +Run the preprocess script. This script taks around 40 mins to complete. +The workflow assumes that your preprocessed data is at: "ml_data/{source}-{source}/split_{split}" + +``` +cd PathDSP +conda activate $PathDSP_env +python PathDSP_preprocess_improve.py --input_dir ./csa_data/raw_data --output_dir ./ml_data/CCLE-CCLE/split_0 +conda deactivate +``` + +## Install conda environment for DeepHyper +``` +module load openmpi +conda create -n dh python=3.9 -y +conda activate dh +conda install gxx_linux-64 gcc_linux-64 +pip install "deephyper[default]" +pip install mpi4py +``` + +## Perform HPO +If necesssary, activate environment: +``` +module load openmpi +conda activate dh +export PYTHONPATH=../IMPROVE +``` + +Run HPO: +``` +mpirun -np 10 python hpo_deephyper_subprocess.py +``` + +# Run HPO using DeepHyper on Polaris with conda + +# Run HPO using DeepHyper on Polaris with singularity + diff --git a/RWR.py b/RWR.py new file mode 100644 index 0000000..5bc82b9 --- /dev/null +++ b/RWR.py @@ -0,0 +1,162 @@ +""" +Return cell by gene probability dataframe + +""" + + +import argparse +import numpy as np +import pandas as pd +import scipy.sparse as scisp +import sklearn.preprocessing as skprc +from datetime import datetime + + +class RWR: + """ + Return probability matrix where columns are PPI genes + + :param ppiPathStr: string representing path to ppi file (with three columns) + :param restartPathStr: string representing path to restart file (i.e., input gene sets) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + :param normalize: string representing normalization method (choices=['l1', 'l2']) + :param weighted: boolean indicating weither to use weighted graph or not (if False, will set weight of all edges to 1) + :param outPathStr: string representing output path + """ + def __init__(self, ppiPathStr, restartPathStr, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True, outPathStr='./'): + # initiating + self.ppiPathStr = ppiPathStr + 
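+        # restartPathStr is expected to point at a tab-separated, two-column
+        # table with a header row (group, gene); each group's gene list becomes
+        # one restart set in get_prob().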
self.restartPathStr = restartPathStr + self.restartProbFloat = float(restartProbFloat) + self.convergenceFloat = float(convergenceFloat) + self.normalize = normalize + self.weighted = weighted + self.outPathStr = outPathStr + + + + def get_prob(self): + # load PPI graph + print('loading protein-protein interaction network.....') + self.adj_mat, self.name_idx_dict = self.load_graph(self.ppiPathStr, normalize=True, weighted=True) + # mapping dictionary of node index number: node name string + self.idx_name_dict = { idx:name for name, idx in self.name_idx_dict.items() } + + # load restart list (i.e., input gene sets) + print('collecting restart list') + df = pd.read_csv(self.restartPathStr, header=0, sep="\t") + df.columns = ['group', 'gene'] + # collect gene sets by group + grps = df.groupby('group') + grps_dict = {} + for grp in df['group'].unique(): + seed_list = grps.get_group(grp)['gene'].values.tolist() #input gene set + # check if input gene set in ppi and convert name to index number + seed_idx_list = [self.name_idx_dict[i] for i in seed_list if i in self.name_idx_dict.keys()] + # update to dictionary + grps_dict.update({ grp: {'gList':seed_list, 'ppiList':seed_idx_list} }) + + # perform random walk + print('performing random walk.....') + n_grps = len(grps_dict) + grp_list = list(grps_dict.keys()) + grp_prob_dict = {} + n_grp_has_no_ppiList = 0 # number of group has restart list not found on PPI network + for i in range(n_grps): + grp = grp_list[i] + if len(grps_dict[grp]['ppiList']) > 0: # has restart list on PPI network + prob_list = self.run_single_rwr(self.adj_mat, grps_dict[grp]['ppiList'], restartProbFloat=self.restartProbFloat, convergenceFloat=self.convergenceFloat) + + else: + n_grp_has_no_ppiList += 1 + prob_list = [0.0] * len(self.name_idx_dict) + + # update to result + grp_prob_dict.update( {grp:prob_list} ) + + # reformat result: dict2fataframe + print('finalizing result of probability matrix.....') + result_df = pd.DataFrame(grp_prob_dict) + result_df = result_df.T + result_df.columns = list(self.name_idx_dict.keys()) + return result_df # probability matrix grp by ppi genes + + + def load_graph(self, ppiPathStr, normalize=True, weighted=True): + """ + Return a graph in adjacency matrix format and its name string and correspoing index number mapping dictionary + + :param ppiPathStr: string representing file name of a graph in edge list format + :param name2index: boolean indicating whether to convert name string to index number or not + :param normalize: boolean indicating whether to perform column-wised normalization + """ + # load data + df = pd.read_pickle(ppiPathStr) + df.columns = ['source', 'target', 'weight'] + + # convert name to index + all_nodes = sorted(list(set( df['source'] ) | set( df['target'] ))) # retrieve name strings of all nodes + + # create name:index mapping dictionary + gnm_gid_dict = { all_nodes[i]:i for i in range(len(all_nodes)) } + + # replace name string with index number + df['source'].update(df['source'].map(gnm_gid_dict)) + df['target'].update(df['target'].map(gnm_gid_dict)) + + # use weighted graph or unweighted graph + if weighted == False: + df['weight'] = 1 # unweighted graph + + # create adjancency matrix + network_matrix = scisp.csr_matrix((df['weight'].values, (df['source'].values, df['target'].values)), + shape=(len(all_nodes), len(all_nodes)), dtype=float) # Create sparse matrix + network_matrix = (network_matrix + network_matrix.T) # Make the ajdacency matrix symmetric + network_matrix.setdiag(0) # Set diagnoals to zero + + # 
normalization: Normalize the rows of network_matrix because we are multiplying vector by matrix (from left) + if normalize == True: + network_matrix = skprc.normalize(network_matrix, norm='l1', axis=1) + + # return + return network_matrix, gnm_gid_dict + + def run_single_rwr(self, ppiAdjMat, restartList, restartProbFloat=0.5, convergenceFloat=0.00001): + """ + Return + + :param ppiAdjMat: adjacency matrix of protein-protein interaction network + :param restartList: list of restart nodes (i.e., gene list) + :param restartProbFloat: float representing restart probability (default: 0.5) + :param convergenceFloat: folat representing convergence criterion (default: 1e-5) + """ + # settings + convergence_criterion_float = float(convergenceFloat) # stops when vector L1 norm drops below 10^(-5) + restartProbFloat = float(restartProbFloat) + residual_float = 1.0 # difference between p^(t + 1) and p^(t) + max_iter = 1000 + + # initialze probability vector for restart nodes + prob_seed_list = [0] * ppiAdjMat.shape[0] + for idx in restartList: + prob_seed_list[idx] = 1.0 #1/float(len(restartList)) + prob_seed_arr = np.array(prob_seed_list) + steady_prob_old = prob_seed_arr + + # RWR + iter_int = 0 + #print('updating probability array.....') + while (residual_float > convergence_criterion_float): + # update vector + steady_prob_new = scisp.csr_matrix.dot(steady_prob_old, ppiAdjMat) + steady_prob_new *= (1 - restartProbFloat) + steady_prob_new += (prob_seed_arr * restartProbFloat) + + # Calculate the residual -- the sum of the absolute + # differences of the new node probability vector minus the old + # diff_norm = np.linalg.norm(np.subtract(p_t_1, p_t), 1) + residual_float = abs(steady_prob_new - steady_prob_old).sum() + steady_prob_old = steady_prob_new.copy() + return steady_prob_old + diff --git a/csa_bruteforce_params.ini b/csa_bruteforce_params.ini new file mode 100644 index 0000000..d8644d1 --- /dev/null +++ b/csa_bruteforce_params.ini @@ -0,0 +1,51 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +csa_outdir = ./bruteforce_output +y_col_name = auc +source_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +target_datasets = ["gCSI", "CCLE", "GDSCv1", "GDSCv2", "CTRPv2"] +split_nums = [] +model_name = PathDSP +only_cross_study = False +epochs = 800 +uses_cuda_name = True + +### Source and target data sources +## Set 1 - full analysis +#source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +#target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["CCLE", "gCSI", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +# source_datasets = ["CCLE", "GDSCv1"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + + +## Splits +#split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +# split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 9] + + + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git 
a/csa_bruteforce_params_def.py b/csa_bruteforce_params_def.py new file mode 100644 index 0000000..6b5d7c9 --- /dev/null +++ b/csa_bruteforce_params_def.py @@ -0,0 +1,53 @@ +from improvelib.utils import str2bool + +csa_bruteforce_params = [ + {"name": "cuda_name", + "type": str, + "default": "cuda:0", + "help": "Cuda device name.", + }, + {"name": "csa_outdir", + "type": str, + "default": "./run.csa.full", + "help": "Outdir for workflow.", + }, + {"name": "source_datasets", + "nargs" : "+", + "type": str, + "default": ['CCLE'], + "help": "source_datasets for cross study analysis" + }, + {"name": "target_datasets", + "nargs" : "+", + "type": str, + "default": ["CCLE", "gCSI"], + "help": "target_datasets for cross study analysis" + }, + {"name": "split_nums", + "nargs" : "+", + "type": str, + "default": ['0'], + "help": "Split of the source datasets for CSA" + }, + {"name": "only_cross_study", + "type": str2bool, + "default": False, + "help": "If only cross study analysis is needed" + }, + {"name": "model_name", + "type": str, + "default": 'graphdrp', ## Change the default to LGBM?? + "help": "Name of the deep learning model" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + {"name": "uses_cuda_name", + "type": str2bool, + "default": True, + "help": "Change to false if the model doesn't have a cuda_name parameter." + }, + +] \ No newline at end of file diff --git a/csa_bruteforce_wf.py b/csa_bruteforce_wf.py new file mode 100644 index 0000000..62f998f --- /dev/null +++ b/csa_bruteforce_wf.py @@ -0,0 +1,250 @@ +""" Python implementation of cross-study analysis workflow """ + + +import os +import subprocess +import warnings +from time import time +from pathlib import Path + +import pandas as pd + + +# IMPROVE imports +# from improvelib.initializer.config import Config +# from improvelib.initializer.stage_config import PreprocessConfig, TrainConfig, InferConfig +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig +# from improvelib.applications.drug_response_prediction.config import DRPTrainConfig +# from improvelib.applications.drug_response_prediction.config import DRPInferConfig +import improvelib.utils as frm +from csa_bruteforce_params_def import csa_bruteforce_params + + +def build_split_fname(source: str, split: int, phase: str): + """ Build split file name. If file does not exist continue """ + return f"{source_data_name}_split_{split}_{phase}.txt" + +def save_captured_output(result, process, MAIN_LOG_DIR, source_data_name, target_data_name, split): + result_file_name_stdout = MAIN_LOG_DIR / f"{source_data_name}-{target_data_name}-{split}-{process}-log.txt" + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + + + +class Timer: + """ Measure time. """ + def __init__(self): + self.start = time() + + def timer_end(self): + self.end = time() + return self.end - self.start + + def display_timer(self, print_fn=print): + time_diff = self.timer_end() + if (time_diff) // 3600 > 0: + print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) ) + else: + print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) ) + + +filepath = Path(__file__).resolve().parent + +print_fn = print +print_fn(f"File path: {filepath}") + +# =============================================================== +### CSA settings +# =============================================================== + + +cfg = DRPPreprocessConfig() # TODO submit github issue; too many logs printed; is it necessary? 
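# Helper behavior (inferred from the definitions above):
# - build_split_fname() returns names of the form "<SOURCE>_split_<N>_<phase>.txt",
#   e.g. "CCLE_split_0_train.txt"; note that it reads the global source_data_name set in
#   the loops below rather than its own `source` argument, so it is only valid inside them.
# - save_captured_output() writes each subprocess log to
#   MAIN_LOG_DIR/"<source>-<target>-<split>-<process>-log.txt".
# - Timer.display_timer() reports elapsed time in minutes, or in hours for runs longer
#   than one hour.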
+params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_bruteforce_params.ini", + additional_definitions=csa_bruteforce_params, + required=None +) +print("Loaded params") + +# Model scripts +model_name = params["model_name"] +preprocess_python_script = f'{model_name}_preprocess_improve.py' +train_python_script = f'{model_name}_train_improve.py' +infer_python_script = f'{model_name}_infer_improve.py' +print("Created script names") +# Specify dirs + +y_col_name = params['y_col_name'] +# maindir = Path(f"./{y_col_name}") +# maindir = Path(f"./0_{y_col_name}_improvelib") # main output dir +MAIN_CSA_OUTDIR = Path(params["csa_outdir"]) # main output dir +# Note! ML data and trained model should be saved to the same dir for inference script +MAIN_ML_DATA_DIR = MAIN_CSA_OUTDIR / 'ml_data' # output_dir_pp, input_dir_train, input_dir_infer +MAIN_MODEL_DIR = MAIN_CSA_OUTDIR / 'models' # output_dir_train, input_dir_infer +MAIN_INFER_DIR = MAIN_CSA_OUTDIR / 'infer' # output_dir infer +MAIN_LOG_DIR = MAIN_CSA_OUTDIR / 'logs' +frm.create_outdir(MAIN_LOG_DIR) +print("Created directory names") +print("MAIN_CSA_OUTDIR: ", MAIN_CSA_OUTDIR) +print("MAIN_ML_DATA_DIR: ", MAIN_ML_DATA_DIR) +print("MAIN_MODEL_DIR: ", MAIN_MODEL_DIR) +print("MAIN_INFER_DIR: ", MAIN_INFER_DIR) +print("MAIN_LOG_DIR: ", MAIN_LOG_DIR) +# Note! Here input_dir is the location of benchmark data +# TODO Should we set input_dir (and output_dir) for each models scrit? +splits_dir = Path(params['input_dir']) / params['splits_dir'] +print("Created splits path") +print("splits_dir: ", splits_dir) + +source_datasets = params["source_datasets"] +target_datasets = params["target_datasets"] +only_cross_study = params["only_cross_study"] +split_nums = params["split_nums"] +epochs = params["epochs"] +cuda_name = params["cuda_name"] +print("internal params") + + +# =============================================================== +### Generate CSA results (within- and cross-study) +# =============================================================== + +timer = Timer() +# Iterate over source datasets +# Note! The "source_data_name" iterations are independent of each other +print_fn(f"\nsource_datasets: {source_datasets}") +print_fn(f"target_datasets: {target_datasets}") +print_fn(f"split_nums: {split_nums}") +# breakpoint() +for source_data_name in source_datasets: + + # Get the split file paths + # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt + if len(split_nums) == 0: + # Get all splits + split_files = list((splits_dir).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + # num_splits = 1 + else: + # Use the specified splits + split_files = [] + for s in split_nums: + split_files.extend(list((splits_dir).glob(f"{source_data_name}_split_{s}_*.txt"))) + + files_joined = [str(s) for s in split_files] + + # -------------------- + # Preprocess and Train + # -------------------- + for split in split_nums: + print_fn(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. Otherwise, continue to the next split. 
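        # Worked example (illustrative): split ids are parsed from file names such as
        # "CCLE_split_3_train.txt", where split("split_")[1].split("_")[0] yields "3".
        # For each (source, target, split) triple, the steps below write to:
        #   <csa_outdir>/ml_data/<source>-<target>/split_<N>   (preprocessed data)
        #   <csa_outdir>/models/<source>/split_<N>             (trained model)
        #   <csa_outdir>/infer/<source>-<target>/split_<N>     (inference results)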
+ for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + for target_data_name in target_datasets: + if only_cross_study and (source_data_name == target_data_name): + continue # only cross-study + print_fn(f"\nSource data: {source_data_name}") + print_fn(f"Target data: {target_data_name}") + + ml_data_dir = MAIN_ML_DATA_DIR / f"{source_data_name}-{target_data_name}" / \ + f"split_{split}" + model_dir = MAIN_MODEL_DIR / f"{source_data_name}" / f"split_{split}" + infer_dir = MAIN_INFER_DIR / f"{source_data_name}-{target_data_name}" / \ + f"split_{split}" # AP + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # p1 (none): Preprocess train data + # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # placeholder for LC + timer_preprocess = Timer() + print_fn("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + print_fn(f"train_split_file: {train_split_file}") + print_fn(f"val_split_file: {val_split_file}") + print_fn(f"test_split_file: {test_split_file}") + preprocess_run = ["python", preprocess_python_script, + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], # str("./csa_data/raw_data"), + "--output_dir", str(ml_data_dir), + "--y_col_name", str(y_col_name) + ] + result = subprocess.run(preprocess_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) + print(f"returncode = {result.returncode}") + save_captured_output(result, "preprocess", MAIN_LOG_DIR, source_data_name, target_data_name, split) + timer_preprocess.display_timer(print_fn) + + # p2 (p1): Train model + # Train a single model for a given [source, split] pair + # Train using train samples and early stop using val samples + if model_dir.exists() is False: + timer_train = Timer() + print_fn("\nTrain") + print_fn(f"ml_data_dir: {ml_data_dir}") + print_fn(f"model_dir: {model_dir}") + if params["uses_cuda_name"]: + train_run = ["python", train_python_script, + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(epochs), # DL-specific + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name + ] + else: + train_run = ["python", train_python_script, + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(epochs), # DL-specific + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) + print(f"returncode = {result.returncode}") + save_captured_output(result, "train", MAIN_LOG_DIR, source_data_name, "none", split) + timer_train.display_timer(print_fn) + + # Infer + # p3 (p1, p2): Inference + timer_infer = Timer() + print_fn("\nInfer") + if params["uses_cuda_name"]: + infer_run = ["python", infer_python_script, + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + 
"--output_dir", str(infer_dir), + "--cuda_name", cuda_name, # DL-specific + "--y_col_name", y_col_name, + "--calc_infer_scores", "true" + ] + else: + infer_run = ["python", infer_python_script, + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--y_col_name", y_col_name, + "--calc_infer_scores", "true" + ] + result = subprocess.run(infer_run, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines=True) + print(f"returncode = {result.returncode}") + save_captured_output(result, "infer", MAIN_LOG_DIR, source_data_name, target_data_name, split) + timer_infer.display_timer(print_fn) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +timer.display_timer(print_fn) +print_fn('Finished full cross-study run.') \ No newline at end of file diff --git a/csa_params.ini b/csa_params.ini new file mode 100644 index 0000000..f9261e3 --- /dev/null +++ b/csa_params.ini @@ -0,0 +1,30 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +y_col_name = auc +use_singularity = False +hyperparameters_file = ./hyperparameters_default.json +model_name = PathDSP +only_cross_study = False +epochs = 800 +model_environment = PathDSP_env + + +# Full-scale CSA +# output_dir = ./parsl_csa_exp +# source_datasets = ["CCLE","CTRPv2","gCSI","GDSCv1","GDSCv2"] +# target_datasets = ["CCLE","CTRPv2","gCSI","GDSCv1","GDSCv2"] +# split = ["0","1","2","3","4","5","6","7","8","9"] +# available_accelerators = ["0","1","2","3","4","5","6","7"] + +# Exp 3 +output_dir = ./parsl_csa_exp3 +source_datasets = ["CCLE","GDSCv2","gCSI"] +target_datasets = ["CCLE","GDSCv2","gCSI"] +split = ["0","1"] +available_accelerators = ["4","5","6","7"] + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/csa_params.test.ini b/csa_params.test.ini new file mode 100644 index 0000000..2fad301 --- /dev/null +++ b/csa_params.test.ini @@ -0,0 +1,20 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +output_dir=./improve_output +y_col_name = auc +use_singularity = False +hyperparameters_file = ./hyperparameters_default.json +source_datasets = ["gCSI"] +target_datasets = ["gCSI", "CCLE"] +split = ["0", "1"] +model_name = PathDSP +only_cross_study = False +epochs = 3 +available_accelerators=["4","5"] +model_environment = PathDSP_env + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/csa_params_def.py b/csa_params_def.py new file mode 100644 index 0000000..75daeb5 --- /dev/null +++ b/csa_params_def.py @@ -0,0 +1,62 @@ + +additional_definitions = [ + {"name": "source_datasets", + "nargs" : "+", + "type": str, + "default": ['CCLE'], + "help": "source_datasets for cross study analysis" + }, + {"name": "target_datasets", + "nargs" : "+", + "type": str, + "default": ["CCLE", "gCSI"], + "help": "target_datasets for cross study analysis" + }, + {"name": "split", + "nargs" : "+", + "type": str, + "default": ['0'], + "help": "Split of the source datasets for CSA" + }, + {"name": "only_cross_study", + "type": bool, + "default": False, + "help": "If only cross study analysis is needed" + }, + {"name": "model_name", + "type": str, + "default": 'graphdrp', ## Change the default to LGBM?? + "help": "Name of the deep learning model" + }, + {"name": "model_environment", + "type": str, + "default": '', ## Change the default to LGBM?? 
+ "help": "Name of your model conda environment" + }, + {"name": "hyperparameters_file", + "type": str, + "default": 'hyperparameters_default.json', + "help": "json file containing optimized hyperparameters per dataset" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + {"name": "available_accelerators", + "nargs" : "+", + "type": str, + "default": ["0", "1"], + "help": "GPU IDs to assign jobs" + }, + {"name": "use_singularity", + "type": bool, + "default": True, + "help": "Do you want to use singularity image for running the model?" + }, + {"name": "singularity_image", + "type": str, + "default": '', + "help": "Singularity image file of the model" + } + ] \ No newline at end of file diff --git a/csa_wf_v3.py b/csa_wf_v3.py new file mode 100644 index 0000000..066b726 --- /dev/null +++ b/csa_wf_v3.py @@ -0,0 +1,292 @@ +""" Python implementation of cross-study analysis workflow """ +# cuda_name = "cuda:6" +cuda_name = "cuda:7" + +import os +import subprocess +import warnings +from time import time +from pathlib import Path + +import pandas as pd + +# IMPROVE imports +from improve import framework as frm +# import improve_utils +# from improve_utils import improve_globals as ig + +# GraphDRP imports +# TODO: change this for your model +import PathDSP_preprocess_improve +import PathDSP_train_improve +import PathDSP_preprocess_improve + +# from ap_utils.classlogger import Logger +# from ap_utils.utils import get_print_func, Timer + + +class Timer: + """ Measure time. """ + def __init__(self): + self.start = time() + + def timer_end(self): + self.end = time() + return self.end - self.start + + def display_timer(self, print_fn=print): + time_diff = self.timer_end() + if time_diff // 3600 > 0: + print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) ) + else: + print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) ) + + +fdir = Path(__file__).resolve().parent + +y_col_name = "auc" +# y_col_name = "auc1" + +maindir = Path(f"./{y_col_name}") +MAIN_ML_DATA_DIR = Path(f"./{maindir}/ml.data") +MAIN_MODEL_DIR = Path(f"./{maindir}/models") +MAIN_INFER_OUTDIR = Path(f"./{maindir}/infer") + +# Check that environment variable "IMPROVE_DATA_DIR" has been specified +if os.getenv("IMPROVE_DATA_DIR") is None: + raise Exception("ERROR ! Required system variable not specified. \ + You must define IMPROVE_DATA_DIR ... 
Exiting.\n") +os.environ["CANDLE_DATA_DIR"] = os.environ["IMPROVE_DATA_DIR"] + +params = frm.initialize_parameters( + fdir, + default_model="csa_workflow_params.txt", +) + +main_datadir = Path(os.environ["IMPROVE_DATA_DIR"]) +raw_datadir = main_datadir / params["raw_data_dir"] +x_datadir = raw_datadir / params["x_data_dir"] +y_datadir = raw_datadir / params["y_data_dir"] +splits_dir = raw_datadir / params["splits_dir"] + +# lg = Logger(main_datadir/"csa.log") +print_fn = print +# print_fn = get_print_func(lg.logger) +print_fn(f"File path: {fdir}") + +### Source and target data sources +## Set 1 - full analysis +# source_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +## Set 2 - smaller datasets +# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# source_datasets = ["GDSCv1", "CTRPv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +## Set 3 - full analysis for a single source +# source_datasets = ["CCLE"] +# source_datasets = ["CTRPv2"] +source_datasets = ["GDSCv1"] +target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"] +# target_datasets = ["CCLE", "gCSI", "GDSCv2"] +## Set 4 - same source and target +# source_datasets = ["CCLE"] +# target_datasets = ["CCLE"] +## Set 5 - single source and target +# source_datasets = ["GDSCv1"] +# target_datasets = ["CCLE"] + +# only_cross_study = False +only_cross_study = True + + +## Splits +# split_nums = [] # all splits +# split_nums = [0] +# split_nums = [4, 7] +split_nums = [1, 4, 7] +# split_nums = [1, 3, 5, 7, 9] + +## Parameters of the experiment/run/workflow +# TODO: this should be stored as the experiment metadata that we can go back check +# epochs = 2 +# epochs = 30 +# epochs = 50 +epochs = 70 +# epochs = 100 +# epochs = 150 +# config_file_name = "csa_params.txt" +# config_file_path = fdir/config_file_name + +def build_split_fname(source, split, phasea): + """ Build split file name. If file does not exist continue """ + return f"{source_data_name}_split_{split}_{phase}.txt" + +# =============================================================== +### Generate CSA results (within- and cross-study) +# =============================================================== + +timer = Timer() +# Iterate over source datasets +# Note! The "source_data_name" iterations are independent of each other +print_fn(f"\nsource_datasets: {source_datasets}") +print_fn(f"target_datasets: {target_datasets}") +print_fn(f"split_nums: {split_nums}") +# import pdb; pdb.set_trace() +for source_data_name in source_datasets: + + # Get the split file paths + # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt + if len(split_nums) == 0: + # Get all splits + split_files = list((splits_dir).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + # num_splits = 1 + else: + # Use the specified splits + split_files = [] + for s in split_nums: + split_files.extend(list((splits_dir).glob(f"{source_data_name}_split_{s}_*.txt"))) + + files_joined = [str(s) for s in split_files] + + # -------------------- + # Preprocess and Train + # -------------------- + # import pdb; pdb.set_trace() + for split in split_nums: + print_fn(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. 
Otherwise, continue to the next split. + # split = 11 + # files_joined = [str(s) for s in split_files] + # TODO: check this! + for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + # print(f"{phase}: {fname}") + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)") + continue + + # import pdb; pdb.set_trace() + for target_data_name in target_datasets: + if only_cross_study and (source_data_name == target_data_name): + continue # only cross-study + print_fn(f"\nSource data: {source_data_name}") + print_fn(f"Target data: {target_data_name}") + + # EXP_ML_DATA_DIR = ig.ml_data_dir/f"{source_data_name}-{target_data_name}"/f"split_{split}" + ml_data_outdir = MAIN_ML_DATA_DIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire target dataset + test_split_file = f"{target_data_name}_all.txt" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # p1 (none): Preprocess train data + # import pdb; pdb.set_trace() + # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt")) # TODO: placeholder for lc analysis + timer_preprocess = Timer() + # ml_data_path = graphdrp_preprocess_improve.main([ + # "--train_split_file", f"{source_data_name}_split_{split}_train.txt", + # "--val_split_file", f"{source_data_name}_split_{split}_val.txt", + # "--test_split_file", str(test_split_file_name), + # "--ml_data_outdir", str(ml_data_outdir), + # "--y_col_name", y_col_name + # ]) + print_fn("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + # test_split_file = f"{source_data_name}_split_{split}_test.txt" + print_fn(f"train_split_file: {train_split_file}") + print_fn(f"val_split_file: {val_split_file}") + print_fn(f"test_split_file: {test_split_file}") + print_fn(f"ml_data_outdir: {ml_data_outdir}") + # import pdb; pdb.set_trace() + preprocess_run = ["python", + "PathDSP_preprocess_improve.py", + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--ml_data_outdir", str(ml_data_outdir), + "--y_col_name", str(y_col_name) + ] + result = subprocess.run(preprocess_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_preprocess.display_timer(print_fn) + + # p2 (p1): Train model + # Train a single model for a given [source, split] pair + # Train using train samples and early stop using val samples + # import pdb; pdb.set_trace() + model_outdir = MAIN_MODEL_DIR/f"{source_data_name}"/f"split_{split}" + if model_outdir.exists() is False: + train_ml_data_dir = ml_data_outdir + val_ml_data_dir = ml_data_outdir + timer_train = Timer() + # graphdrp_train_improve.main([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir), + # "--epochs", str(epochs), # available in config_file + # # "--ckpt_directory", str(MODEL_OUTDIR), # TODO: we'll use candle known param ckpt_directory instead of model_outdir + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nTrain") + print_fn(f"train_ml_data_dir: {train_ml_data_dir}") + 
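                # Note: train_ml_data_dir and val_ml_data_dir both point at ml_data_outdir
                # because the preprocessing step writes the train/val/test ML data for a
                # (source, target, split) combination into a single folder. Also note that
                # subprocess.run(..., check=True) raises CalledProcessError on a non-zero
                # exit code, so a failed preprocess, train, or infer step stops the whole
                # cross-study loop rather than being skipped.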
print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"model_outdir: {model_outdir}") + # import pdb; pdb.set_trace() + train_run = ["python", + "PathDSP_train_improve.py", + "--train_ml_data_dir", str(train_ml_data_dir), + "--val_ml_data_dir", str(val_ml_data_dir), + "--model_outdir", str(model_outdir), + "--epochs", str(epochs), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(train_run, capture_output=True, + text=True, check=True) + # print(result.stdout) + # print(result.stderr) + timer_train.display_timer(print_fn) + + # Infer + # p3 (p1, p2): Inference + # import pdb; pdb.set_trace() + test_ml_data_dir = ml_data_outdir + model_dir = model_outdir + infer_outdir = MAIN_INFER_OUTDIR/f"{source_data_name}-{target_data_name}"/f"split_{split}" + timer_infer = Timer() + # graphdrp_infer_improve.main([ + # "--test_ml_data_dir", str(test_ml_data_dir), + # "--model_dir", str(model_dir), + # "--infer_outdir", str(infer_outdir), + # # "--cuda_name", "cuda:5" + # ]) + print_fn("\nInfer") + print_fn(f"test_ml_data_dir: {test_ml_data_dir}") + #print_fn(f"val_ml_data_dir: {val_ml_data_dir}") + print_fn(f"infer_outdir: {infer_outdir}") + # import pdb; pdb.set_trace() + infer_run = ["python", + "PathDSP_infer_improve.py", + "--test_ml_data_dir", str(test_ml_data_dir), + "--model_dir", str(model_dir), + "--infer_outdir", str(infer_outdir), + "--cuda_name", cuda_name, + "--y_col_name", y_col_name + ] + result = subprocess.run(infer_run, capture_output=True, + text=True, check=True) + timer_infer.display_timer(print_fn) + +timer.display_timer(print_fn) +print_fn("Finished a full cross-study run.") diff --git a/csa_workflow_params.txt b/csa_workflow_params.txt new file mode 100644 index 0000000..66a69f8 --- /dev/null +++ b/csa_workflow_params.txt @@ -0,0 +1,8 @@ +[Global_Params] +model_name = "CSA_workflow" + +[CSA_Workflow] +raw_data_dir = "raw_data" +x_data_dir = "x_data" +y_data_dir = "y_data" +splits_dir = "splits" diff --git a/download_author_data.sh b/download_author_data.sh new file mode 100644 index 0000000..999fc4c --- /dev/null +++ b/download_author_data.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# arg 1: output directory to download model-specific data +# If run via container, it needs to be downloaded under the csa_data folder + +OUTPUT_DIR=$1 + +# Check if the data is already downloaded +if [ -f "$OUTPUT_DIR/.downloaded" ]; then + echo "Data present, skipping download" +# Download data if no other download is in progress +elif [ ! -f "$OUTPUT_DIR/.downloading_author_data" ]; then + touch "$OUTPUT_DIR/.downloading_author_data" + # Download files + # Unzip files + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/MSigdb.zip + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/raw_data.zip + wget -P $OUTPUT_DIR https://zenodo.org/record/6093818/files/STRING.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/MSigdb.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/raw_data.zip + unzip -d $OUTPUT_DIR $OUTPUT_DIR/STRING.zip + touch "$OUTPUT_DIR/.downloaded" + rm "$OUTPUT_DIR/.downloading_author_data" +else + # Wait for other download to finish + iteration=0 + echo "Waiting for external download" + while [ -f "$OUTPUT_DIR/.downloading_author_data" ]; do + iteration=$((iteration + 1)) + if [ "$iteration" -gt 10 ]; then + # Download takes too long, exit and warn user + echo "Check output directory, download still in progress after $iteration minutes." 
+ exit 1 + fi + sleep 60 + done +fi diff --git a/download_csa.sh b/download_csa.sh new file mode 100644 index 0000000..7bfc04c --- /dev/null +++ b/download_csa.sh @@ -0,0 +1,2 @@ +# wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +wget --cut-dirs=8 -P ./ -nH -np -m https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/ \ No newline at end of file diff --git a/execute_in_conda.sh b/execute_in_conda.sh new file mode 100644 index 0000000..303f81b --- /dev/null +++ b/execute_in_conda.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +model_env=$1 ; shift +script=$1 ; shift +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $model_env +CMD="python ${script} $@" + +echo "running command ${CMD}" + +$CMD diff --git a/get_hosts_polaris.py b/get_hosts_polaris.py new file mode 100644 index 0000000..196008a --- /dev/null +++ b/get_hosts_polaris.py @@ -0,0 +1,14 @@ +import sys + +if __name__ == "__main__": + ranks_per_node = 4 + fname = sys.argv[1] + output = "" + with open(fname, "r") as f: + for i, line in enumerate(f): + line = line.strip("\n") + if i == 0: + output += f"{line}" + for _ in range(ranks_per_node): + output += f",{line}" + print(output) \ No newline at end of file diff --git a/hpo_deephyper_hyperparameters.py b/hpo_deephyper_hyperparameters.py new file mode 100644 index 0000000..56c446b --- /dev/null +++ b/hpo_deephyper_hyperparameters.py @@ -0,0 +1,43 @@ +hyperparams = [ + {"name": "batch_size", + "type": int, + "min": 8, + "max": 512, + "default": 64, + "log_uniform": True + }, + {"name": "learning_rate", + "type": float, + "min": 1e-6, + "max": 1e-2, + "default": 0.001, + "log_uniform": True + }, + { + "name": "dropout", + "type": float, + "min": 0, + "max": 0.5, + "default": 0, + "log_uniform": False + } +] + + +''' +{ +"name": "dropout", +"type": float, +"min": 0, +"max": 0.5, +"default": 0, +"log_uniform": False +} + +{ +"name": "early_stopping", +"type": "categorical", +"choices": [True, False], +"default": False +} +''' \ No newline at end of file diff --git a/hpo_deephyper_params.ini b/hpo_deephyper_params.ini new file mode 100644 index 0000000..2178f9c --- /dev/null +++ b/hpo_deephyper_params.ini @@ -0,0 +1,21 @@ +[DEFAULT] +input_dir = ./csa_data/raw_data +ml_data_dir = ./ml_data/CCLE-CCLE/split_0 +y_col_name = auc +model_name = PathDSP +model_scripts_dir = ./ +model_environment = ./PathDSP_env/ +epochs = 3 +output_dir = ./test +source = CCLE +split = 0 +max_evals = 5 +interactive_session = True + + + +[Preprocess] + +[Train] + +[Infer] \ No newline at end of file diff --git a/hpo_deephyper_params_def.py b/hpo_deephyper_params_def.py new file mode 100644 index 0000000..0588ea3 --- /dev/null +++ b/hpo_deephyper_params_def.py @@ -0,0 +1,62 @@ +additional_definitions = [ + {"name": "source", + "type": str, + "default": "GDSCv1", + "help": "source dataset for HPO" + }, + {"name": "split", + "type": str, + "default": "4", + "help": "Split of the source datasets for HPO" + }, + {"name": "model_name", + "type": str, + "default": 'PathDSP', + "help": "Name of the deep learning model" + }, + {"name": "model_scripts_dir", + "type": str, + "default": './', + "help": "Path to the model repository" + }, + {"name": "model_environment", + "type": str, + "default": '', + "help": "Name of your model conda environment" + }, + {"name": "epochs", + "type": int, + "default": 10, + "help": "Number of epochs" + }, + 
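    # For reference (inferred from hpo_deephyper_hyperparameters.py and the HpProblem loop in
    # hpo_deephyper_subprocess.py), the tunable search space is roughly:
    #   batch_size    in (8, 512)      log-uniform, default 64
    #   learning_rate in (1e-6, 1e-2)  log-uniform, default 1e-3
    #   dropout       in (0, 0.5)      uniform,     default 0
    # Each sampled value is forwarded to the train script by
    # hpo_deephyper_subprocess_train.sh as "--<name> <value>".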
{"name": "use_singularity", + "type": bool, + "default": True, + "help": "Do you want to use singularity image for running the model?" + }, + {"name": "singularity_image", + "type": str, + "default": '', + "help": "Singularity image file of the model" + }, + {"name": "val_loss", + "type": str, + "default": 'mse', + "help": "Type of loss for validation" + }, + {"name": "max_evals", + "type": int, + "default": 20, + "help": "Number of evaluations" + }, + {"name": "interactive_session", + "type": bool, + "default": True, + "help": "Are you using an interactive session?" + }, + {"name": "ml_data_dir", + "type": str, + "default": './', + "help": "Location of the preprocessed data." + } + ] \ No newline at end of file diff --git a/hpo_deephyper_subprocess.py b/hpo_deephyper_subprocess.py new file mode 100644 index 0000000..01c950e --- /dev/null +++ b/hpo_deephyper_subprocess.py @@ -0,0 +1,171 @@ +""" +Before running this script, first need to preprocess the data. +This can be done by running preprocess_example.sh + +and the env vars $PYTHONPATH is set: +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +It also assumes that your processed training and validation data is in ml_data_dir. +Model output files will be saved in output_dir/{source}/split_{split}. + +mpirun -np 10 python hpo_subprocess.py +""" +# import copy +import json +import subprocess +import pandas as pd +import os +import time +from pathlib import Path +import logging +import mpi4py +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +from mpi4py import MPI +import socket +import hpo_deephyper_params_def +from hpo_deephyper_hyperparameters import hyperparams +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# --------------------- +# Initialize parameters for DeepHyper HPO +# --------------------- +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +global params +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="hpo_deephyper_params.ini", + additional_definitions=hpo_deephyper_params_def.additional_definitions +) +output_dir = Path(params['output_dir']) +if output_dir.exists() is False: + os.makedirs(output_dir, exist_ok=True) +#params['ml_data_dir'] = f"ml_data/{params['source']}-{params['source']}/split_{params['split']}" +params['model_outdir'] = f"{params['output_dir']}/{params['source']}/split_{params['split']}" +params['script_name'] = os.path.join(params['model_scripts_dir'],f"{params['model_name']}_train_improve.py") + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() + +if params['interactive_session']: + num_gpus_per_node = 2 + os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) + cuda_name = "cuda:" + str(rank % num_gpus_per_node) +else: + # CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh + local_rank = os.environ["PMI_LOCAL_RANK"] + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - 
%(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +for hp in hyperparams: + if hp['type'] == "categorical": + print("not implemented yet") + else: + if hp['log_uniform']: + problem.add_hyperparameter((hp['min'], hp['max'], "log-uniform"), + hp['name'], default_value=hp['default']) + else: + problem.add_hyperparameter((hp['min'], hp['max']), + hp['name'], default_value=hp['default']) + +params['hyperparams'] = [d['name'] for d in hyperparams] + + +# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + + +@profile +def run(job, optuna_trial=None): + model_outdir_job_id = Path(params['model_outdir'] + f"/{job.id}") + #learning_rate = job.parameters["learning_rate"] + #batch_size = job.parameters["batch_size"] + + train_run = ["bash", "hpo_deephyper_subprocess_train.sh", + str(params['model_environment']), + str(params['script_name']), + str(params['ml_data_dir']), + str(model_outdir_job_id), + str(params['epochs']), + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ] + for hp in params['hyperparams']: + train_run = train_run + [str(hp)] + train_run = train_run + [str(job.parameters[hp])] + + print(f"Launching run: ") + print(train_run) + subprocess_res = subprocess.run(train_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + # Logger + print(f"returncode = {subprocess_res.returncode}") + result_file_name_stdout = model_outdir_job_id / 'logs.txt' + if model_outdir_job_id.exists() is False: # If subprocess fails, model_dir may not be created and we need to write the log files in model_dir + os.makedirs(model_outdir_job_id, exist_ok=True) + with open(result_file_name_stdout, 'w') as file: + file.write(subprocess_res.stdout) + + # Load val_scores and get val_loss + f = open(model_outdir_job_id / "val_scores.json") + val_scores = json.load(f) + objective = -val_scores[params['val_loss']] + + # Checkpoint the model weights + with open(f"{params['output_dir']}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + search = CBO( + problem, + evaluator, + log_dir=params['output_dir'], + verbose=1, + ) + results = search.search(max_evals=params['max_evals']) + results = results.sort_values(f"m:{params['val_loss']}", ascending=True) + results.to_csv(f"{params['output_dir']}/hpo_results.csv", index=False) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("Finished deephyper HPO.") diff --git a/hpo_deephyper_subprocess_train.sh b/hpo_deephyper_subprocess_train.sh new file mode 100644 index 0000000..c3f8984 --- /dev/null +++ b/hpo_deephyper_subprocess_train.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# 
https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +CONDA_ENV=$1 +echo "Activated conda commands in shell script" +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $CONDA_ENV +echo "Activated conda env $CONDA_ENV" + +# get mandatory arguments +SCRIPT=$2 +input_dir=$3 +output_dir=$4 +epochs=$5 +CUDA_VISIBLE_DEVICES=$6 + +command="python $SCRIPT --input_dir $input_dir --output_dir $output_dir --epochs $epochs " + + +# append hyperparameter arguments to python call +for i in $(seq 7 $#) +do + if [ $(($i % 2)) == 0 ]; then + command="${command} ${!i}" + else + command="${command} --${!i}" + fi +done + + +echo "command: $command" + +# run python script +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} $command + + +source $conda_path/bin/deactivate +echo "Deactivated conda env $CONDA_ENV" diff --git a/hpo_scale.sh b/hpo_scale.sh new file mode 100644 index 0000000..93d517f --- /dev/null +++ b/hpo_scale.sh @@ -0,0 +1,58 @@ +#!/bin/bash +#PBS -l select=2:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:60:00 +#PBS -q debug +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +# Assume conda is installed +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
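# Worked example (illustrative, assuming ranks are assigned to hosts in list order): with
# the 2-node debug allocation requested above, NNODES=2 and NRANKS_PER_NODE=4, so
# NTOTRANKS = 2*4 + 1 = 9. get_hosts_polaris.py expands a nodefile listing nodeA and nodeB
# into "nodeA,nodeA,nodeA,nodeA,nodeA,nodeB,nodeB,nodeB,nodeB", i.e. the first node appears
# one extra time so the additional rank (presumably the DeepHyper search process) lands on
# nodeA alongside its 4 workers, while every other node runs 4 workers.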
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall \ + ./set_affinity_gpu_polaris.sh python hpo_subprocess.py \ No newline at end of file diff --git a/hpo_scale_singularity_debug.sh b/hpo_scale_singularity_debug.sh new file mode 100644 index 0000000..f8aa1b6 --- /dev/null +++ b/hpo_scale_singularity_debug.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=2:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:60:00 +#PBS -q debug +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_scale_singularity_debug_scaling.sh b/hpo_scale_singularity_debug_scaling.sh new file mode 100644 index 0000000..722822f --- /dev/null +++ b/hpo_scale_singularity_debug_scaling.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=4:ncpus=64:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:30:00 +#PBS -q debug-scaling +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) 
+# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_scale_singularity_prod.sh b/hpo_scale_singularity_prod.sh new file mode 100644 index 0000000..99fd639 --- /dev/null +++ b/hpo_scale_singularity_prod.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PBS -l select=10:ncpus=64:ngpus=4:system=polaris +#PBS -l place=scatter +#PBS -l walltime=00:10:00 +#PBS -q prod +#PBS -A IMPROVE +#PBS -l filesystems=home:eagle +#PBS -N dh_hpo_scale_test +#PBS -M liu.yuanhang@mayo.edu + +set -xe + +# Move to the directory where `qsub example-improve.sh` was run +cd ${PBS_O_WORKDIR} + +# source enviroemnt variabels for IMPROVE +source $IMPROVE_env + +# Activate the current environement (module load, conda activate etc...) +module load PrgEnv-gnu +module use /soft/modulefiles +module load conda +# activate base environment +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate base +#source $conda_path/bin/activate $dh_env +# load module to run singularity container +module use /soft/spack/gcc/0.6.1/install/modulefiles/Core +module load apptainer + +# Resource allocation for DeepHyper +export NDEPTH=16 +export NRANKS_PER_NODE=4 +export NNODES=`wc -l < $PBS_NODEFILE` +export NTOTRANKS=$(( $NNODES * $NRANKS_PER_NODE + 1)) +export OMP_NUM_THREADS=$NDEPTH + +echo NNODES: ${NNODES} +echo NTOTRANKS: ${NTOTRANKS} +echo OMP_NUM_THREADS: ${OMP_NUM_THREADS} + +# GPU profiling, (quite ad-hoc, copy-paste the `profile_gpu_polaris.sh`, requires to install some small +# python package which queries nvidia-smi, you need a simple parser then to collect data.) +# UNCOMMENT IF USEFULL +# export GPUSTAT_LOG_DIR=$PBS_O_WORKDIR/$log_dir +# mpiexec -n ${NNODES} --ppn 1 --depth=1 --cpu-bind depth --envall ../profile_gpu_polaris.sh & + +# Get list of process ids (basically node names) +echo $PBS_NODEFILE +export RANKS_HOSTS=$(python ./get_hosts_polaris.py $PBS_NODEFILE) + +echo RANKS_HOSTS: ${RANKS_HOSTS} +echo PMI_LOCAL_RANK: ${PMI_LOCAL_RANK} + +# Launch DeepHyper +# ensure that mpi is pointing to the one within deephyper conda environment +# set_affinity_gpu_polaris.sh does not seem to work right now +# but CUDA_VISIBLE_DEVICES was set within hpo_subprocess.py, +mpiexec -n ${NTOTRANKS} -host ${RANKS_HOSTS} \ + --envall ./set_affinity_gpu_polaris.sh python hpo_subprocess_singularity.py \ No newline at end of file diff --git a/hpo_subprocess_singularity.py b/hpo_subprocess_singularity.py new file mode 100644 index 0000000..abf7a9a --- /dev/null +++ b/hpo_subprocess_singularity.py @@ -0,0 +1,160 @@ +""" +Before running this script, first need to preprocess the data. 
+ +It is assumed that the csa benchmark data is downloaded via download_csa.sh +and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: +export IMPROVE_DATA_DIR="./csa_data/" +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +mpirun -np 10 python hpo_subprocess.py +""" +# import copy +import json +import subprocess +import pandas as pd +import os +import logging +import mpi4py +from mpi4py import MPI +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +import socket + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() +local_rank = os.environ["PMI_LOCAL_RANK"] + +# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh +# uncomment the below commands if running via interactive node +#num_gpus_per_node = 3 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +problem.add_hyperparameter((8, 512, "log-uniform"), "batch_size", default_value=8) +problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), + "learning_rate", default_value=0.0004) +problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.1) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + +# --------------------- +# Some IMPROVE settings +# --------------------- +source = "GDSCv1" +split = 0 +train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +model_outdir = f"dh_hpo_improve/{source}/split_{split}" +log_dir = "dh_hpo_logs/" +subprocess_bashscript = "subprocess_train_singularity.sh" + + +@profile +def run(job, optuna_trial=None): + + # config = copy.deepcopy(job.parameters) + # params = { + # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, + # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, + # "verbose": False, + # } + # if len(config) > 0: + # remap_hyperparameters(config) + # params.update(config) + + model_outdir_job_id = model_outdir + f"/{job.id}" + learning_rate = job.parameters["learning_rate"] + batch_size = job.parameters["batch_size"] + dropout = job.parameters["dropout"] + # val_scores = main_train_grapdrp([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir_job_id), + # ]) + subprocess_res = subprocess.run( + [ + "bash", subprocess_bashscript, + str(train_ml_data_dir), + str(val_ml_data_dir), + str(model_outdir_job_id), + str(learning_rate), + str(batch_size), + str(dropout), + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ], + capture_output=True, text=True, check=True + ) + + # print(subprocess_res.stdout) + # print(subprocess_res.stderr) + + # Load val_scores and get val_loss + # f = open(model_outdir + "/val_scores.json") + f = open(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir_job_id, 
"val_scores.json")) + val_scores = json.load(f) + objective = -val_scores["val_loss"] + # print("objective:", objective) + + # Checkpoint the model weights + with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + + search = CBO( + problem, + evaluator, + log_dir=log_dir, + verbose=1, + ) + + # max_evals = 2 + # max_evals = 4 + # max_evals = 10 + # max_evals = 20 + max_evals = 100 + # max_evals = 100 + results = search.search(max_evals=max_evals) + results = results.sort_values("m:val_loss", ascending=True) + results.to_csv(os.path.join(os.environ["IMPROVE_DATA_DIR"], model_outdir, "hpo_results.csv"), index=False) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("Finished deephyper HPO.") diff --git a/hyperparameters_default.json b/hyperparameters_default.json new file mode 100644 index 0000000..a0ca25d --- /dev/null +++ b/hyperparameters_default.json @@ -0,0 +1,82 @@ +{ + "graphdrp": { + "CCLE": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "CTRPv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "gCSI": { + "batch_size": 256, + "learning_rate": 0.0001 + } + }, + "HiDRA": { + "CCLE": {}, + "CTRPv2": {}, + "GDSCv1": {}, + "GDSCv2": {}, + "gCSI": {} + }, + "IGTD": { + "CCLE": {}, + "CTRPv2": {}, + "GDSCv1": {}, + "GDSCv2": {}, + "gCSI": {} + }, + "Paccmann_MCA": { + "CCLE": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "CTRPv2": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "GDSCv2": { + "batch_size": 32, + "learning_rate": 0.0001 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.0001 + } + }, + "PathDSP": { + "CCLE": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "CTRPv2": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "GDSCv1": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "GDSCv2": { + "batch_size": 12, + "learning_rate": 0.0004 + }, + "gCSI": { + "batch_size": 12, + "learning_rate": 0.0004 + } + } +} \ No newline at end of file diff --git a/hyperparameters_hpo.json b/hyperparameters_hpo.json new file mode 100644 index 0000000..f798500 --- /dev/null +++ b/hyperparameters_hpo.json @@ -0,0 +1,101 @@ +{ + "graphdrp": { + "CCLE": { + "batch_size": 64, + "learning_rate": 0.0012233565752066463 + }, + "CTRPv2": { + "batch_size": 256, + "learning_rate": 0.0001 + }, + "GDSCv1": { + "batch_size": 1024, + "learning_rate": 0.06408519376992045 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.0005245278010578158 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.00015145175483629418 + } + }, + + "HiDRA": { + "CCLE": { + "batch_size": 16, + "learning_rate": 0.0011825165353494742 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 64, + "learning_rate": 0.0033642852862458077 + }, + "GDSCv2": {}, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.00685103425830459 + } + }, + "IGTD": { + "CCLE": { + "batch_size": 16, + "learning_rate": 0.00044116909746320135 + }, + "CTRPv2": { + "batch_size": 512, + 
"learning_rate": 0.0005219211138261751 + }, + "GDSCv1": { + "batch_size": 32, + "learning_rate": 0.0006637139133709175 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.0005058417379141607 + }, + "gCSI": { + "batch_size": 32, + "learning_rate": 0.0008434053110179821 + } + }, + "Paccmann_MCA": { + "CCLE": { + "batch_size": 512, + "learning_rate": 0.00019306380321264862 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 128, + "learning_rate": 2.18431820219191e-05 + }, + "GDSCv2": { + "batch_size": 128, + "learning_rate": 0.00010468988102446662 + }, + "gCSI": { + "batch_size": 256, + "learning_rate": 7.092892399034623e-05 + } + }, + "PathDSP": { + "CCLE": { + "batch_size": 64, + "learning_rate": 9.043143432931976e-05 + }, + "CTRPv2": {}, + "GDSCv1": { + "batch_size": 128, + "learning_rate": 7.565669627288258e-05 + }, + "GDSCv2": { + "batch_size": 16, + "learning_rate": 6.916681700876223e-05 + }, + "gCSI": { + "batch_size": 128, + "learning_rate": 0.00024319069525915603 + } + } +} \ No newline at end of file diff --git a/infer.sh b/infer.sh new file mode 100755 index 0000000..f1dcd94 --- /dev/null +++ b/infer.sh @@ -0,0 +1,7 @@ +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_infer_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/install_polaris.sh b/install_polaris.sh new file mode 100644 index 0000000..46695e8 --- /dev/null +++ b/install_polaris.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# install script is not needed as it's troublesome to install mpi4py libraray +# use conda base environment directly instead + +# From Romain Egele (this script is called install.sh) + +# Generic installation script for DeepHyper on ALCF's Polaris. +# This script is meant to be run on the login node of the machine. +# It will install DeepHyper and its dependencies in the current directory. +# A good practice is to create a `build` folder and launch the script from there, +# e.g. from the root of the DeepHyper repository: +# $ mkdir build && cd build && ../install/alcf/polaris.sh +# The script will also create a file named `activate-dhenv.sh` that will +# Setup the environment each time it is sourced `source activate-dhenv.sh`. 
+ +set -xe + +# Load modules available on the current system +module load PrgEnv-gnu +# conda is not avilable +# need to install conda locally +#module load conda/2023-10-04 + +# Copy the base conda environment +conda create -p dhenv python=3.9 pip -y +conda activate dhenv/ +pip install --upgrade pip + +# For mpi4py +#module swap PrgEnv-nvhpc PrgEnv-gnu +module load nvhpc-mixed +git clone https://github.com/mpi4py/mpi4py.git +cd mpi4py/ +MPICC=CC python setup.py install +cd ../ +#conda install mpi4py --yes + +# Install the DeepHyper's Python package +git clone -b develop git@github.com:deephyper/deephyper.git +pip install -e "deephyper/[hps,mpi]" + +# Create activation script +touch activate-dhenv.sh +echo "#!/bin/bash" >> activate-dhenv.sh + +# Append modules loading and conda activation +echo "" >> activate-dhenv.sh +echo "module load PrgEnv-gnu" >> activate-dhenv.sh +#echo "module load conda/2023-10-04" >> activate-dhenv.sh +conda_path=$(dirname $(dirname $(which conda))) +echo "source $conda_path/bin/activate $PWD/dhenv/" >> activate-dhenv.sh diff --git a/model_params_def.py b/model_params_def.py new file mode 100644 index 0000000..a9680db --- /dev/null +++ b/model_params_def.py @@ -0,0 +1,68 @@ +pathdsp_preprocess_params = [ + {"name": "bit_int", + "type": int, + "default": 128, + "help": "Number of bits for morgan fingerprints.", + }, + {"name": "permutation_int", + "type": int, + "default": 3, + "help": "Number of permutation for calculating enrichment scores.", + }, + {"name": "seed_int", + "type": int, + "default": 42, + "help": "Random seed for random walk algorithm.", + }, + {"name": "cpu_int", + "type": int, + "default": 20, + "help": "Number of cpus to use when calculating pathway enrichment scores.", + }, + {"name": "drug_bits_file", + "type": str, + "default": "drug_mbit_df.txt", + "help": "File name to save the drug bits file.", + }, + {"name": "dgnet_file", + "type": str, + "default": "DGnet.txt", + "help": "File name to save the drug target net file.", + }, + {"name": "mutnet_file", + "type": str, + "default": "MUTnet.txt", + "help": "File name to save the mutation net file.", + }, + {"name": "cnvnet_file", + "type": str, + "default": "CNVnet.txt", + "help": "File name to save the CNV net file.", + }, + {"name": "exp_file", + "type": str, + "default": "EXPnet.txt", + "help": "File name to save the EXP net file.", + }, +] + +pathdsp_train_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1." + }, + {"name": "dropout", + "type": float, + "default": 0.1, + "help": "Dropout rate for the optimizer." + }, +] + +pathdsp_infer_params = [ + {"name": "cuda_name", # TODO. frm. How should we control this? + "action": "store", + "type": str, + "help": "Cuda device (e.g.: cuda:0, cuda:1." 
+ }, +] \ No newline at end of file diff --git a/PathDSP/FNN.py b/model_utils/FNN.py similarity index 99% rename from PathDSP/FNN.py rename to model_utils/FNN.py index a05247b..01c9acc 100644 --- a/PathDSP/FNN.py +++ b/model_utils/FNN.py @@ -30,7 +30,7 @@ import myDataloader as mydl import myDatasplit as mysplit import myUtility as myutil -import myPlotter as myplot +#import myPlotter as myplot import myMetrics as mymts import shap as sp diff --git a/model_utils/FNN_new.py b/model_utils/FNN_new.py new file mode 100644 index 0000000..0f84e5d --- /dev/null +++ b/model_utils/FNN_new.py @@ -0,0 +1,299 @@ +""" +Train a neural network for regression task: + cv: 10 + batch size: 8 + initializer: He normal initializer + optimizer: AdamMax + learning rate: 0.0004 + loss: RMSE + +Calculate RMSE at once, Oct. 3, 2020 revised +""" + +import os +import argparse +import numpy as np +import pandas as pd +import scipy.stats as scistat +from datetime import datetime + +import sklearn.preprocessing as skpre +import sklearn.model_selection as skms +import sklearn.metrics as skmts +import sklearn.utils as skut + +import torch as tch +import torch.utils.data as tchud + +import myModel as mynet +import myFit as myfit +import myDataloader as mydl +import myDatasplit as mysplit +import myUtility as myutil +#import myPlotter as myplot +import myMetrics as mymts +import polars as pl + +#import shap as sp + +class RMSELoss(tch.nn.Module): + def __init__(self): + super(RMSELoss,self).__init__() + + def forward(self,x,y): + eps = 1e-6 + criterion = tch.nn.MSELoss() + loss = tch.sqrt(criterion(x, y) + eps) + return loss + + +def r2_score(y_true, y_pred): + y_mean = np.mean(y_true) + ss_tot = np.sum((y_true - y_mean)**2) + ss_res = np.sum((y_true - y_pred)**2) + r2 = 1 - ss_res / ss_tot + return r2 + +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + + +def fit(net, train_dl, valid_dl, epochs, learning_rate, device, opt_fn): + """ + Return train and valid performance including loss + + :param net: model + :param train_dl: train dataloader + :param valid_dl: valid dataloader + :param epochs: integer representing EPOCH + :param learning_rate: float representing LEARNING_RATE + :param device: string representing cpu or cuda:0 + :param opt_fn: optimization function in torch (e.g., tch.optim.Adam) + :param loss_fn: loss function in torch (e.g., tch.nn.MSELoss) + """ + # setup + criterion = RMSELoss() # setup LOSS function + optimizer = opt_fn(net.parameters(), lr=learning_rate, weight_decay=1e-5) # setup optimizer + net = net.to(device) # load the network onto the device + trainloss_list = [] # metrics: MSE, size equals to EPOCH + validloss_list = [] # metrics: MSE, size equals to EPOCH + validr2_list = [] # metrics: r2, size equals to EPOCH + early_stopping = myutil.EarlyStopping(patience=30, verbose=True, path= os.environ.get("CANDLE_DATA_DIR") + "/Data/checkpoint.pt") # initialize the early_stopping + # repeat the training for EPOCH times + start_total = datetime.now() + for epoch in range(epochs): + ## training phase + start = datetime.now() + net.train() + # initial loss + train_epoch_loss = 0.0 # save loss for each epoch, batch by batch + for i, (X_train, y_train) in enumerate(train_dl): + X_train, y_train = X_train.to(device), y_train.to(device) # load data onto the device + y_train_pred = net(X_train) # 
train result + train_loss = criterion(y_train_pred, y_train.float()) # calculate loss + optimizer.zero_grad() # clear gradients + train_loss.backward() # backpropagation + #### add this if you have gradient explosion problem ### + clip_value = 5 + tch.nn.utils.clip_grad_value_(net.parameters(), clip_value) + ########climp gradient within -5 ~ 5 ################### + optimizer.step() # update weights + train_epoch_loss += train_loss.item() # adding loss from each batch + # calculate total loss of all batches + avg_train_loss = train_epoch_loss / len(train_dl) + trainloss_list.append( avg_train_loss ) + print('epoch ' + str(epoch) + ' :[Finished in {:}]'.format(cal_time(datetime.now(), start))) + ## validation phase + with tch.no_grad(): + net.eval() + valid_epoch_loss = 0.0 # save loss for each epoch, batch by batch + ss_res = 0.0 + ss_tot = 0.0 + for i, (X_valid, y_valid) in enumerate(valid_dl): + X_valid, y_valid = X_valid.to(device), y_valid.to(device) # load data onto the device + y_valid_pred = net(X_valid) # valid result + valid_loss = criterion(y_valid_pred, y_valid.float())#y_valid.unsqueeze(1)) # calculate loss + valid_epoch_loss += valid_loss.item() # adding loss from each batch + ss_res += tch.sum((y_valid_pred - y_valid.float())**2) + ss_tot += tch.sum((y_valid_pred - y_valid.mean())**2) + + # calculate total loss of all batches, and append to result list + avg_valid_loss = valid_epoch_loss / len(valid_dl) + validloss_list.append( avg_valid_loss) + valid_r2 = 1 - ss_res / ss_tot + validr2_list.append(valid_r2.cpu().numpy()) + # display print message + #print('epoch={:}/{:}, train loss={:.5f}, valid loss={:.5f}'.format( + # epoch+1, epochs, train_epoch_loss / len(train_dl), + # valid_epoch_loss / len(valid_dl))) + + # early_stopping needs the validation loss to check if it has decresed, + # and if it has, it will make a checkpoint of the current model + early_stopping(avg_valid_loss, net) + + if early_stopping.early_stop: + print("Early stopping") + break + + print('Total time (all epochs) :[Finished in {:}]'.format(cal_time(datetime.now(), start_total))) + # load the last checkpoint with the best model + net.load_state_dict(tch.load(os.environ.get('CANDLE_DATA_DIR') + '/Data/checkpoint.pt')) + + return net, trainloss_list, validloss_list, validr2_list + +def predict(net, test_dl, device): + """ + Return prediction list + + :param net: model + :param train_dl: train dataloader + :param device: string representing cpu or cuda:0 + """ + # create result lists + prediction_list = list() + + with tch.no_grad(): + net = net.to(device) # load the network onto the device + net.eval() + for i, (X_test, y_test) in enumerate(test_dl): + X_test, y_test = X_test.to(device), y_test.to(device) # load data onto the device + y_test_pred = net(X_test) # test result + # bring data back to cpu in np.array format, and append to result lists + prediction_list.append( y_test_pred.cpu().numpy() ) + #print(prediction_list) + + # merge all batches + prediction_list = np.vstack(prediction_list) + prediction_list = np.hstack(prediction_list).tolist() + # return + return prediction_list + + +def main(params): + start_time = datetime.now() + # load data + print('loadinig data') + # train_df = pd.read_csv(params['train_data'], header=0, index_col=[0,1], sep="\t") + # val_df = pd.read_csv(params['val_data'], header=0, index_col=[0,1], sep="\t") + # test_df = pd.read_csv(params['test_data'], header=0, index_col=[0,1], sep="\t") + train_df = pl.read_csv(params['train_data'], separator = "\t").to_pandas() + val_df 
= pl.read_csv(params['val_data'], separator = "\t").to_pandas() + + # shuffle + #sdf = skut.shuffle(df, random_state=params["seed_int"]) + + # set parameters + myutil.set_seed(params["seed_int"]) + device = myutil.get_device(uth=params["gpu_int"]) + #kFold = params["cv_int"] + learning_rate = params['learning_rate'] + epoch = params['epochs'] + batch_size = params['batch_size'] + opt_fn = tch.optim.Adam + + # create result list + # loss_df_list = [] + # score_df_list = [] + # ytest_df_list = [] + # shap_df_list = [] + # # train with cross-validation + #kf = skms.KFold(n_splits=kFold, random_state=params['seed_int'], shuffle=True) + #X_df = train_df.iloc[:, 0:-1] + #y_df = train_df.iloc[:, -1] + # save best model with lowest RMSE +# best_rmse = 10000 +# best_model = None +# best_fold = 0 +# # for i, (train_index, test_index) in enumerate(kf.split(X_df, y_df)): + #n_fold = i+1 + #print('Fold={:}/{:}'.format(n_fold, params['cv_int'])) + # get train/test splits + Xtrain_arr = train_df.iloc[:, 0:-1].values + Xvalid_arr = val_df.iloc[:, 0:-1].values + ytrain_arr = train_df.iloc[:, -1].values + yvalid_arr = val_df.iloc[:, -1].values + + # get train/valid splits from train + #Xtrain_arr, Xvalid_arr, ytrain_arr, yvalid_arr = skms.train_test_split(Xtrain_arr, ytrain_arr, + # test_size=0.1, random_state=params['seed_int']) + #print(' train={:}, valid={:}, test={:}'.format(Xtrain_arr.shape, Xvalid_arr.shape, Xtest_arr.shape)) + # prepare dataframe for output + #ytest_df = test_df.iloc[:, -1].to_frame() + # convert to numpy array + Xtrain_arr = np.array(Xtrain_arr).astype('float32') + Xvalid_arr = np.array(Xvalid_arr).astype('float32') + ytrain_arr = np.array(ytrain_arr).astype('float32') + yvalid_arr = np.array(yvalid_arr).astype('float32') + # create mini-batch + train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + valid_dataset = mydl.NumpyDataset(tch.from_numpy(Xvalid_arr), tch.from_numpy(yvalid_arr)) + train_dl = tchud.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + valid_dl = tchud.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) + # initial weight + def init_weights(m): + if type(m) == tch.nn.Linear: + tch.nn.init.kaiming_uniform_(m.weight) + m.bias.data.fill_(0.01) + # load model + n_features = Xtrain_arr.shape[1] + net = mynet.FNN(n_features) + net.apply(init_weights) + # fit data with model + print('start training process') + trained_net, train_loss_list, valid_loss_list, valid_r2_list = fit(net, train_dl, valid_dl, epoch, learning_rate, device, opt_fn) + # if rmse <= best_rmse: + # best_rmse = rmse + # best_fold = n_fold + # best_model = trained_net + # print('best model so far at fold={:}, rmse={:}'.format(best_fold, best_rmse)) + + + # if params['shap_bool'] == True: + # print('calculate shapely values') + # # random select 100 samples as baseline + # train_dataset = mydl.NumpyDataset(tch.from_numpy(Xtrain_arr), tch.from_numpy(ytrain_arr)) + # train_dl = tchud.DataLoader(train_dataset, batch_size=200, shuffle=True) + # background, lbl = next(iter(train_dl)) + # explainer = sp.DeepExplainer(trained_net, background[:100].to(device)) + # shap_arr = explainer.shap_values(tch.from_numpy(Xtest_arr)) + # shap_df = pd.DataFrame(shap_arr, index=ytest_df.index, columns=X_df.columns) + # # append to result + # shap_df_list.append(shap_df) + + # collect result + loss_df = pd.DataFrame({'epoch':[i+1 for i in range(len(train_loss_list))], + 'train loss':train_loss_list, + 'valid loss': valid_loss_list, + 'valid r2': valid_r2_list}) + 
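+    # loss_df collects per-epoch train/valid RMSE and valid R2; it is written to params['data_dir'] + '/Loss.txt' below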
+ #loss_df_list.append(loss_df) + #ytest_df_list.append(ytest_df) + # end of fold + #trained_net = None + + # save to output + #all_ytest_df = pd.concat(ytest_df_list, axis=0) + #all_loss_df = pd.concat(loss_df_list, axis=0) + loss_df.to_csv(params['data_dir'] + '/Loss.txt', header=True, index=False, sep="\t") + # if params['shap_bool'] == True: + # all_shap_df = pd.concat(shap_df_list, axis=0) + # all_shap_df.to_csv(params['output'] + '.FNN.cv_' + str(params['cv_int']) + '.SHAP.txt', header=True, index=True, sep="\t") + + # make train/valid loss plots + best_model = trained_net + tch.save(best_model.state_dict(), params['data_dir'] + '/model.pt') + print( '[Finished in {:}]'.format(myutil.cal_time(datetime.now(), start_time)) ) + # display evaluation metrics of all folds + #mse, rmse, r_square, pccy = mymts.eval_regressor_performance(all_ytest_df, 'response', 'prediction') + + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model_utils/infer.sh b/model_utils/infer.sh new file mode 100644 index 0000000..aa14ebe --- /dev/null +++ b/model_utils/infer.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# arg 1 CUDA_VISIBLE_DEVICES +# arg 2 CANDLE_DATA_DIR +# arg 3 CANDLE_CONFIG + +### Path to your CANDLEized model's main Python script### +CANDLE_MODEL=infer.py + +### Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +CANDLE_MODEL=${IMPROVE_MODEL_DIR}/${CANDLE_MODEL} +if [ ! -f ${CANDLE_MODEL} ] ; then + echo No such file ${CANDLE_MODEL} + exit 404 +fi + +if [ $# -lt 2 ]; then + echo "Illegal number of parameters" + echo "CUDA_VISIBLE_DEVICES and CANDLE_DATA_DIR are required" + exit +fi + +if [ $# -eq 2 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + CMD="python ${CANDLE_MODEL}" + echo "CMD = $CMD" + +elif [ $# -ge 3 ] ; then + CUDA_VISIBLE_DEVICES=$1 ; shift + CANDLE_DATA_DIR=$1 ; shift + + # if original $3 is a file, set candle_config and passthrough $@ + if [ -f $CANDLE_DATA_DIR/$1 ] ; then + echo "$CANDLE_DATA_DIR/$1 is a file" + CANDLE_CONFIG=$1 ; shift + CMD="python ${CANDLE_MODEL} --config_file $CANDLE_CONFIG $@" + echo "CMD = $CMD $@" + + # else passthrough $@ + else + echo "$1 is not a file" + CMD="python ${CANDLE_MODEL} $@" + echo "CMD = $CMD" + + fi +fi + +if [ -d ${CANDLE_DATA_DIR} ]; then + if [ "$(ls -A ${CANDLE_DATA_DIR})" ] ; then + echo "using data from ${CANDLE_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${CANDLE_DATA_DIR}" + fi +fi + +export CANDLE_DATA_DIR=${CANDLE_DATA_DIR} +FULL_DATA_DIR="$CANDLE_DATA_DIR/$MODEL_NAME/Data" +echo $FULL_DATA_DIR + +if [ -d ${FULL_DATA_DIR} ]; then + if [ "$(ls -A ${FULL_DATA_DIR})" ] ; then + echo "using data from ${FULL_DATA_DIR}" + else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" + fi +else + ./candle_glue.sh + echo "using original data placed in ${FULL_DATA_DIR}" +fi + +# Display runtime arguments +echo "using CUDA_VISIBLE_DEVICES ${CUDA_VISIBLE_DEVICES}" +echo "using CANDLE_DATA_DIR ${CANDLE_DATA_DIR}" +echo "using CANDLE_CONFIG ${CANDLE_CONFIG}" + +# Set up environmental variables and execute model +echo "activating environment" +#source /opt/conda/etc/profile.d/conda.sh +source activate /usr/local/conda_envs/PathDSP_env +echo "running command ${CMD}" +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} CANDLE_DATA_DIR=${CANDLE_DATA_DIR} $CMD diff --git a/PathDSP/leaveOneGroupOut_FNN.py b/model_utils/leaveOneGroupOut_FNN.py similarity index 100% rename from 
PathDSP/leaveOneGroupOut_FNN.py rename to model_utils/leaveOneGroupOut_FNN.py diff --git a/PathDSP/load_pretrained.py b/model_utils/load_pretrained.py similarity index 100% rename from PathDSP/load_pretrained.py rename to model_utils/load_pretrained.py diff --git a/PathDSP/myDataloader.py b/model_utils/myDataloader.py similarity index 93% rename from PathDSP/myDataloader.py rename to model_utils/myDataloader.py index e9a4dbf..77eb482 100644 --- a/PathDSP/myDataloader.py +++ b/model_utils/myDataloader.py @@ -9,7 +9,7 @@ import torch.utils.data as tchud import sklearn.model_selection as skms import sklearn.preprocessing as skpre -import myDatasplit as mysplit +import model_utils.myDatasplit as mysplit class NumpyDataset(tchud.Dataset): """ diff --git a/PathDSP/myDatasplit.py b/model_utils/myDatasplit.py similarity index 100% rename from PathDSP/myDatasplit.py rename to model_utils/myDatasplit.py diff --git a/PathDSP/myFit.py b/model_utils/myFit.py similarity index 100% rename from PathDSP/myFit.py rename to model_utils/myFit.py diff --git a/PathDSP/myMetrics.py b/model_utils/myMetrics.py similarity index 100% rename from PathDSP/myMetrics.py rename to model_utils/myMetrics.py diff --git a/PathDSP/myModel.py b/model_utils/myModel.py similarity index 100% rename from PathDSP/myModel.py rename to model_utils/myModel.py diff --git a/PathDSP/myUtility.py b/model_utils/myUtility.py similarity index 100% rename from PathDSP/myUtility.py rename to model_utils/myUtility.py diff --git a/PathDSP/nestedCV.py b/model_utils/nestedCV.py similarity index 100% rename from PathDSP/nestedCV.py rename to model_utils/nestedCV.py diff --git a/old_deephyper/README_deephyper.md b/old_deephyper/README_deephyper.md new file mode 100644 index 0000000..725ce02 --- /dev/null +++ b/old_deephyper/README_deephyper.md @@ -0,0 +1,83 @@ +# Run HPO using deephyper on Polaris + +## Install conda environment for the curated model (PathDSP) + +``` +## install PathDSP +git clone -b deephyper https://github.com/Liuy12/PathDSP.git +## install IMPROVE +git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git +## define where to install PathDSP env +export PathDSP_env=./PathDSP_env/ +conda env create -f ./PathDSP/environment_082223.yml -p $PathDSP_env +conda activate ${PathDSP_env} +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +## Download csa benchmark data + +``` +wget --cut-dirs=7 -P ./ -nH -np -m ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +## Download additional author data (PathDSP only) + +``` +mkdir author_data +bash ./PathDSP/download_author_data.sh author_data/ +``` + +## Define environment variables + +``` +### if necessary, request an interactive node from polaris to testing purposes +### qsub -A IMPROVE -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug +### NEED to cd into your working directory again once the job started +improve_lib="$PWD/IMPROVE/" +pathdsp_lib="$PWD/PathDSP/" +# notice the extra PathDSP folder after pathdsp_lib +echo "export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/" >> IMPROVE_env +# IMPROVE_DATA_DIR +echo "export IMPROVE_DATA_DIR=$PWD/csa_data/" >> IMPROVE_env +# AUTHOR_DATA_DIR required for PathDSP +echo "export AUTHOR_DATA_DIR=$PWD/author_data/" >> IMPROVE_env +# PathDSP_env: conda environment path for the model +echo "export PathDSP_env=$PathDSP_env" >> IMPROVE_env +source $PWD/IMPROVE_env +``` + +## Perform preprocessing + +``` +conda activate $PathDSP_env 
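+## optional: confirm the variables defined in IMPROVE_env are set before preprocessing
+## echo $IMPROVE_DATA_DIR $AUTHOR_DATA_DIR $PathDSP_env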
+## You can copy the processed files for PathDSP +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ ./PathDSP/ +## Alternatively, run the preprocess script +## This script taks around 40 mins to complete +## python PathDSP/PathDSP_preprocess_improve.py --ml_data_outdir=./PathDSP/ml_data/GDSCv1-GDSCv1/split_4/ +``` + +## Perform HPO using singularity container across two nodes + +``` +## copy processed to IMPROVE_DATA_DIR +cp -r /lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP/ml_data/ $IMPROVE_DATA_DIR +## specify singularity image file for PathDSP +echo "export PathDSP_sif=/lus/eagle/projects/IMPROVE_Aim1/yuanhangl_alcf/PathDSP.sif" >> IMPROVE_env +cd PathDSP +## submit to debug queue +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale_singularity_debug.sh +## to submit to debug-scaling or prod queue +## use hpo_scale_singularity_debug_scaling.sh +## or hpo_scale_singularity_prod.sh +## for interative node, run: mpirun -np 10 python hpo_subprocess_singularity.py +``` + +## Alternatively, perform HPO across two nodes based on conda + +``` +cd PathDSP +# supply environment variables to qsub +qsub -v IMPROVE_env=../IMPROVE_env ./hpo_scale.sh +## for interactive node, you can run: mpirun -np 10 python hpo_subprocess.py +``` \ No newline at end of file diff --git a/old_deephyper/hpo_subprocess.py b/old_deephyper/hpo_subprocess.py new file mode 100644 index 0000000..b925ec1 --- /dev/null +++ b/old_deephyper/hpo_subprocess.py @@ -0,0 +1,165 @@ +""" +Before running this script, first need to preprocess the data. +This can be done by running preprocess_example.sh + +It is assumed that the csa benchmark data is downloaded via download_csa.sh +and the env vars $IMPROVE_DATA_DIR and $PYTHONPATH are set: +export IMPROVE_DATA_DIR="./csa_data/" +export PYTHONPATH=$PYTHONPATH:/path/to/IMPROVE_lib + +It also assumes that your processed training data is at: "ml_data/{source}-{source}/split_{split}" +validation data is at: "ml_data/{source}-{source}/split_{split}" +model output files will be saved at "dh_hpo_improve/{source}/split_{split}" + +mpirun -np 10 python hpo_subprocess.py +""" +# import copy +import json +import subprocess +import pandas as pd +import os +import logging +import mpi4py +from deephyper.evaluator import Evaluator, profile +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO +from mpi4py import MPI +import socket + +# --------------------- +# Enable using multiple GPUs +# --------------------- + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" +mpi4py.rc.recv_mprobe = False + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() +local_rank = os.environ["PMI_LOCAL_RANK"] + +# CUDA_VISIBLE_DEVICES is now set via set_affinity_gpu_polaris.sh +# uncomment the below commands if running via interactive node +#num_gpus_per_node = 4 +#os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % num_gpus_per_node) +#cuda_name = "cuda:" + str(rank % num_gpus_per_node) + +# --------------------- +# Enable logging +# --------------------- + +logging.basicConfig( + # filename=f"deephyper.{rank}.log, # optional if we want to store the logs to disk + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s", + force=True, +) + +# --------------------- +# Hyperparameters +# --------------------- +problem = HpProblem() + +problem.add_hyperparameter((8, 512, 
"log-uniform"), "batch_size", default_value=64) +problem.add_hyperparameter((1e-6, 1e-2, "log-uniform"), + "learning_rate", default_value=0.001) +# problem.add_hyperparameter((0, 0.5), "dropout", default_value=0.0) +# problem.add_hyperparameter([True, False], "early_stopping", default_value=False) + +# --------------------- +# Some IMPROVE settings +# --------------------- +source = "GDSCv1" +split = 4 +train_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +val_ml_data_dir = f"ml_data/{source}-{source}/split_{split}" +model_outdir = f"dh_hpo_improve/{source}/split_{split}" +log_dir = "dh_hpo_logs/" +subprocess_bashscript = "subprocess_train.sh" + + +@profile +def run(job, optuna_trial=None): + + # config = copy.deepcopy(job.parameters) + # params = { + # "epochs": DEEPHYPER_BENCHMARK_MAX_EPOCHS, + # "timeout": DEEPHYPER_BENCHMARK_TIMEOUT, + # "verbose": False, + # } + # if len(config) > 0: + # remap_hyperparameters(config) + # params.update(config) + + model_outdir_job_id = model_outdir + f"/{job.id}" + learning_rate = job.parameters["learning_rate"] + batch_size = job.parameters["batch_size"] + # val_scores = main_train_grapdrp([ + # "--train_ml_data_dir", str(train_ml_data_dir), + # "--val_ml_data_dir", str(val_ml_data_dir), + # "--model_outdir", str(model_outdir_job_id), + # ]) + subprocess_res = subprocess.run( + [ + "bash", subprocess_bashscript, + str(train_ml_data_dir), + str(val_ml_data_dir), + str(model_outdir_job_id), + str(learning_rate), + str(batch_size), + #str(cuda_name) + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ], + capture_output=True, text=True, check=True + ) + + # print(subprocess_res.stdout) + # print(subprocess_res.stderr) + + # Load val_scores and get val_loss + # f = open(model_outdir + "/val_scores.json") + f = open(model_outdir_job_id + "/val_scores.json") + val_scores = json.load(f) + objective = -val_scores["val_loss"] + # print("objective:", objective) + + # Checkpoint the model weights + with open(f"{log_dir}/model_{job.id}.pkl", "w") as f: + f.write("model weights") + + # return score + return {"objective": objective, "metadata": val_scores} + + +if __name__ == "__main__": + with Evaluator.create( + run, method="mpicomm", method_kwargs={"callbacks": [TqdmCallback()]} + ) as evaluator: + + if evaluator is not None: + print(problem) + + search = CBO( + problem, + evaluator, + log_dir=log_dir, + verbose=1, + ) + + # max_evals = 2 + # max_evals = 4 + # max_evals = 10 + # max_evals = 20 + max_evals = 10 + # max_evals = 100 + results = search.search(max_evals=max_evals) + results = results.sort_values("m:val_loss", ascending=True) + results.to_csv(model_outdir + "/hpo_results.csv", index=False) + print("current node: ", socket.gethostname(), "; current rank: ", rank, "; local rank", local_rank, "; CUDA_VISIBLE_DEVICE is set to: ", os.environ["CUDA_VISIBLE_DEVICES"]) + print("Finished deephyper HPO.") diff --git a/old_deephyper/subprocess_train.sh b/old_deephyper/subprocess_train.sh new file mode 100755 index 0000000..a2b1541 --- /dev/null +++ b/old_deephyper/subprocess_train.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# 
https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +CONDA_ENV=$PathDSP_env +#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +#eval "$(conda shell.bash hook)" +echo "Activated conda commands in shell script" +#conda activate $CONDA_ENV +#source activate $CONDA_ENV +conda_path=$(dirname $(dirname $(which conda))) +source $conda_path/bin/activate $CONDA_ENV +#source /soft/datascience/conda/2023-10-04/mconda3/bin/activate $CONDA_ENV +#source activate $CONDA_ENV +echo "Activated conda env $CONDA_ENV" + +train_ml_data_dir=$1 +val_ml_data_dir=$2 +model_outdir=$3 +learning_rate=$4 +batch_size=$5 +#cuda_name=$6 +CUDA_VISIBLE_DEVICES=$6 + +echo "train_ml_data_dir: $train_ml_data_dir" +echo "val_ml_data_dir: $val_ml_data_dir" +echo "model_outdir: $model_outdir" +echo "learning_rate: $learning_rate" +echo "batch_size: $batch_size" +#echo "cuda_name: $cuda_name" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" + +# epochs=10 +epochs=10 +# epochs=50 + +# All train outputs are saved in params["model_outdir"] +#CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=5 +#CUDA_VISIBLE_DEVICES=6,7 +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python PathDSP_train_improve.py \ + --train_ml_data_dir $train_ml_data_dir \ + --val_ml_data_dir $val_ml_data_dir \ + --model_outdir $model_outdir \ + --epochs $epochs \ + --learning_rate $learning_rate \ + --batch_size $batch_size +# --cuda_name $cuda_name + +#conda deactivate +source $conda_path/bin/deactivate +echo "Deactivated conda env $CONDA_ENV" diff --git a/old_legacy_scripts/PathDSP_cs_model.txt b/old_legacy_scripts/PathDSP_cs_model.txt new file mode 100644 index 0000000..44a4791 --- /dev/null +++ b/old_legacy_scripts/PathDSP_cs_model.txt @@ -0,0 +1,46 @@ +[Global_Params] +model_name='PathDSP' + +[Preprocess] +train_split_file = "GDSCv1_split_0_train.txt" +val_split_file = "GDSCv1_split_0_val.txt" +test_split_file = "GDSCv1_split_0_test.txt" +ml_data_outdir = "./ml_data/GDSCv1-GDSCv1/split_0" +x_data_canc_files = [["cancer_gene_expression.tsv", ["Gene_Symbol"]], ["cancer_mutation_count.tsv",["Gene_Symbol"]], ["cancer_discretized_copy_number.tsv", ["Gene_Symbol"]]] +x_data_drug_files = [["drug_SMILES.tsv"]] +y_data_files = [["response.tsv"]] +data_format = ".txt" +drug_bits_file='drug_mbit_df.txt' +dgnet_file='DGnet.txt' +mutnet_file='MUTnet.txt' +cnvnet_file='CNVnet.txt' +exp_file='EXP.txt' +bit_int=128 +permutation_int=3 +seed_int=42 +cpu_int=20 + +[Train] +train_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +val_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_outdir = "./out_models/GDSCv1/split_0" +model_file_name = "model" +model_file_format = ".pt" +epochs=800 +batch_size = 32 +val_batch = 32 +loss = "mse" +early_stop_metric = "mse" +patience = 30 +cuda_name = "cuda:2" +learning_rate = 0.001 + +[Infer] +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_ml_data_dir = "./ml_data/GDSCv1-GDSCv1/split_0" +model_dir = "./out_models/GDSCv1/split_0" +infer_outdir = "./out_infer/GDSCv1-GDSCv1/split_0" +test_batch = 256 +cuda_name = "cuda:3" \ No newline at end of file diff --git a/old_legacy_scripts/README_old.md b/old_legacy_scripts/README_old.md new file mode 100644 index 0000000..84ea104 --- 
/dev/null +++ b/old_legacy_scripts/README_old.md @@ -0,0 +1,134 @@ +# PathDSP +Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores + +# Example usage with singularity container +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing step using processed data from original paper + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh 0 /candle_data_dir "-a 1" +``` + +Train the model + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${IMPROVE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +singularity exec --nv --pwd /usr/local/PathDSP/ --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir +``` + +Metrics regarding training process is located at: `${IMPROVE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${IMPROVE_DATA_DIR}/Data/Prediction.txt` + +# Example usage with Conda + +Download PathDSP + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +cd PathDSP +``` + +Create environment + +``` +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Intall CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Perform preprocessing step using processed data from original paper + +``` +export CUDA_VISIBLE_DEVICES=0 +export CANDLE_DATA_DIR=./Data/ +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 0" +``` + +Alternatively, perform preprocessing step using raw data from IMPROVE project + +``` +bash preprocess.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR "-a 1" +``` + +Train the model + +``` +bash train.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss.txt` +Final trained model is located at: `${CANDLE_DATA_DIR}/Data/model.pt` + +Perform inference on the testing data + +``` +bash infer.sh $CUDA_VISIBLE_DEVICES $CANDLE_DATA_DIR +``` + +Metrics regarding training process is located at: `${CANDLE_DATA_DIR}/Data/Loss_pred.txt` +Final prediction on testing data is located at: `${CANDLE_DATA_DIR}/Data/Prediction.txt` + +# Docs from original authors (below) + +# Requirments + +# Input format + +|drug|cell|feature_1|....|feature_n|drug_response| +|----|----|--------|----|--------|----| +|5-FU|03|0|....|0.02|-2.3| +|5-FU|23|1|....|0.04|-3.4| + +Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing +# Usage: +```python +# run FNN +python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix + +Where input.txt should be in the input format shown above. 
+Example input file can be found at https://zenodo.org/record/7532963 +``` +# Data preprocessing +Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby + + +# Reference +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/old_legacy_scripts/README_old2.md b/old_legacy_scripts/README_old2.md new file mode 100644 index 0000000..aca66fc --- /dev/null +++ b/old_legacy_scripts/README_old2.md @@ -0,0 +1,181 @@ +# PathDSP +Explainable Drug Sensitivity Prediction through Cancer Pathway Enrichment Scores + +# Download benchmark data + +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ + +``` +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +Benchmark data will be downloaded under `process_dir/csa_data/` + +# Example usage with Conda + +Download PathDSP and IMPROVE + +``` +mkdir repo +cd repo +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +git clone -b develop https://github.com/JDACS4C-IMPROVE/IMPROVE.git +``` + +# Download author data + +``` +cd ../ +mkdir author_data +bash repo/PathDSP/download_author_data.sh author_data/ +``` + +Author data will be downloaded under `process_dir/author_data/` +PathDSP will be installed at `process_dir/repo/PathDSP` +IMPROVE will be installed at `process_dir/repo/IMPROVE` + +Create environment + +``` +cd repo/PathDSP/ +conda env create -f environment_082223.yml -n PathDSP_env +``` + +Activate environment + +``` +conda activate PathDSP_env +``` + +Install CANDLE package + +``` +pip install git+https://github.com/ECP-CANDLE/candle_lib@develop +``` + +Define enviroment variabels + +``` +improve_lib="/path/to/IMPROVE/repo/" +pathdsp_lib="/path/to/pathdsp/repo/" +# notice the extra PathDSP folder after pathdsp_lib +export PYTHONPATH=$PYTHONPATH:${improve_lib}:${pathdsp_lib}/PathDSP/ +export IMPROVE_DATA_DIR="/path/to/csa_data/" +export AUTHOR_DATA_DIR="/path/to/author_data/" +``` + +Perform preprocessing step + +``` +# go two upper level +cd ../../ +python repo/PathDSP/PathDSP_preprocess_improve.py +``` + +Train the model + +``` +python repo/PathDSP/PathDSP_train_improve.py +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. 
Parameter definitions can be found at `process_dir/repo/PathDSP/PathDSP_default_model.txt` + +Perform inference on the testing data + +``` +python repo/PathDSP/PathDSP_infer_improve.py +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + +# Example usage with singularity container + +# Download benchmark data + +Download the cross-study analysis (CSA) benchmark data into the model directory from https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/ + +``` +mkdir process_dir +cd process_dir +wget --cut-dirs=7 -P ./ -nH -np -m ftp://ftp.mcs.anl.gov/pub/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data +``` + +# Download author data + +Download model specific data under csa_data/ directory + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/PathDSP.git +bash PathDSP/download_author_data.sh csa_data/ +``` + +Setup Singularity + +``` +git clone -b develop https://github.com/JDACS4C-IMPROVE/Singularity.git +cd Singularity +./setup +source config/improve.env +``` + +Build Singularity from definition file + +``` +singularity build --fakeroot PathDSP.sif definitions/PathDSP.def +``` + +Perform preprocessing using csa benchmarking data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif preprocess.sh /candle_data_dir --ml_data_outdir /candle_data_dir/preprocess_data/ +``` + +Train the model + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif train.sh 0 /candle_data_dir --train_ml_data_dir /candle_data_dir/preprocess_data/ --val_ml_data_dir /candle_data_dir/preprocess_data/ --model_outdir /candle_data_dir/out_model/ +``` + +Metrics regarding validation scores is located at: `${train_ml_data_dir}/val_scores.json` +Final trained model is located at: `${train_ml_data_dir}/model.pt`. + +Perform inference on the testing data + +``` +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir PathDSP.sif infer.sh 0 /candle_data_dir --test_ml_data_dir /candle_data_dir/preprocess_data/ --model_dir /candle_data_dir/out_model/ --infer_outdir /candle_data_dir/out_infer/ +``` + +Metrics regarding test process is located at: `${infer_outdir}/test_scores.json` +Final prediction on testing data is located at: `${infer_outdir}/test_y_data_predicted.csv` + + +# Docs from original authors (below) + +# Requirments + +# Input format + +|drug|cell|feature_1|....|feature_n|drug_response| +|----|----|--------|----|--------|----| +|5-FU|03|0|....|0.02|-2.3| +|5-FU|23|1|....|0.04|-3.4| + +Where feature_1 to feature_n are the pathway enrichment scores and the chemical fingerprint coming from data processing +# Usage: +```python +# run FNN +python ./PathDSP/PathDSP/FNN.py -i input.txt -o ./output_prefix + +Where input.txt should be in the input format shown above. 
+Example input file can be found at https://zenodo.org/record/7532963 +``` +# Data preprocessing +Pathway enrichment scores for categorical data (i.e., mutation, copy number variation, and drug targets) were obtained by running the NetPEA algorithm, which is available at: https://github.com/TangYiChing/NetPEA, while pathway enrichment scores for numeric data (i.e., gene expression) was generated with the single-sample Gene Set Enrichment Analsysis (ssGSEA) available here: https://gseapy.readthedocs.io/en/master/gseapy_example.html#3)-command-line-usage-of-single-sample-gseaby + + +# Reference +Tang, Y.-C., & Gottlieb, A. (2021). Explainable drug sensitivity prediction through cancer pathway enrichment. Scientific Reports, 11(1), 3128. https://doi.org/10.1038/s41598-021-82612-7 \ No newline at end of file diff --git a/old_legacy_scripts/TODO.txt b/old_legacy_scripts/TODO.txt new file mode 100644 index 0000000..7f87e3c --- /dev/null +++ b/old_legacy_scripts/TODO.txt @@ -0,0 +1,6 @@ +1. Expose parameters - give up on private crap +2. Fix params file +3. Document on docs +4. Use R script to make preprocess.sh +5. make train script +6. make infer script diff --git a/old_legacy_scripts/environment.yml b/old_legacy_scripts/environment.yml new file mode 100644 index 0000000..ad9d1d7 --- /dev/null +++ b/old_legacy_scripts/environment.yml @@ -0,0 +1,231 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - anyio=3.6.2 + - appdirs=1.4.4 + - argon2-cffi=21.3.0 + - argon2-cffi-bindings=21.2.0 + - asttokens=2.2.1 + - async-lru=2.0.2 + - attrs=23.1.0 + - babel=2.12.1 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.4 + - beautifulsoup4=4.12.2 + - blas=1.0 + - bleach=6.0.0 + - boost=1.78.0 + - boost-cpp=1.78.0 + - bottleneck=1.3.5 + - brotli=1.0.9 + - brotli-bin=1.0.9 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.05.30 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cloudpickle=2.2.1 + - colorama=0.4.6 + - comm=0.1.3 + - contourpy=1.0.7 + - cryptography=39.0.1 + - cycler=0.11.0 + - debugpy=1.6.7 + - decorator=5.1.1 + - defusedxml=0.7.1 + - entrypoints=0.4 + - executing=1.2.0 + - expat=2.5.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - flit-core=3.9.0 + - fontconfig=2.14.1 + - fonttools=4.39.4 + - freetype=2.12.1 + - gettext=0.21.1 + - giflib=5.2.1 + - glib=2.76.3 + - glib-tools=2.76.3 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.2 + - gseapy=1.0.5 + - icu=72.1 + - idna=3.4 + - importlib-metadata=6.6.0 + - importlib_metadata=6.6.0 + - importlib_resources=5.12.0 + - intel-openmp=2023.1.0 + - ipykernel=6.23.1 + - ipython=8.13.2 + - jedi=0.18.2 + - jinja2=3.1.2 + - joblib=1.2.0 + - jpeg=9e + - json5=0.9.5 + - jsonschema=4.17.3 + - jupyter-lsp=2.1.0 + - jupyter_client=8.2.0 + - jupyter_core=4.12.0 + - jupyter_events=0.6.3 + - jupyter_server=2.5.0 + - jupyter_server_terminals=0.4.4 + - jupyterlab=4.0.0 + - jupyterlab_pygments=0.2.2 + - jupyterlab_server=2.22.1 + - kiwisolver=1.4.4 + - lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libblas=3.9.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcblas=3.9.0 + - libdeflate=1.17 + - libexpat=2.5.0 + - libffi=3.4.4 + - libgcc-ng=12.2.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libglib=2.76.3 + - libiconv=1.17 + - libidn2=2.3.4 + - liblapack=3.9.0 + - libllvm11=11.1.0 + - libpng=1.6.39 + - libsodium=1.0.18 + - 
libstdcxx-ng=12.2.0 + - libtasn1=4.19.0 + - libtiff=4.5.0 + - libunistring=0.9.10 + - libuuid=1.41.5 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.10.4 + - libzlib=1.2.13 + - llvm-openmp=16.0.4 + - llvmlite=0.39.1 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.7.1 + - matplotlib-inline=0.1.6 + - mistune=2.0.5 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - munkres=1.1.4 + - nbclient=0.8.0 + - nbconvert-core=7.4.0 + - nbformat=5.8.0 + - ncurses=6.4 + - nest-asyncio=1.5.6 + - nettle=3.7.3 + - networkx=2.8.4 + - notebook-shim=0.2.3 + - numba=0.56.4 + - numexpr=2.8.4 + - numpy=1.21.6 + - openh264=2.1.1 + - openssl=1.1.1v + - packaging=23.0 + - pandas=1.5.3 + - pandocfilters=1.5.0 + - parso=0.8.3 + - patsy=0.5.3 + - pcre2=10.40 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.4.0 + - pip=23.0.1 + - pixman=0.40.0 + - pkgutil-resolve-name=1.3.10 + - pooch=1.4.0 + - prometheus_client=0.16.0 + - prompt-toolkit=3.0.38 + - prompt_toolkit=3.0.38 + - psutil=5.9.5 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pycairo=1.23.0 + - pycparser=2.21 + - pygments=2.15.1 + - pyopenssl=23.0.0 + - pyparsing=3.0.9 + - pyrsistent=0.19.3 + - pysocks=1.7.1 + - python=3.10.11 + - python-dateutil=2.8.2 + - python-fastjsonschema=2.17.1 + - python-json-logger=2.0.7 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-mutex=1.0 + - pytz=2022.7 + - pyyaml=6.0 + - pyzmq=25.0.2 + - rdkit=2023.03.1 + - readline=8.2 + - reportlab=3.6.13 + - requests=2.29.0 + - rfc3339-validator=0.1.4 + - rfc3986-validator=0.1.1 + - scikit-learn=1.0.2 + - scipy=1.10.1 + - seaborn=0.12.2 + - seaborn-base=0.12.2 + - send2trash=1.8.2 + - setuptools=66.0.0 + - shap=0.41.0 + - six=1.16.0 + - slicer=0.0.7 + - sniffio=1.3.0 + - soupsieve=2.3.2.post1 + - sqlalchemy=1.4.46 + - sqlite=3.41.2 + - stack_data=0.6.2 + - statsmodels=0.14.0 + - sympy=1.11.1 + - tbb=2021.8.0 + - terminado=0.17.1 + - threadpoolctl=3.1.0 + - tinycss2=1.2.1 + - tk=8.6.12 + - tomli=2.0.1 + - torchvision=0.15.2 + - tornado=6.3.2 + - tqdm=4.65.0 + - traitlets=5.9.0 + - typing_extensions=4.5.0 + - tzdata=2023c + - unicodedata2=15.0.0 + - urllib3=1.26.15 + - wcwidth=0.2.6 + - webencodings=0.5.1 + - websocket-client=1.5.2 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.4.2 + - yaml=0.2.5 + - zeromq=4.3.4 + - zipp=3.15.0 + - zlib=1.2.13 + - zstd=1.5.5 diff --git a/old_legacy_scripts/environment_081723.yml b/old_legacy_scripts/environment_081723.yml new file mode 100644 index 0000000..870884e --- /dev/null +++ b/old_legacy_scripts/environment_081723.yml @@ -0,0 +1,156 @@ +name: PathDSP_env +channels: + - bioconda + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - blas=1.0 + - boost=1.74.0 + - boost-cpp=1.74.0 + - bottleneck=1.3.5 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - ca-certificates=2023.7.22 + - cairo=1.16.0 + - certifi=2023.7.22 + - cffi=1.15.1 + - charset-normalizer=2.0.4 + - cryptography=41.0.2 + - cuda-cudart=11.7.99 + - cuda-cupti=11.7.101 + - cuda-libraries=11.7.1 + - cuda-nvrtc=11.7.99 + - cuda-nvtx=11.7.91 + - cuda-runtime=11.7.1 + - cycler=0.11.0 + - ffmpeg=4.3 + - filelock=3.9.0 + - fontconfig=2.14.1 + - freetype=2.10.4 + - giflib=5.2.1 + - glib=2.69.1 + - gmp=6.2.1 + - gmpy2=2.1.2 + - gnutls=3.6.15 + - greenlet=2.0.1 + - gseapy=1.0.5 + - icu=70.1 + - idna=3.4 + - intel-openmp=2023.1.0 + - jbig=2.1 + - jinja2=3.1.2 + - jpeg=9e + - kiwisolver=1.4.4 + - 
lame=3.100 + - lcms2=2.12 + - ld_impl_linux-64=2.38 + - lerc=3.0 + - libcublas=11.10.3.66 + - libcufft=10.7.2.124 + - libcufile=1.7.1.12 + - libcurand=10.3.3.129 + - libcusolver=11.4.0.1 + - libcusparse=11.7.4.91 + - libdeflate=1.8 + - libffi=3.4.4 + - libgcc-ng=13.1.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libiconv=1.16 + - libidn2=2.3.4 + - libnpp=11.7.4.75 + - libnsl=2.0.0 + - libnvjpeg=11.8.0.2 + - libpng=1.6.39 + - libsqlite=3.42.0 + - libstdcxx-ng=11.2.0 + - libtasn1=4.19.0 + - libtiff=4.3.0 + - libunistring=0.9.10 + - libuuid=2.38.1 + - libwebp=1.2.4 + - libwebp-base=1.2.4 + - libxcb=1.15 + - libxml2=2.9.14 + - libzlib=1.2.13 + - llvm-openmp=16.0.6 + - lz4-c=1.9.4 + - markupsafe=2.1.1 + - matplotlib-base=3.4.3 + - mkl=2023.1.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.6 + - mkl_random=1.2.2 + - mpc=1.1.0 + - mpfr=4.0.2 + - mpmath=1.3.0 + - ncurses=6.4 + - nettle=3.7.3 + - networkx=3.1 + - numexpr=2.8.4 + - numpy=1.25.2 + - numpy-base=1.25.2 + - openh264=2.1.1 + - openssl=3.1.2 + - pandas=1.5.3 + - pcre=8.45 + - pillow=9.4.0 + - pip=23.2.1 + - pixman=0.40.0 + - pthread-stubs=0.4 + - pycairo=1.24.0 + - pycparser=2.21 + - pyopenssl=23.2.0 + - pysocks=1.7.1 + - python=3.10.12 + - python-dateutil=2.8.2 + - python_abi=3.10 + - pytorch=2.0.1 + - pytorch-cuda=11.7 + - pytorch-mutex=1.0 + - pytz=2022.7 + - rdkit=2022.03.2 + - readline=8.2 + - reportlab=3.6.12 + - requests=2.31.0 + - scipy=1.11.1 + - setuptools=68.0.0 + - six=1.16.0 + - sqlalchemy=1.4.49 + - sqlite=3.41.2 + - sympy=1.11.1 + - tbb=2021.8.0 + - tk=8.6.12 + - torchaudio=2.0.2 + - torchtriton=2.0.0 + - torchvision=0.15.2 + - tornado=6.3.2 + - typing_extensions=4.7.1 + - tzdata=2023c + - urllib3=1.26.16 + - wheel=0.38.4 + - xorg-libxau=1.0.11 + - xorg-libxdmcp=1.1.3 + - xz=5.2.6 + - zlib=1.2.13 + - zstd=1.5.2 + - pip: + - astropy==5.3.2 + - contourpy==1.1.0 + - fonttools==4.42.0 + - joblib==1.3.2 + - matplotlib==3.7.2 + - packaging==23.1 + - patsy==0.5.3 + - protobuf==3.19.0 + - pyerfa==2.0.0.3 + - pyparsing==3.0.9 + - pyyaml==6.0.1 + - scikit-learn==1.3.0 + - statsmodels==0.14.0 + - threadpoolctl==3.2.0 + - tqdm==4.66.1 diff --git a/old_legacy_scripts/get_test_data.py b/old_legacy_scripts/get_test_data.py new file mode 100644 index 0000000..9de863d --- /dev/null +++ b/old_legacy_scripts/get_test_data.py @@ -0,0 +1,14 @@ +import candle +import os + +# Assumes CANDLE_DATA_DIR is an environment variable +os.environ['CANDLE_DATA_DIR'] = 'tmp/' + +fname='input_txt_Nick.txt' +origin='http://chia.team/IMPROVE_data/input_txt_Nick.txt' + +# Download and unpack the data in CANDLE_DATA_DIR +candle.file_utils.get_file(fname, origin) + +# Do it again to confirm it's not re-downloading +#candle.file_utils.get_file(fname, origin) diff --git a/old_legacy_scripts/improve_utils.py b/old_legacy_scripts/improve_utils.py new file mode 100644 index 0000000..1956ee1 --- /dev/null +++ b/old_legacy_scripts/improve_utils.py @@ -0,0 +1,735 @@ +import os +import numpy as np +import pandas as pd +from pathlib import Path, PosixPath +from math import sqrt +from scipy import stats +from typing import List, Union, Optional, Tuple + + +fdir = Path(__file__).resolve().parent + + +# ----------------------------------------------------------------------------- +# TODO +# Note! +# We need to decide how this utils file will be provided for each model. +# Meanwhile, place this .py file in the level as your data preprocessing script. 
+# For example: +# GraphDRP/ +# |_______ preprocess.py +# |_______ improve_utils.py +# | +# | +# ----------------------------------------------------------------------------- + + + +# ----------------------------------------------------------------------------- +# Global variables +# ---------------- +# These are globals for all models +import types +improve_globals = types.SimpleNamespace() + +# TODO: +# This is CANDLE_DATA_DIR (or something...). +# How this is going to be passed to the code? +improve_globals.main_data_dir = PosixPath(os.environ.get("CANDLE_DATA_DIR") + "/csa_data/") +# improve_globals.main_data_dir = fdir/"improve_data_dir" +# imp_globals.main_data_dir = fdir/"candle_data_dir" + +# Dir names corresponding to the primary input/output blocks in the pipeline +# {}: input/output +# []: process +# train path: {raw_data} --> [preprocess] --> {ml_data} --> [train] --> {models} +# inference path: {ml_data, models} --> [inference] --> {infer} +improve_globals.raw_data_dir_name = "raw_data" # benchmark data +improve_globals.ml_data_dir_name = "ml_data" # preprocessed data for a specific ML model +improve_globals.models_dir_name = "models" # output from model training +improve_globals.infer_dir_name = "infer" # output from model inference (testing) + +# Secondary dirs in raw_data +improve_globals.x_data_dir_name = "x_data" # feature data +improve_globals.y_data_dir_name = "y_data" # target data +improve_globals.splits_dir_name = "splits" # splits files + +# Column names in the raw data files +# imp_globals.canc_col_name = "CancID" +# imp_globals.drug_col_name = "DrugID" +improve_globals.canc_col_name = "improve_sample_id" # column name that contains the cancer sample ids TODO: rename to sample_col_name +improve_globals.drug_col_name = "improve_chem_id" # column name that contains the drug ids +improve_globals.source_col_name = "source" # column name that contains source/study names (CCLE, GDSCv1, etc.) 
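# Illustrative path layout (a sketch of how the derived globals below resolve,
# assuming CANDLE_DATA_DIR is set):
#   $CANDLE_DATA_DIR/csa_data/raw_data/y_data/response.tsv
#   $CANDLE_DATA_DIR/csa_data/raw_data/x_data/cancer_gene_expression.tsv
#   $CANDLE_DATA_DIR/csa_data/raw_data/splits/<source>_split_<k>_<train|val|test>.txt
# Every loader in this module reads from this fixed layout.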
+improve_globals.pred_col_name_suffix = "_pred" # suffix to predictions col name (example of final col name: auc_pred) + +# Response data file name +improve_globals.y_file_name = "response.tsv" # response data + +# Cancer sample features file names +improve_globals.copy_number_fname = "cancer_copy_number.tsv" # cancer feature +improve_globals.discretized_copy_number_fname = "cancer_discretized_copy_number.tsv" # cancer feature +improve_globals.dna_methylation_fname = "cancer_DNA_methylation.tsv" # cancer feature +improve_globals.gene_expression_fname = "cancer_gene_expression.tsv" # cancer feature +improve_globals.miRNA_expression_fname = "cancer_miRNA_expression.tsv" # cancer feature +improve_globals.mutation_count_fname = "cancer_mutation_count.tsv" # cancer feature +improve_globals.mutation_fname = "cancer_mutation.tsv" # cancer feature +improve_globals.rppa_fname = "cancer_RPPA.tsv" # cancer feature + +# Drug features file names +improve_globals.smiles_file_name = "drug_SMILES.tsv" # drug feature +improve_globals.mordred_file_name = "drug_mordred.tsv" # drug feature +improve_globals.ecfp4_512bit_file_name = "drug_ecfp4_512bit.tsv" # drug feature + +# Globals derived from the ones defined above +improve_globals.raw_data_dir = improve_globals.main_data_dir/improve_globals.raw_data_dir_name # raw_data +improve_globals.ml_data_dir = improve_globals.main_data_dir/improve_globals.ml_data_dir_name # ml_data +improve_globals.models_dir = improve_globals.main_data_dir/improve_globals.models_dir_name # models +improve_globals.infer_dir = improve_globals.main_data_dir/improve_globals.infer_dir_name # infer +# ----- +improve_globals.x_data_dir = improve_globals.raw_data_dir/improve_globals.x_data_dir_name # x_data +improve_globals.y_data_dir = improve_globals.raw_data_dir/improve_globals.y_data_dir_name # y_data +improve_globals.splits_dir = improve_globals.raw_data_dir/improve_globals.splits_dir_name # splits + +# Response +improve_globals.y_file_path = improve_globals.y_data_dir/improve_globals.y_file_name # response.txt + +# Cancers +improve_globals.copy_number_file_path = improve_globals.x_data_dir/improve_globals.copy_number_fname # cancer_copy_number.txt +improve_globals.discretized_copy_number_file_path = improve_globals.x_data_dir/improve_globals.discretized_copy_number_fname # cancer_discretized_copy_number.txt +improve_globals.dna_methylation_file_path = improve_globals.x_data_dir/improve_globals.dna_methylation_fname # cancer_DNA_methylation.txt +improve_globals.gene_expression_file_path = improve_globals.x_data_dir/improve_globals.gene_expression_fname # cancer_gene_expression.txt +improve_globals.mirna_expression_file_path = improve_globals.x_data_dir/improve_globals.miRNA_expression_fname # cancer_miRNA_expression.txt +improve_globals.mutation_count_file_path = improve_globals.x_data_dir/improve_globals.mutation_count_fname # cancer_mutation_count.txt +improve_globals.mutation_file_path = improve_globals.x_data_dir/improve_globals.mutation_fname # cancer_mutation.txt +improve_globals.rppa_file_path = improve_globals.x_data_dir/improve_globals.rppa_fname # cancer_RPPA.txt + +# Drugs +improve_globals.smiles_file_path = improve_globals.x_data_dir/improve_globals.smiles_file_name # +improve_globals.mordred_file_path = improve_globals.x_data_dir/improve_globals.mordred_file_name # +improve_globals.ecfp4_512bit_file_path = improve_globals.x_data_dir/improve_globals.ecfp4_512bit_file_name # +# ----------------------------------------------------------------------------- + + +# 
------------------------------------- +# Drug response loaders +# ------------------------------------- + +def load_single_drug_response_data( + # source: Union[str, List[str]], + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: at this point, this func implements the loading a single source + df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # import pdb; pdb.set_trace() + if isinstance(split, int): + # Get a subset of samples + ids = load_split_file(source, split, split_type) + df = df.loc[ids] + else: + # Get the full dataset for a given source + df = df[df[improve_globals.source_col_name].isin([source])] + + cols = [improve_globals.source_col_name, + improve_globals.drug_col_name, + improve_globals.canc_col_name, + y_col_name] + df = df[cols] # [source, drug id, cancer id, response] + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(df[[improve_globals.canc_col_name, improve_globals.drug_col_name]].nunique()) + return df + + +def load_single_drug_response_data_v2( + # source: Union[str, List[str]], + source: str, + # split: Union[int, None]=None, + # split_type: Union[str, List[str], None]=None, + split_file_name: Union[str, List[str], None]=None, + y_col_name: str="auc", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns datarame with cancer ids, drug ids, and drug response values. Samples + from the original drug response file are filtered based on the specified + sources. + + Args: + source (str or list of str): DRP source name (str) or multiple sources (list of strings) + split(int or None): split id (int), None (load all samples) + split_type (str or None): one of the following: 'train', 'val', 'test' + y_col_name (str): name of drug response measure/score (e.g., AUC, IC50) + + Returns: + pd.Dataframe: dataframe that contains drug response values + """ + # TODO: currently, this func implements loading a single data source (CCLE or CTRPv2 or ...) 
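    # Usage sketch for the v1 loader defined above (illustrative; assumes
    # response.tsv and the CCLE split files exist under the y_data/splits layout):
    #
    #   rsp = load_single_drug_response_data(source="CCLE", split=0,
    #                                        split_type="train", y_col_name="auc")
    #   # -> columns [source, improve_chem_id, improve_sample_id, auc],
    #   #    restricted to the row ids listed in CCLE_split_0_train.txt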
+ df = pd.read_csv(improve_globals.y_file_path, sep=sep) + + # Get a subset of samples + if isinstance(split_file_name, list) and len(split_file_name) == 0: + raise ValueError("Empty list is passed via split_file_name.") + if isinstance(split_file_name, str): + split_file_name = [split_file_name] + ids = load_split_ids(split_file_name) + df = df.loc[ids] + # else: + # # Get the full dataset for a given source + # df = df[df[improve_globals.source_col_name].isin([source])] + + # # Get a subset of cols + # cols = [improve_globals.source_col_name, + # improve_globals.drug_col_name, + # improve_globals.canc_col_name, + # y_col_name] + # df = df[cols] # [source, drug id, cancer id, response] + + df = df.reset_index(drop=True) + if verbose: + print(f"Response data: {df.shape}") + print(f"Unique cells: {df[improve_globals.canc_col_name].nunique()}") + print(f"Unique drugs: {df[improve_globals.drug_col_name].nunique()}") + return df + + +def load_split_ids(split_file_name: Union[str, List[str]]) -> List[int]: + """ Returns list of integers, representing the rows in the response dataset. + Args: + split_file_name (str or list of str): splits file name or list of file names + + Returns: + list: list of integers representing the ids + """ + ids = [] + for fname in split_file_name: + fpath = improve_globals.splits_dir/fname + assert fpath.exists(), f"split_file_name {fname} not found." + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +def load_split_file( + source: str, + split: Union[int, None]=None, + split_type: Union[str, List[str], None]=None) -> List[int]: + """ + Args: + source (str): DRP source name (str) + + Returns: + ids (list): list of id integers + """ + # TODO: used in the old version of the rsp loader + if isinstance(split_type, str): + split_type = [split_type] + + # Check if the split file exists and load + ids = [] + for st in split_type: + fpath = improve_globals.splits_dir/f"{source}_split_{split}_{st}.txt" + assert fpath.exists(), f"Splits file not found: {fpath}" + ids_ = pd.read_csv(fpath, header=None)[0].tolist() + ids.extend(ids_) + return ids + + +# ------------------------------------- +# Omic feature loaders +# ------------------------------------- + +""" +Notes about omics data. + +Omics data files are multi-level tables with several column types (generally 3 +or 4), each contains gene names using a different gene identifier system: +Entrez ID, Gene Symbol, Ensembl ID, TSS + +The column levels are not organized in the same order across the different +omic files. + +The level_map dict, in each loader function, encodes the column level and the +corresponding identifier systems. + +For example, in the copy number file the level_map is: +level_map = {"Entrez":0, "Gene_Symbol": 1, "Ensembl": 2} +""" + +def set_col_names_in_multilevel_dataframe( + df: pd.DataFrame, + level_map: dict, + gene_system_identifier: Union[str, List[str]]="Gene_Symbol") -> pd.DataFrame: + """ Util function that supports loading of the omic data files. + Returns the input dataframe with the multi-level column names renamed as + specified by the gene_system_identifier arg. 
+ + Args: + df (pd.DataFrame): omics dataframe + level_map (dict): encodes the column level and the corresponding identifier systems + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: the input dataframe with the specified multi-level column names + """ + df = df.copy() + + level_names = list(level_map.keys()) + level_values = list(level_map.values()) + n_levels = len(level_names) + + if isinstance(gene_system_identifier, list) and len(gene_system_identifier) == 1: + gene_system_identifier = gene_system_identifier[0] + + # print(gene_system_identifier) + # import pdb; pdb.set_trace() + if isinstance(gene_system_identifier, str): + if gene_system_identifier == "all": + df.columns = df.columns.rename(level_names, level=level_values) # assign multi-level col names + else: + df.columns = df.columns.get_level_values(level_map[gene_system_identifier]) # retian specific column level + else: + assert len(gene_system_identifier) <= n_levels, f"'gene_system_identifier' can't contain more than {n_levels} items." + set_diff = list(set(gene_system_identifier).difference(set(level_names))) + assert len(set_diff) == 0, f"Passed unknown gene identifiers: {set_diff}" + kk = {i: level_map[i] for i in level_map if i in gene_system_identifier} + # print(list(kk.keys())) + # print(list(kk.values())) + df.columns = df.columns.rename(list(kk.keys()), level=kk.values()) # assign multi-level col names + drop_levels = list(set(level_map.values()).difference(set(kk.values()))) + df = df.droplevel(level=drop_levels, axis=1) + return df + + +def load_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns copy number data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.copy_number_file_path, sep=sep, index_col=0, header=header) + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + # Test the func + # d0 = set_col_names_in_multilevel_dataframe(df, "all") + # d1 = set_col_names_in_multilevel_dataframe(df, "Ensembl") + # d2 = set_col_names_in_multilevel_dataframe(df, ["Ensembl"]) + # d3 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Gene_Symbol", "Ensembl"]) + # d4 = set_col_names_in_multilevel_dataframe(df, ["Entrez", "Ensembl"]) + # d5 = set_col_names_in_multilevel_dataframe(df, ["Blah", "Ensembl"]) + if verbose: + print(f"Copy number data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_discretized_copy_number_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns discretized copy number data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.discretized_copy_number_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Discretized copy number data: {df.shape}") + + return df + + +def load_dna_methylation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns methylation data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + level_map = {"Ensembl": 2, "Entrez": 1, "Gene_Symbol": 3, "TSS": 0} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.dna_methylation_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"DNA methylation data: {df.shape}") + # print(df.dtypes) # TODO: many column are of type 'object' + # print(df.dtypes.value_counts()) + return df + + +def load_gene_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns gene expression data. + + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 0, "Entrez": 1, "Gene_Symbol": 2} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.gene_expression_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Gene expression data: {df.shape}") + return df + + +def load_mirna_expression_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_mutation_count_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Returns mutation count data. 
+ + Args: + gene_system_identifier (str or list of str): gene identifier system to use + options: "Entrez", "Gene_Symbol", "Ensembl", "all", or any list + combination of ["Entrez", "Gene_Symbol", "Ensembl"] + + Returns: + pd.DataFrame: dataframe with the omic data + """ + # level_map encodes the relationship btw the column and gene identifier system + level_map = {"Ensembl": 2, "Entrez": 0, "Gene_Symbol": 1} + header = [i for i in range(len(level_map))] + + df = pd.read_csv(improve_globals.mutation_count_file_path, sep=sep, index_col=0, header=header) + + df.index.name = improve_globals.canc_col_name # assign index name + df = set_col_names_in_multilevel_dataframe(df, level_map, gene_system_identifier) + if verbose: + print(f"Mutation count data: {df.shape}") + + return df + + +def load_mutation_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + +def load_rppa_data( + gene_system_identifier: Union[str, List[str]]="Gene_Symbol", + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + # TODO + raise NotImplementedError("The function is not implemeted yet.") + return None + + + + +# ------------------------------------- +# Drug feature loaders +# ------------------------------------- + +def load_smiles_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + df = pd.read_csv(improve_globals.smiles_file_path, sep=sep) + + # TODO: updated this after we update the data + df.columns = ["improve_chem_id", "smiles"] + + if verbose: + print(f"SMILES data: {df.shape}") + # print(df.dtypes) + # print(df.dtypes.value_counts()) + return df + + +def load_mordred_descriptor_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Mordred descriptors data. + """ + df = pd.read_csv(improve_globals.mordred_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + if verbose: + print(f"Mordred descriptors data: {df.shape}") + return df + + +def load_morgan_fingerprint_data( + sep: str="\t", + verbose: bool=True) -> pd.DataFrame: + """ + Return Morgan fingerprints data. + """ + df = pd.read_csv(improve_globals.ecfp4_512bit_file_path, sep=sep) + df = df.set_index(improve_globals.drug_col_name) + return df + + +# ------------------------------------- +# Save data functions +# ------------------------------------- + +def save_preds(df: pd.DataFrame, y_col_name: str, + outpath: Union[str, PosixPath], round_decimals: int=4) -> None: + """ Save model predictions. + This function throws errors if the dataframe does not include the expected + columns: canc_col_name, drug_col_name, y_col_name, y_col_name + "_pred" + + Args: + df (pd.DataFrame): df with model predictions + y_col_name (str): drug response col name (e.g., IC50, AUC) + outpath (str or PosixPath): outdir to save the model predictions df + round (int): round response values + + Returns: + None + """ + # Check that the 4 columns exist + assert improve_globals.canc_col_name in df.columns, f"{improve_globals.canc_col_name} was not found in columns." + assert improve_globals.drug_col_name in df.columns, f"{improve_globals.drug_col_name} was not found in columns." + assert y_col_name in df.columns, f"{y_col_name} was not found in columns." 
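    # Illustrative: for y_col_name="auc" the dataframe must carry both the measured
    # column "auc" and the model output column "auc_pred"
    # (y_col_name + pred_col_name_suffix); the assert below enforces that naming.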
+ pred_col_name = y_col_name + f"{improve_globals.pred_col_name_suffix}" + assert pred_col_name in df.columns, f"{pred_col_name} was not found in columns." + + # Round + df = df.round({y_col_name: round_decimals, pred_col_name: round_decimals}) + + # Save preds df + df.to_csv(outpath, index=False) + return None + + + + + + +# ================================================================== +# Leftovers +# ================================================================== +def get_data_splits( + src_raw_data_dir: str, + splitdir_name: str, + split_file_name: str, + rsp_df: pd.DataFrame): + """ + IMPROVE-specific func. + Read smiles data. + src_raw_data_dir : data dir where the raw DRP data is stored + """ + splitdir = src_raw_data_dir/splitdir_name + if len(split_file_name) == 1 and split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + + """ + # Method 1 + splitdir = Path(os.path.join(src_raw_data_dir))/"splits" + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + outdir_name = "full" + else: + # Check if the split file exists and load + ids = [] + split_id_str = [] # e.g. split_5 + split_type_str = [] # e.g. tr, vl, te + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." + with open(splitdir/fname) as f: + # Get the ids + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + # Get the name + fname_sep = fname.split("_") + split_id_str.append("_".join([s for s in fname_sep[:2]])) + split_type_str.append(fname_sep[2]) + assert len(set(split_id_str)) == 1, "Data splits must be from the same dataset source." + split_id_str = list(set(split_id_str))[0] + split_type_str = "_".join([x for x in split_type_str]) + outdir_name = f"{split_id_str}_{split_type_str}" + ML_DATADIR = main_data_dir/"ml_data" + root = ML_DATADIR/f"data.{args.source_data_name}"/outdir_name # ML data + os.makedirs(root, exist_ok=True) + """ + + """ + # Method 2 + splitdir = src_raw_data_dir/args.splitdir_name + if len(args.split_file_name) == 1 and args.split_file_name[0] == "full": + # Full dataset (take all samples) + ids = list(range(rsp_df.shape[0])) + else: + # Check if the split file exists and load + ids = [] + for fname in args.split_file_name: + assert (splitdir/fname).exists(), "split_file_name not found." 
+ with open(splitdir/fname) as f: + ids_ = [int(line.rstrip()) for line in f] + ids.extend(ids_) + """ + return ids + + +def get_common_samples( + df1: pd.DataFrame, + df2: pd.DataFrame, + ref_col: str) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Args: + df1, df2 (pd.DataFrame): dataframes + ref_col (str): the ref column to find the common values + + Returns: + df1, df2 + + Example: + TODO + """ + # Retain (canc, drug) response samples for which we have omic data + common_ids = list(set(df1[ref_col]).intersection(df2[ref_col])) + # print(df1.shape) + df1 = df1[ df1[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df1.shape) + # print(df2.shape) + df2 = df2[ df2[improve_globals.canc_col_name].isin(common_ids) ].reset_index(drop=True) + # print(df2.shape) + return df1, df2 + + +def read_df(fpath: str, sep: str=","): + """ + IMPROVE-specific func. + Load a dataframe. Supports csv and parquet files. + sep : the sepator in the csv file + """ + # TODO: this func might be available in candle + assert Path(fpath).exists(), f"File {fpath} was not found." + if "parquet" in str(fpath): + df = pd.read_parquet(fpath) + else: + df = pd.read_csv(fpath, sep=sep) + return df + + +def get_subset_df(df: pd.DataFrame, ids: list) -> pd.DataFrame: + """ Get a subset of the input dataframe based on row ids.""" + df = df.loc[ids] + return df + + +def rmse(y, f): + rmse = sqrt(((y - f)**2).mean(axis=0)) + return rmse + + +def mse(y, f): + mse = ((y - f)**2).mean(axis=0) + return mse + + +def pearson(y, f): + rp = np.corrcoef(y, f)[0, 1] + return rp + + +def spearman(y, f): + rs = stats.spearmanr(y, f)[0] + return rs + + +def r_square(y_true, y_pred): + from sklearn.metrics import r2_score + return r2_score(y_true, y_pred) diff --git a/old_legacy_scripts/infer.py b/old_legacy_scripts/infer.py new file mode 100755 index 0000000..639c121 --- /dev/null +++ b/old_legacy_scripts/infer.py @@ -0,0 +1,85 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from preprocess_new import mkdir, preprocess +import numpy as np +import pandas as pd +from datetime import datetime +import torch as tch +import torch.utils.data as tchud +import polars as pl +import sklearn.metrics as skmts +#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() + "/PathDSP") +import FNN_new + + +file_path = os.path.dirname(os.path.realpath(__file__)) +required = None +additional_definitions = None + + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. 
+ ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def run(params): + test_df = pl.read_csv(params['test_data'], separator = "\t").to_pandas() + Xtest_arr = test_df.iloc[:, 0:-1].values + ytest_arr = test_df.iloc[:, -1].values + Xtest_arr = np.array(Xtest_arr).astype('float32') + ytest_arr = np.array(ytest_arr).astype('float32') + trained_net = FNN_new.mynet.FNN(Xtest_arr.shape[1]) + trained_net.load_state_dict(tch.load(params['data_dir'] + '/model.pt')) + trained_net.eval() + FNN_new.myutil.set_seed(params["seed_int"]) + device = FNN_new.myutil.get_device(uth=params["gpu_int"]) + test_dataset = FNN_new.mydl.NumpyDataset(tch.from_numpy(Xtest_arr), tch.from_numpy(ytest_arr)) + test_dl = tchud.DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False) + start = datetime.now() + prediction_list = FNN_new.predict(trained_net, test_dl, device) + print('Inference time :[Finished in {:}]'.format(FNN_new.cal_time(datetime.now(), start))) + # evaluation metrics + mse = skmts.mean_squared_error(ytest_arr, prediction_list) + rmse = np.sqrt(mse) + r2_pred = FNN_new.r2_score(ytest_arr, prediction_list) + loss_pred = pd.DataFrame({'metric': ['rmse', 'r2'], + 'value': [rmse, r2_pred]}) + loss_pred.to_csv(params['data_dir'] + '/Loss_pred.txt', header=True, index=False, sep="\t") + ytest_df = test_df.iloc[:, -1].to_frame() + ytest_df['prediction'] = prediction_list + ytest_df.to_csv(params['data_dir'] + '/Prediction.txt', header=True, index=True, sep="\t") + + +def candle_main(): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R b/old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R new file mode 100644 index 0000000..28976f0 --- /dev/null +++ b/old_legacy_scripts/parse_DSP_data_Chia_Jan12_2023.R @@ -0,0 +1,72 @@ +rm(list = ls()) + +setwd("c:/Users/m092469/OneDrive - Mayo Clinic/temp_code/DOE_IMPROVE") + +options(stringsAsFactors = FALSE) + +rDrug <- read.delim( + "data/GDSCv2.Gao2015.Powell2020.Lee2021.GeoSearch.Ding2016.CHEM.256.MBits.txt") + +rGSEA <- read.delim( + "data/GDSCv2.Powell2020.EXP.ssGSEA.txt") + +rPNET <- read.delim( + "data/GDSCv2.Gao2015.Powell2020.Lee2021.GeoSearch.Ding2016.DGNet.NetPEA.txt") + +rReponse0 <- read.delim("data/GDSCv2.resp_PowellAUC.Alias.txt") + +Drug.ID.vec0 <- intersect(rDrug$drug, rReponse0$Therapy) +Cell.ID.vec <- intersect(rGSEA$X, rReponse0$Sample) + +Drug.ID.vec <- intersect(Drug.ID.vec0,rPNET$X) + +sel.Reponse.idx <- which(is.element(rReponse0$Therapy, Drug.ID.vec) & + is.element(rReponse0$Sample, Cell.ID.vec)) + +rReponse <- rReponse0[sel.Reponse.idx, ] + +N.cell <- length(Cell.ID.vec) +N.drug <- length(Drug.ID.vec) +N.comb <- nrow(rReponse) + +head(rDrug[,1:5]) +Drug.fmtx <- data.matrix(rDrug[match(Drug.ID.vec, rDrug$drug), 2: ncol(rDrug)]) +rownames(Drug.fmtx) <- Drug.ID.vec +head(Drug.fmtx[,1:5]) + +Drug.PNEA.fmtx <- data.matrix(rPNET[match(Drug.ID.vec, rPNET$X), 2: ncol(rPNET)]) +rownames(Drug.PNEA.fmtx) <- Drug.ID.vec +head(Drug.PNEA.fmtx[,1:5]) + 
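# The check below verifies that the fingerprint matrix (Drug.fmtx) and the
# drug-pathway matrix (Drug.PNEA.fmtx) are row-aligned by drug ID, since both
# were indexed with Drug.ID.vec; the loop further down concatenates their rows
# per (drug, cell) pair together with the cell ssGSEA features.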
+all(rownames(Drug.fmtx)==rownames(Drug.PNEA.fmtx)) + +Cell.GSEA.mtx <- data.matrix(rGSEA[match(Cell.ID.vec, rGSEA$X), 2: ncol(rGSEA)]) +rownames(Cell.GSEA.mtx) <- Cell.ID.vec +head(Cell.GSEA.mtx[,1:5]) + +N.col <- ncol(Drug.fmtx) + ncol(Drug.PNEA.fmtx) + + ncol(Cell.GSEA.mtx) + 3 # drug, cell & resp +comb.data.mtx <- mat.or.vec(N.comb, N.col) +colnames(comb.data.mtx) <- c("drug", "cell", + paste0("feature",1:(N.col-3)),"resp") + + +for(i in 1:N.comb){ + # i <- 1 + tmp.cell.ID <- rReponse$Sample[i] + tmp.drug.ID <- rReponse$Therapy[i] + comb.data.mtx[i, "drug"] <- tmp.drug.ID + comb.data.mtx[i, "cell"] <- tmp.cell.ID + comb.data.mtx[i, paste0("feature",1:(N.col-3))] <- + c(Drug.fmtx[tmp.drug.ID,], + Drug.PNEA.fmtx[tmp.drug.ID,], + Cell.GSEA.mtx[tmp.cell.ID,]) + response.idx <-which( + rReponse$Therapy==tmp.drug.ID & rReponse$Sample==tmp.cell.ID) + tmp.resp <- rReponse$Response[response.idx] + comb.data.mtx[i, "resp"] <- tmp.resp + +} + +write.table(x = comb.data.mtx, file = "input_txt_Nick.txt", + quote = FALSE, sep = "\t", row.names = FALSE) \ No newline at end of file diff --git a/old_legacy_scripts/preprocess.py b/old_legacy_scripts/preprocess.py new file mode 100644 index 0000000..378898f --- /dev/null +++ b/old_legacy_scripts/preprocess.py @@ -0,0 +1,36 @@ +def preprocess(params): + fname='input_for_Nick.txt' + origin=params['data_url'] + # Download and unpack the data in CANDLE_DATA_DIR + candle.file_utils.get_file(fname, origin) + params['train_data'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['train_data'] + #params['val_data'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['val_data'] + #params['gep_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['gep_filepath'] + #params['smi_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['smi_filepath'] + #params['gene_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['gene_filepath'] + #params['smiles_language_filepath'] = os.environ['CANDLE_DATA_DIR'] + '/common/Data/'+params['smiles_language_filepath'] + """ + params["train_data"] = candle.get_file(params['train_data'], origin, datadir=params['data_dir'], cache_subdir=None) + params["val_data"] = candle.get_file(params['val_data'], origin, datadir=params['data_dir'], cache_subdir=None) + params["gep_filepath"] = candle.get_file(params['gep_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["smi_filepath"] = candle.get_file(params['smi_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["gene_filepath"] = candle.get_file(params['gene_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) + params["smiles_language_filepath"] = candle.get_file(params['smiles_language_filepath'], origin, datadir=params['data_dir'], cache_subdir=None) """ + return params + +def run(params): + params['data_type'] = str(params['data_type']) + with open ((params['output_dir']+'/params.json'), 'w') as outfile: + json.dump(params, outfile) + scores = main(params) + with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: + json.dump(scores, f, ensure_ascii=False, indent=4) + print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) + +def candle_main(): + params = initialize_parameters() + params = preprocess(params) + run(params) + +if __name__ == "__main__": + candle_main() diff --git a/old_legacy_scripts/preprocess_new.py b/old_legacy_scripts/preprocess_new.py new file mode 100644 index 0000000..cec9e6f --- /dev/null +++ 
b/old_legacy_scripts/preprocess_new.py @@ -0,0 +1,403 @@ +import sys +import os +import numpy as np +import polars as pl +#import torch +#import torch.utils.data as du +#from torch.autograd import Variable +#import torch.nn as nn +#import torch.nn.functional as F +#from code.drugcell_NN import * +import argparse +import numpy as np +import pandas as pd +import candle +#import time +#import logging +#import networkx as nx +#import networkx.algorithms.components.connected as nxacc +#import networkx.algorithms.dag as nxadag +#from pathlib import Path +from functools import reduce +import improve_utils +# import RDKit +from rdkit import Chem +from rdkit.Chem import AllChem +from datetime import datetime +# import NetPEA modules +import RWR as rwr +import NetPEA as pea +#import gsea module +import gseapy as gp +import sklearn.model_selection as skms + + +file_path = os.path.dirname(os.path.realpath(__file__)) +#fdir = Path('__file__').resolve().parent +#source = 'csa_data/raw_data/splits/' +required = None +additional_definitions = None + +# This should be set outside as a user environment variable +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. + ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + + +def mkdir(directory): + directories = directory.split('/') + + folder = '' + for d in directories: + folder += d + '/' + if not os.path.exists(folder): + print('creating folder: %s'%folder) + os.mkdir(folder) + + +def preprocess(params, data_dir): + print(os.environ['CANDLE_DATA_DIR']) + #requirements go here + #keys_parsing = ['output_dir', 'hidden', 'result', 'metric', 'data_type'] + if not os.path.exists(data_dir): + mkdir(data_dir) + params['data_dir'] = data_dir + #args = candle.ArgumentStruct(**params) + for i in ['train_data', 'test_data', 'val_data', 'drug_bits_file', 'dgnet_file', + 'mutnet_file', 'cnvnet_file', 'exp_file']: + params[i] = params['data_dir'] + '/' + params[i] + return(params) + +def download_anl_data(params): + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') + splits_dir = os.path.join(csa_data_folder, 'splits') + x_data_dir = os.path.join(csa_data_folder, 'x_data') + y_data_dir = os.path.join(csa_data_folder, 'y_data') + + if not os.path.exists(csa_data_folder): + print('creating folder: %s'%csa_data_folder) + os.makedirs(csa_data_folder) + mkdir(splits_dir) + mkdir(x_data_dir) + mkdir(y_data_dir) + + for improve_file in ['CCLE_all.txt', + 'CCLE_split_' + str(params['split']) + '_test.txt', + 'CCLE_split_' + str(params['split']) + '_train.txt', + 'CCLE_split_' + str(params['split']) + '_val.txt', + 'CTRPv2_all.txt', + 'CTRPv2_split_' + str(params['split']) + '_test.txt', + 'CTRPv2_split_' + str(params['split']) + '_train.txt', + 'CTRPv2_split_' + str(params['split']) + '_val.txt', + 'gCSI_all.txt', + 
'GDSCv1_all.txt', + 'GDSCv2_all.txt' + ]: + url_dir = params['improve_data_url'] + '/splits/' + candle.file_utils.get_file(improve_file, url_dir + improve_file, + datadir=splits_dir, + cache_subdir=None) + + for improve_file in ['cancer_mutation_count.tsv', 'drug_SMILES.tsv', 'drug_info.tsv', 'cancer_discretized_copy_number.tsv', 'cancer_gene_expression.tsv']: + url_dir = params['improve_data_url'] + '/x_data/' + candle.file_utils.get_file(fname=improve_file, origin=url_dir + improve_file, + datadir=x_data_dir, + cache_subdir=None) + + url_dir = params['improve_data_url'] + '/y_data/' + response_file = 'response.tsv' + candle.file_utils.get_file(fname=response_file, origin=url_dir + response_file, + datadir=y_data_dir, + cache_subdir=None) + + ## get gene-set data and string data + for db_file in [params['gene_set'], params['ppi_data'], params['drug_target']]: + candle.file_utils.get_file(db_file, params['data_url'] + '/' +db_file, + datadir=params['data_dir'], + cache_subdir=None) + + + + +# set timer +def cal_time(end, start): + '''return time spent''' + # end = datetime.now(), start = datetime.now() + datetimeFormat = '%Y-%m-%d %H:%M:%S.%f' + spend = datetime.strptime(str(end), datetimeFormat) - \ + datetime.strptime(str(start),datetimeFormat) + return spend + + +def download_author_data(params): + data_download_filepath = candle.file_utils.get_file(params['original_data'], params['original_data_url'] + '/' + params['original_data'], + datadir = params['data_dir'], + cache_subdir = None) + print('download_path: {}'.format(data_download_filepath)) + random_seed = 42 + df = pd.read_csv(params['data_dir'] + "/input.txt", sep='\t') # Modify the separator if needed + df = df.set_index(['drug', 'cell']) + train_data, temp_data = skms.train_test_split(df, test_size=0.2, random_state=random_seed) + val_data, test_data = skms.train_test_split(temp_data, test_size=0.5, random_state=random_seed) + pl.from_pandas(train_data).write_csv(params['train_data'], separator = '\t', has_header = True) + pl.from_pandas(val_data).write_csv(params['val_data'], separator = '\t', has_header = True) + pl.from_pandas(test_data).write_csv(params['test_data'], separator = '\t', has_header = True) + + +def smile2bits(params): + start = datetime.now() + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + smile_df = improve_utils.load_smiles_data() + smile_df.columns = ['drug', 'smile'] + smile_df = smile_df.drop_duplicates(subset=['drug'], keep='first').set_index('drug') + smile_df = smile_df.loc[smile_df.index.isin(rs_all['improve_chem_id']),] + bit_int = params['bit_int'] + record_list = [] + # smile2bits drug by drug + n_drug = 1 + for idx, row in smile_df.iterrows(): + drug = idx + smile = row['smile'] + mol = Chem.MolFromSmiles(smile) + if mol is None: + continue + mbit = list( AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=bit_int) ) + #drug_mbit_dict.update({drug:mbit}) + # append to result + record_list.append( tuple([drug]+mbit) ) + if len(mbit) == bit_int: + n_drug+=1 + print('total {:} drugs with bits'.format(n_drug)) + # convert dict to dataframe + colname_list = ['drug'] + ['mBit_'+str(i) for i in range(bit_int)] + drug_mbit_df = pd.DataFrame.from_records(record_list, columns=colname_list) + #drug_mbit_df = pd.DataFrame.from_dict(drug_mbit_dict, orient='index', columns=colname_list) + #drug_mbit_df.index.name = 'drug' + print('unique 
drugs={:}'.format(len(drug_mbit_df['drug'].unique()))) + # save to file + drug_mbit_df.to_csv(params['drug_bits_file'], header=True, index=False, sep='\t') + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) + +def times_expression(rwr, exp): + ''' + :param rwrDf: dataframe of cell by gene probability matrix + :param expDf: dataframe of cell by gene expression matrix + :return rwr_timesexp_df: dataframe of cell by gene probability matrix, + in which genes are multiplied with expression values + + Note: this function assumes cells are all overlapped while gene maybe not + ''' + cell_list = sorted(list(set(rwr.index) & set(exp.index))) + gene_list = sorted(list(set(rwr.columns)&set(exp.columns))) + + if len(cell_list) == 0: + print('ERROR! no overlapping cell lines') + sys.exit(1) + if len(gene_list) == 0: + print('ERROR! no overlapping genes') + sys.exit(1) + + # multiply with gene expression for overlapping cell, gene + rwr_timesexp = rwr.loc[cell_list, gene_list]*exp.loc[cell_list, gene_list] + + # concat with other gene + out_gene_list = list(set(rwr.columns)-set(gene_list)) + out_df = pd.concat([rwr_timesexp, rwr[out_gene_list]], axis=1) + return out_df + +def run_netpea(params, dtype, multiply_expression): + # timer + start_time = datetime.now() + ppi_path = params['data_dir'] + '/STRING/9606.protein_name.links.v11.0.pkl' + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + log_transform = False + permutation_int = params['permutation_int'] + seed_int = params['seed_int'] + cpu_int = params['cpu_int'] + csa_data_folder = os.path.join(os.environ['CANDLE_DATA_DIR'], 'csa_data', 'raw_data') + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + if dtype == 'DGnet': + drug_info = pd.read_csv(csa_data_folder + '/x_data/drug_info.tsv', sep='\t') + drug_info['NAME'] = drug_info['NAME'].str.upper() + target_info = pd.read_csv(params['data_dir'] + '/data/DB.Drug.Target.txt', sep = '\t') + target_info = target_info.rename(columns={'drug': 'NAME'}) + combined_df = pd.merge(drug_info, target_info, how = 'left', on = 'NAME').dropna(subset=['gene']) + combined_df = combined_df.loc[combined_df['improve_chem_id'].isin(rs_all['improve_chem_id']),] + restart_path = params['data_dir'] + '/drug_target.txt' + combined_df.iloc[:,-2:].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['dgnet_file'] + elif dtype == 'MUTnet': + mutation_data = improve_utils.load_mutation_count_data(gene_system_identifier='Gene_Symbol') + mutation_data = mutation_data.reset_index() + mutation_data = pd.melt(mutation_data, id_vars='improve_sample_id').loc[lambda x: x['value'] > 0] + mutation_data = mutation_data.loc[mutation_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] + restart_path = params['data_dir'] + '/mutation_data.txt' + mutation_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + outpath = params['mutnet_file'] + else: + cnv_data = improve_utils.load_discretized_copy_number_data(gene_system_identifier='Gene_Symbol') + cnv_data = cnv_data.reset_index() + cnv_data = pd.melt(cnv_data, id_vars='improve_sample_id').loc[lambda x: x['value'] != 0] + cnv_data = cnv_data.loc[cnv_data['improve_sample_id'].isin(rs_all['improve_sample_id']),] + restart_path = params['data_dir'] + '/cnv_data.txt' + cnv_data.iloc[:,0:2].to_csv(restart_path, sep = '\t', header= True, index=False) + 
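        # The (id, gene) pairs written to restart_path above seed the random walk
        # with restart below: drug targets for DGnet, mutated genes for MUTnet,
        # and genes with a non-zero discretized copy number for CNVnet.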
outpath = params['cnvnet_file'] + # perform Random Walk + print(datetime.now(), 'performing random walk with restart') + rwr_df = rwr.RWR(ppi_path, restart_path, restartProbFloat=0.5, convergenceFloat=0.00001, normalize='l1', weighted=True).get_prob() + # multiply with gene expression + if multiply_expression: + print(datetime.now(), 'multiplying gene expression with random walk probability for genes were expressed') + exp_df = improve_utils.load_gene_expression_data(gene_system_identifier='Gene_Symbol') + rwr_df = times_expression(rwr_df, exp_df) + #rwr_df.to_csv(out_path+'.RWR.txt', header=True, index=True, sep='\t') + # perform Pathwa Enrichment Analysis + print(datetime.now(), 'performing network-based pathway enrichment') + cell_pathway_df = pea.NetPEA(rwr_df, pathway_path, log_transform=log_transform, permutation=permutation_int, seed=seed_int, n_cpu=cpu_int, out_path=outpath) + print( '[Finished in {:}]'.format(cal_time(datetime.now(), start_time)) ) + +def prep_input(params): + # Read data files + drug_mbit_df = pd.read_csv(params['drug_bits_file'], sep = '\t', index_col=0) + drug_mbit_df = drug_mbit_df.reset_index().rename(columns={'drug': 'drug_id'}) + DGnet = pd.read_csv(params['dgnet_file'], sep='\t', index_col=0) + DGnet = DGnet.add_suffix('_dgnet').reset_index().rename(columns={'index': 'drug_id'}) + CNVnet = pd.read_csv(params['cnvnet_file'], sep= '\t',index_col=0) + CNVnet = CNVnet.add_suffix('_cnvnet').reset_index().rename(columns={'index': 'sample_id'}) + MUTnet = pd.read_csv(params['mutnet_file'], sep='\t',index_col=0) + MUTnet = MUTnet.add_suffix('_mutnet').reset_index().rename(columns={'index': 'sample_id'}) + EXP = pd.read_csv(params['exp_file'], sep = '\t', index_col=0) + EXP = EXP.add_suffix('_exp').reset_index().rename(columns={'index': 'sample_id'}) + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + split_type=['train', 'test', 'val'], + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + # Extract relevant IDs + + common_drug_ids = reduce(np.intersect1d, (drug_mbit_df['drug_id'], DGnet['drug_id'], response_df['drug_id'])) + common_sample_ids = reduce(np.intersect1d, (CNVnet['sample_id'], MUTnet['sample_id'], EXP['sample_id'] , response_df['sample_id'])) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + (response_df['sample_id'].isin(common_sample_ids)), :] + drug_mbit_df = drug_mbit_df.loc[drug_mbit_df['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + DGnet = DGnet.loc[DGnet['drug_id'].isin(common_drug_ids), :].set_index('drug_id').sort_index() + CNVnet = CNVnet.loc[CNVnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + MUTnet = MUTnet.loc[MUTnet['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + EXP = EXP.loc[EXP['sample_id'].isin(common_sample_ids), :].set_index('sample_id').sort_index() + + drug_data = drug_mbit_df.join(DGnet) + sample_data = CNVnet.join([MUTnet, EXP]) + ## export train,val,test set + for i in ['train', 'test', 'val']: + response_df = improve_utils.load_single_drug_response_data(source=params['data_type'], split=params['split'], + split_type=i, + y_col_name= params['metric']) + response_df = response_df.rename(columns={'improve_chem_id': 'drug_id', 'improve_sample_id': 'sample_id'}) + response_df = response_df.loc[(response_df['drug_id'].isin(common_drug_ids)) & + 
(response_df['sample_id'].isin(common_sample_ids)), :] + comb_data_mtx = pd.DataFrame({'drug_id': response_df['drug_id'].values, + 'sample_id': response_df['sample_id'].values}) + comb_data_mtx = comb_data_mtx.set_index(['drug_id', 'sample_id']).join(drug_data, on = 'drug_id').join(sample_data, on = 'sample_id') + comb_data_mtx['response'] = response_df[params['metric']].values + comb_data_mtx = comb_data_mtx.dropna() + pl.from_pandas(comb_data_mtx).write_csv(params[i + '_data'], separator = '\t', has_header = True) + + +def run_ssgsea(params): + expMat = improve_utils.load_gene_expression_data(sep='\t') + rs_all = improve_utils.load_single_drug_response_data(source=params['data_type'], + split=params['split'], split_type=["train", "test", "val"], + y_col_name=params['metric']) + expMat = expMat.loc[expMat.index.isin(rs_all['improve_sample_id']),] + gct = expMat.T # gene (rows) cell lines (columns) + pathway_path = params['data_dir'] + '/MSigdb/union.c2.cp.pid.reactome.v7.2.symbols.gmt' + gmt = pathway_path + tmp_str = params['data_dir'] + + if not os.path.isdir(tmp_str): + os.mkdir(tmp_str) + + # run enrichment + ssgsea = gp.ssgsea(data=gct, #gct: a matrix of gene by sample + gene_sets=gmt, #gmt format + outdir=tmp_str, + scale=True, + permutation_num=0, #1000 + no_plot=True, + processes=params['cpu_int'], + #min_size=0, + format='png') + + result_mat = ssgsea.res2d.T # get the normalized enrichment score (i.e., NES) + result_mat.to_csv(tmp_str+'ssGSEA.txt', header=True, index=True, sep="\t") + + f = open(tmp_str+'ssGSEA.txt', 'r') + lines = f.readlines() + total_dict = {} + for cell in set(lines[1].split()): + total_dict[cell] = {} + cell_lines = lines[1].split() + vals = lines[4].split() + for i, pathway in enumerate((lines[2].split())): + if i > 0: + total_dict[cell_lines[i]][pathway] = float(vals[i]) + df = pd.DataFrame(total_dict) + df.T.to_csv(params['exp_file'], header=True, index=True, sep="\t") + + +def candle_main(anl): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + if params['improve_analysis'] == 'yes' or anl == 1: + download_anl_data(params) + print('convert drug to bits.') + smile2bits(params) + print('compute DGnet.') + run_netpea(params, dtype = 'DGnet', multiply_expression=False) + print('compute MUTnet.') + run_netpea(params, dtype = 'MUTnet', multiply_expression=True) + print('compute CNVnet.') + run_netpea(params, dtype = 'CNVnet', multiply_expression=True) + print('compute EXP.') + run_ssgsea(params) + print('prepare final input file.') + prep_input(params) + else: + download_author_data(params) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-a', dest='anl', type=int, default=0, help='''whether to perform preprocessing using anl data or directly use processed + data from the original paper, default to 0 to use processed data from original paper''') + args = parser.parse_args() + start = datetime.now() + candle_main(args.anl) + print('[Finished in {:}]'.format(cal_time(datetime.now(), start))) diff --git a/old_legacy_scripts/train.py b/old_legacy_scripts/train.py new file mode 100644 index 0000000..fb18f9d --- /dev/null +++ b/old_legacy_scripts/train.py @@ -0,0 +1,74 @@ +import candle +import os +import sys +#import json +#from json import JSONEncoder +from preprocess_new import mkdir, preprocess +#sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append("/usr/local/PathDSP/PathDSP") +sys.path.append(os.getcwd() 
+ "/PathDSP") +import FNN_new + +file_path = os.path.dirname(os.path.realpath(__file__)) +# This should be set outside as a user environment variable +#os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir/' +required = None +additional_definitions = None + +# initialize class +class PathDSP_candle(candle.Benchmark): + def set_locals(self): + ''' + Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the benchmark. + ''' + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def initialize_parameters(): + preprocessor_bmk = PathDSP_candle(file_path, + 'PathDSP_params.txt', + 'pytorch', + prog='PathDSP_candle', + desc='Data Preprocessor' + ) + #Initialize parameters + gParameters = candle.finalize_parameters(preprocessor_bmk) + return gParameters + +# class CustomData: +# def __init__(self, name, value): +# self.name = name +# self.value = value + +# class CustomEncoder(json.JSONEncoder): +# def default(self, o): +# return o.__dict__ + + +# def run(params): +# params['data_type'] = str(params['data_type']) +# json_out = params['output_dir']+'/params.json' +# print(params) + +# with open (json_out, 'w') as fp: +# json.dump(params, fp, indent=4, cls=CustomEncoder) + +# scores = main(params) +# with open(params['output_dir'] + "/scores.json", "w", encoding="utf-8") as f: +# json.dump(scores, f, ensure_ascii=False, indent=4) +# # print('IMPROVE_RESULT RMSE:\t' + str(scores['rmse'])) + + +def candle_main(): + params = initialize_parameters() + data_dir = os.environ['CANDLE_DATA_DIR'] + '/' + '/Data/' + params = preprocess(params, data_dir) + FNN_new.main(params) + + +if __name__ == "__main__": + candle_main() diff --git a/preprocess.sh b/preprocess.sh new file mode 100755 index 0000000..4da8d87 --- /dev/null +++ b/preprocess.sh @@ -0,0 +1,8 @@ +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_STAGE=preprocess +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_${IMPROVE_STAGE}_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/set_affinity_gpu_polaris.sh b/set_affinity_gpu_polaris.sh new file mode 100755 index 0000000..f3d4915 --- /dev/null +++ b/set_affinity_gpu_polaris.sh @@ -0,0 +1,6 @@ +#!/bin/bash +num_gpus=4 +gpu=$((${PMI_LOCAL_RANK} % ${num_gpus})) +export CUDA_VISIBLE_DEVICES=$gpu +echo “RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}” +exec "$@" diff --git a/setup_improve.sh b/setup_improve.sh new file mode 100644 index 0000000..8fad5bc --- /dev/null +++ b/setup_improve.sh @@ -0,0 +1,58 @@ +#!/bin/bash --login +# Navigate to the dir with the cloned model repo +# Run it like this: source ./setup_improve.sh + +# set -e + +# Get current dir and model dir +model_path=$PWD +echo "Model path: $model_path" +model_name=$(echo "$model_path" | awk -F '/' '{print $NF}') +echo "Model name: $model_name" + +# Download data (if needed) +data_dir="csa_data" +if [ ! -d $PWD/$data_dir/ ]; then + echo "Download CSA data" + source download_csa.sh +else + echo "CSA data folder already exists" +fi + +# Download author data (if needed) - PathDSP specific +author_dir="author_data" +if [ ! 
-d $PWD/$author_dir/ ]; then + echo "Download author data" + mkdir author_data + source download_author_data.sh author_data/ +else + echo "Author data folder already exists" +fi + +# Env var IMPROVE_DATA_DIR +export IMPROVE_DATA_DIR="./$data_dir/" + +# Env var AUTHOR_DATA_DIR - PathDSP specific +export AUTHOR_DATA_DIR="./$author_dir/" + +# Clone IMPROVE lib (if needed) and checkout the branch/tag +cd ../ +improve_lib_path=$PWD/IMPROVE +# improve_branch="develop" +improve_branch="v0.1.0" +if [ -d $improve_lib_path ]; then + echo "IMPROVE repo exists in ${improve_lib_path}" +else + git clone https://github.com/JDACS4C-IMPROVE/IMPROVE +fi +cd IMPROVE +git checkout -f $improve_branch +cd ../$model_name + +# Env var PYTHOPATH +export PYTHONPATH=$PYTHONPATH:$improve_lib_path + +echo +echo "IMPROVE_DATA_DIR: $IMPROVE_DATA_DIR" +echo "AUTHOR_DATA_DIR: $AUTHOR_DATA_DIR" +echo "PYTHONPATH: $PYTHONPATH" diff --git a/subprocess_train_singularity.sh b/subprocess_train_singularity.sh new file mode 100755 index 0000000..a700cb7 --- /dev/null +++ b/subprocess_train_singularity.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 +# CUDA_VISIBLE_DEVICES=5 bash subprocess_train.sh ml_data/CCLE-CCLE/split_0 ml_data/CCLE-CCLE/split_0 out_model/CCLE/split_0 + +# Need to comment this when using ' eval "$(conda shell.bash hook)" ' +# set -e + +# Activate conda env for model using "conda activate myenv" +# https://saturncloud.io/blog/activating-conda-environments-from-scripts-a-guide-for-data-scientists +# https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script +# This doesn't work w/o eval "$(conda shell.bash hook)" +#CONDA_ENV=$PathDSP_env +#echo "Allow conda commands in shell script by running 'conda shell.bash hook'" +#eval "$(conda shell.bash hook)" +#echo "Activated conda commands in shell script" +#conda activate $CONDA_ENV +#source activate $CONDA_ENV +#conda_path=${dirname `which conda`} +#source $conda_path/activate $CONDA_ENV +#source activate $CONDA_ENV +#echo "Activated conda env $CONDA_ENV" + +train_ml_data_dir=$1 +val_ml_data_dir=$2 +model_outdir=$3 +learning_rate=$4 +batch_size=$5 +dropout=$6 +CUDA_VISIBLE_DEVICES=$7 + +echo "train_ml_data_dir: $train_ml_data_dir" +echo "val_ml_data_dir: $val_ml_data_dir" +echo "model_outdir: $model_outdir" +echo "learning_rate: $learning_rate" +echo "batch_size: $batch_size" +echo "dropout: $dropout" +echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" + +# epochs=10 +epochs=50 +# epochs=50 + +# All train outputs are saved in params["model_outdir"] +#CUDA_VISIBLE_DEVICES=6,7 python PathDSP_train_improve.py \ +#CUDA_VISIBLE_DEVICES=5 +#CUDA_VISIBLE_DEVICES=6,7 +# python PathDSP_train_improve.py \ +# --train_ml_data_dir $train_ml_data_dir \ +# --val_ml_data_dir $val_ml_data_dir \ +# --model_outdir $model_outdir \ +# --epochs $epochs + +#conda deactivate +#source $conda_path/deactivate +#echo "Deactivated conda env $CONDA_ENV" + +echo "train using singularity container" +singularity exec --nv --bind ${IMPROVE_DATA_DIR}:/candle_data_dir $PathDSP_sif train.sh ${CUDA_VISIBLE_DEVICES} /candle_data_dir --train_ml_data_dir /candle_data_dir/$train_ml_data_dir --val_ml_data_dir /candle_data_dir/$val_ml_data_dir/ --model_outdir /candle_data_dir/$model_outdir/ --epochs $epochs --learning_rate $learning_rate --batch_size $batch_size --dropout $dropout + diff --git a/train.sh b/train.sh new file mode 100755 index 0000000..a10d6e2 --- /dev/null +++ b/train.sh 
@@ -0,0 +1,7 @@ +IMPROVE_MODEL_NAME=PathDSP +IMPROVE_MODEL_SCRIPT=${IMPROVE_MODEL_NAME}_train_improve.py + +# Set env if CANDLE_MODEL is not in same directory as this script +IMPROVE_MODEL_DIR=${IMPROVE_MODEL_DIR:-$( dirname -- "$0" )} + +python $IMPROVE_MODEL_DIR/$IMPROVE_MODEL_SCRIPT $@ diff --git a/workflow_csa.py b/workflow_csa.py new file mode 100644 index 0000000..7063101 --- /dev/null +++ b/workflow_csa.py @@ -0,0 +1,235 @@ +import json +import logging +import sys +from pathlib import Path +from typing import Sequence, Tuple, Union + +import parsl +from parsl import python_app +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import LocalProvider + +import csa_params_def as CSA +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# Initialize parameters for CSA +additional_definitions = CSA.additional_definitions +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_params.ini", + additional_definitions=additional_definitions +) + +##### CONFIG FOR LAMBDA ###### +#available_accelerators: Union[int, Sequence[str]] = 8 +worker_port_range: Tuple[int, int] = (10000, 20000) +retries: int = 1 + +config_lambda = Config( + retries=retries, + executors=[ + HighThroughputExecutor( + address='127.0.0.1', + label="htex", + cpu_affinity="block", + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? + worker_debug=True, + worker_port_range=worker_port_range, + provider=LocalProvider( + init_blocks=1, + max_blocks=1, + ), + available_accelerators=params['available_accelerators'], + ) + ], + strategy='simple', +) + +parsl.clear() +parsl.load(config_lambda) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +logger = logging.getLogger(f'Start workflow') + +############################################################################## +################################ PARSL APPS ################################## +############################################################################## + +@python_app +def train(params, hp_model, source_data_name, split): + """ parsl implementation of training stage using python_app. 
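+ Expects `params` to provide 'model_dir', 'ml_data_dir', 'target_datasets', 'epochs', 'y_col_name', 'use_singularity', and the singularity image or conda settings used below; `hp_model` maps each source dataset name to its tuned hyperparameters ('learning_rate', 'batch_size').
+ Returns a dict with the source dataset name and split id so the infer stage can reuse them.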
""" + import json + import subprocess + import time + from pathlib import Path + + hp = hp_model[source_data_name] + if hp.__len__() == 0: + raise Exception(str('Hyperparameters are not defined for ' + source_data_name)) + + model_dir = params['model_dir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{params['target_datasets'][0]}"/ f"split_{split}" + + if model_dir.exists() is False: + print("\nTrain") + print(f"ml_data_dir: {ml_data_dir}") + print(f"model_dir: {model_dir}") + start = time.time() + if params['use_singularity']: + train_run = ["singularity", "exec", "--nv", + params['singularity_image'], "train.sh", + str("--input_dir " + str(ml_data_dir)), + str("--output_dir " + str(model_dir)), + str("--epochs " + str(params['epochs'])), + str("--y_col_name " + str(params['y_col_name'])), + str("--learning_rate " + str(hp['learning_rate'])), + str("--batch_size " + str(hp['batch_size'])) + ] + else: + train_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['train_python_script'], + "--input_dir", str(ml_data_dir), + "--output_dir", str(model_dir), + "--epochs", str(params['epochs']), # DL-specific + "--y_col_name", str(params['y_col_name']), + "--learning_rate", str(hp['learning_rate']), + "--batch_size", str(hp['batch_size']) + ] + + result = subprocess.run(train_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = model_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = model_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return {'source_data_name': source_data_name, 'split': split} + +@python_app +def infer(params, source_data_name, target_data_name, split): + """ parsl implementation of inferece stage using python_app. 
""" + import subprocess + import json + import time + from pathlib import Path + + model_dir = params['model_dir'] / f"{source_data_name}" / f"split_{split}" + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + infer_dir = params['infer_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + + print("\nInfer") + start = time.time() + if params['use_singularity']: + infer_run = ["singularity", "exec", "--nv", + params['singularity_image'], "infer.sh", + str("--input_data_dir " + str(ml_data_dir)), + str("--input_model_dir " + str(model_dir)), + str("--output_dir " + str(infer_dir)), + str("--calc_infer_scores "+ "true"), + str("--y_col_name " + str(params['y_col_name'])) + ] + else: + infer_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['infer_python_script'], + "--input_data_dir", str(ml_data_dir), + "--input_model_dir", str(model_dir), + "--output_dir", str(infer_dir), + "--calc_infer_scores", "true", + "--y_col_name", str(params['y_col_name']) + ] + + result = subprocess.run(infer_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = infer_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = infer_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return True + +############################### +####### CSA PARAMETERS ######## +############################### + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +y_col_name = params['y_col_name'] +logger = logging.getLogger(f"{params['model_name']}") + +#Output directories for preprocess, train and infer +params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' +params['model_dir'] = Path(params['output_dir']) / 'models' +params['infer_dir'] = Path(params['output_dir']) / 'infer' + +#Model scripts +params['train_python_script'] = f"{params['model_name']}_train_improve.py" +params['infer_python_script'] = f"{params['model_name']}_infer_improve.py" + +#Read Hyperparameters file +with open(params['hyperparameters_file']) as f: + hp = json.load(f) +hp_model = hp[params['model_name']] + +########################################################################## +##################### START PARSL PARALLEL EXECUTION ##################### +########################################################################## + +##Train execution with Parsl +train_futures = [] +for source_data_name in params['source_datasets']: + for split in params['split']: + train_futures.append(train(params, hp_model, source_data_name, split)) + +##Infer execution with Parsl +infer_futures = [] +for future_t in train_futures: + for target_data_name in params['target_datasets']: + infer_futures.append(infer(params, future_t.result()['source_data_name'], target_data_name, future_t.result()['split'])) + +for future_i in infer_futures: + print(future_i.result()) + +parsl.dfk().cleanup() \ No newline at end of file diff --git a/workflow_preprocess.py b/workflow_preprocess.py new file mode 100644 index 0000000..d0e14bd --- /dev/null 
+++ b/workflow_preprocess.py @@ -0,0 +1,211 @@ +import json +import logging +import sys +from pathlib import Path +from typing import Sequence, Tuple, Union + +import parsl +from parsl import python_app +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import LocalProvider + +import csa_params_def as CSA +from improvelib.applications.drug_response_prediction.config import DRPPreprocessConfig + +# Initialize parameters for CSA +additional_definitions = CSA.additional_definitions +filepath = Path(__file__).resolve().parent +cfg = DRPPreprocessConfig() +params = cfg.initialize_parameters( + pathToModelDir=filepath, + default_config="csa_params.ini", + additional_definitions=additional_definitions +) + +##### CONFIG FOR LAMBDA ###### +#available_accelerators: Union[int, Sequence[str]] = 8 +worker_port_range: Tuple[int, int] = (10000, 20000) +retries: int = 1 + +config_lambda = Config( + retries=retries, + executors=[ + HighThroughputExecutor( + address='127.0.0.1', + label="htex_preprocess", + cpu_affinity="alternating", + #max_workers_per_node=2, ## IS NOT SUPPORTED IN Parsl version: 2023.06.19. CHECK HOW TO USE THIS??? + worker_debug=True, + worker_port_range=worker_port_range, + provider=LocalProvider( + init_blocks=1, + max_blocks=1, + ), + ) + ], + strategy='simple', +) + +parsl.clear() +parsl.load(config_lambda) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +logger = logging.getLogger(f'Start workflow') + +############################################################################## +################################ PARSL APPS ################################## +############################################################################## + +@python_app +def preprocess(inputs=[]): + """ parsl implementation of preprocessing stage using python_app. """ + import json + import subprocess + import time + import warnings + from pathlib import Path + + def build_split_fname(source_data_name, split, phase): + """ Build split file name. If file does not exist continue """ + if split=='all': + return f"{source_data_name}_{split}.txt" + return f"{source_data_name}_split_{split}_{phase}.txt" + + # python_app inputs + params = inputs[0] + source_data_name = inputs[1] + split = inputs[2] + + split_nums = params['split'] + + # Get the split file paths + if len(split_nums) == 0: + # Get all splits + split_files = list((params['splits_path']).glob(f"{source_data_name}_split_*.txt")) + split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files] + split_nums = sorted(set(split_nums)) + else: + split_files = [] + for s in split_nums: + split_files.extend(list((params['splits_path']).glob(f"{source_data_name}_split_{s}_*.txt"))) + files_joined = [str(s) for s in split_files] + + print(f"Split id {split} out of {len(split_nums)} splits.") + # Check that train, val, and test are available. Otherwise, continue to the next split. 
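+ # files_joined lists the split files found on disk; a missing phase file only triggers a warning here, and preprocessing for this split still proceeds.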
+ for phase in ["train", "val", "test"]: + fname = build_split_fname(source_data_name, split, phase) + if fname not in "\t".join(files_joined): + warnings.warn(f"\nThe {phase} split file {fname} is missing \ + (continue to next split)") + continue + + for target_data_name in params['target_datasets']: + + if params['only_cross_study'] and (source_data_name == target_data_name): + continue # only cross-study + print(f"\nSource data: {source_data_name}") + print(f"Target data: {target_data_name}") + + ml_data_dir = params['ml_data_dir'] / \ + f"{source_data_name}-{target_data_name}" / f"split_{split}" + if ml_data_dir.exists() is True: + continue + + if source_data_name == target_data_name: + # If source and target are the same, then infer on the test split + test_split_file = f"{source_data_name}_split_{split}_test.txt" + else: + # If source and target are different, then infer on the entire + # target dataset + test_split_file = f"{target_data_name}_all.txt" + + # Preprocess data + print("\nPreprocessing") + train_split_file = f"{source_data_name}_split_{split}_train.txt" + val_split_file = f"{source_data_name}_split_{split}_val.txt" + print(f"train_split_file: {train_split_file}") + print(f"val_split_file: {val_split_file}") + print(f"test_split_file: {test_split_file}") + start = time.time() + if params['use_singularity']: + preprocess_run = ["singularity", "exec", "--nv", + params['singularity_image'], "preprocess.sh", + str("--train_split_file " + str(train_split_file)), + str("--val_split_file " + str(val_split_file)), + str("--test_split_file " + str(test_split_file)), + str("--input_dir " + params['input_dir']), + str("--output_dir " + str(ml_data_dir)), + str("--y_col_name " + str(params['y_col_name'])) + ] + else: + preprocess_run = ["bash", "execute_in_conda.sh", + params['model_environment'], + params['preprocess_python_script'], + "--train_split_file", str(train_split_file), + "--val_split_file", str(val_split_file), + "--test_split_file", str(test_split_file), + "--input_dir", params['input_dir'], + "--output_dir", str(ml_data_dir), + "--y_col_name", str(params['y_col_name']) + ] + + result = subprocess.run(preprocess_run, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + + # Logger + print(f"returncode = {result.returncode}") + result_file_name_stdout = ml_data_dir / 'logs.txt' + with open(result_file_name_stdout, 'w') as file: + file.write(result.stdout) + + # Timer + time_diff = time.time() - start + hours = int(time_diff // 3600) + minutes = int((time_diff % 3600) // 60) + seconds = time_diff % 60 + time_diff_dict = {'hours': hours, + 'minutes': minutes, + 'seconds': seconds} + dir_to_save = ml_data_dir + filename = 'runtime.json' + with open(Path(dir_to_save) / filename, 'w') as json_file: + json.dump(time_diff_dict, json_file, indent=4) + + return {'source_data_name': source_data_name, 'split': split} + + +############################### +####### CSA PARAMETERS ######## +############################### + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +fdir = Path(__file__).resolve().parent +y_col_name = params['y_col_name'] +logger = logging.getLogger(f"{params['model_name']}") + +#Output directories for preprocess, train and infer +params['ml_data_dir'] = Path(params['output_dir']) / 'ml_data' + +#Model scripts +params['preprocess_python_script'] = f"{params['model_name']}_preprocess_improve.py" + +########################################################################## +##################### START PARSL PARALLEL 
EXECUTION ##################### +########################################################################## + +##Preprocess execution with Parsl +preprocess_futures = [] +for source_data_name in params['source_datasets']: + for split in params['split']: + preprocess_futures.append( + preprocess(inputs=[params, source_data_name, split]) + ) + +for future_p in preprocess_futures: + print(future_p.result()) + +parsl.dfk().cleanup() \ No newline at end of file