diff --git a/hcpd_main.py b/hcpd_main.py
new file mode 100644
index 0000000..a0b75c5
--- /dev/null
+++ b/hcpd_main.py
@@ -0,0 +1,361 @@
+"""
+Rename the HCP-D data downloaded from NDA S3 into a BIDS layout.
+
+No datalad Python API is used in this script.
+"""
+import os
+import glob
+import shutil
+import csv
+import subprocess
+
+import json
+import sys
+
+
+def move_glob_files(glob_pattern, dest, cur_dir):
+    """Move files matching a glob pattern to a new directory in bulk."""
+    glob_pattern = cur_dir + glob_pattern
+    dest = cur_dir + dest
+    for matches in glob.glob(glob_pattern):
+        try:
+            filename = os.path.basename(matches)
+            rename_dest = os.path.join(dest, filename)
+            os.rename(matches, rename_dest)
+            # dl.save(message=f'Move {filename} to {dest}')
+        # suppress the "File exists" errors, which cannot be avoided
+        except OSError as e:
+            # Errno 17 is "File exists" and is inevitable during the re-organization
+            if e.errno != 17:
+                print("Error:", e)
+    # dl.save(message=f'Moving files to {dest}')
+
+
+def rename_list_of_files(pattern, replacement, cur_dir):
+    """Rename files matching a substring pattern in bulk."""
+    file_list = os.listdir(cur_dir)
+    for ii in file_list:
+        newName = ii.replace(pattern, replacement)
+        if newName != ii:
+            source = os.path.join(cur_dir, ii)
+            dest = os.path.join(cur_dir, newName)
+            os.rename(source, dest)
+            # dl.save(dest)
+            # dl.save(message=f'Renaming {ii} to {newName}')
+
+
+def remove_glob_files(glob_pattern, cur_dir):
+    """Delete directories matching a glob pattern in bulk."""
+    fileList = glob.glob(cur_dir + glob_pattern)
+    for filePath in fileList:
+        try:
+            shutil.rmtree(os.path.join(cur_dir, filePath))
+        except OSError:
+            print("Error while deleting:", filePath)
+    # dl.save(message=f'Deleting files with pattern {glob_pattern}')
+
+
+def main():
+    """Entry point; run from the directory containing the HCD*_V1_MR folders."""
+    subid = sys.argv[1]
+    fileList = glob.glob(str(subid))
+
+    # HCD<ID>_V1_MR -> sub-<ID>
+    for ii in fileList:
+        newName = ii.replace('HCD', 'sub-').replace('_V1_MR', '')
+        if newName != ii:
+            os.rename(ii, newName)
+    subjects = glob.glob("./sub*")
+    # list of new directories to be created
+    dir_names = ['ses-V1/anat', 'ses-V1/func', 'ses-V1/dwi', 'ses-V1/fmap',
+                 'ses-V1/S1', 'ses-V1/S2', 'ses-V1/S3', 'ses-V1/S4',
+                 'ses-V1/S5', 'ses-V1/S6', 'ses-V1/S7']
+    for sub in subjects:
+        os.chdir(sub)
+        cur_dir = os.getcwd()
+        try:
+            os.rename('unprocessed', 'ses-V1')
+        except OSError:
+            # 'unprocessed' does not exist; nothing to rename
+            pass
+
+        for folder in dir_names:
+            try:
+                if not os.path.isdir(os.path.join(cur_dir, folder)):
+                    os.mkdir(os.path.join(cur_dir, folder))
+                    # dl.save(os.path.join(cur_dir, folder), message=f"creating new folder {folder}")
+            except OSError:
+                print("Error while creating directory:", folder)
+
+        # collect the SpinEchoFieldMaps next to the scans they were acquired with
+        move_glob_files('/ses-V1/Diffusion/*/*SpinEchoFieldMap*', '/ses-V1/Diffusion/', cur_dir)
+        move_glob_files('/ses-V1/T2w_SPC_vNav/*/*SpinEchoFieldMap*', '/ses-V1/T2w_SPC_vNav/', cur_dir)
+        move_glob_files('/ses-V1/T1w_MPR_vNav_4e_e1e2_mean/*/*SpinEchoFieldMap*', '/ses-V1/T1w_MPR_vNav_4e_e1e2_mean/', cur_dir)
+
+        os.chdir('ses-V1/')
+
+        rest_files = glob.glob('*/*REST*')
"dir-AP_sbref.nii.gz") + .replace("AP_SBRef.json", "dir-AP_sbref.json") + .replace("PA_SBRef.nii.gz", "dir-PA_sbref.nii.gz") + .replace("PA_SBRef.json", "dir-PA_sbref.json") + .replace("_rfMRI_", "_task-rest_") + .replace("rest_REST", "rest_acq-REST")) + os.rename(rest_file,newName) + + # rename sbref nii.gz files by the number of run of the same task + counter_sbref_nii=1 + cur_dir = os.getcwd() + for sbref_file in glob.glob('*/*REST*_sbref.nii.gz'): + if 'run' in sbref_file: + continue + + newName = sbref_file.replace("_sbref",f"_run-0{counter_sbref_nii}_sbref") + if newName != sbref_file: + source = os.path.join(cur_dir,sbref_file) + dest = os.path.join(cur_dir,newName) + os.rename(source,dest) + + + counter_sbref_nii += 1 + + # rename sbref json files by the number of run of the same task + counter_sbref_json=1 + for sbref_json in glob.glob('*/*REST*_sbref.json' ): + if 'run' in sbref_json: + continue + newName = sbref_json.replace("_sbref",f"_run-0{counter_sbref_json}_sbref") + if newName != sbref_json: + source = os.path.join(cur_dir,sbref_json) + dest = os.path.join(cur_dir,newName) + os.rename(source,dest) + counter_sbref_json += 1 + + # rename bold nii.gz files by the number of run of the same task + counter_bold_nii=1 + for bold_file in glob.glob('*/*REST*_bold.nii.gz' ): + if 'run' in bold_file: + continue + newName = bold_file.replace("_bold",f"_run-0{counter_bold_nii}_bold") + if newName != bold_file: + source = os.path.join(cur_dir,bold_file) + dest = os.path.join(cur_dir,newName) + os.rename(source,dest) + counter_bold_nii += 1 + + # rename bold json files by the number of run of the same task + counter_bold_json=1 + for bold_json in glob.glob('*/*REST*_bold.json' ): + if 'run' in bold_json: + continue + newName = bold_json.replace("_bold",f"_run-0{counter_bold_json}_bold") + if newName != bold_json: + source = os.path.join(cur_dir,bold_json) + dest = os.path.join(cur_dir, newName) + os.rename(source,dest) + counter_bold_json += 1 + + directories_in_curdir = list(filter(os.path.isdir, os.listdir(os.curdir))) + # loop all subject folders + for sub_dir in directories_in_curdir: + os.chdir(sub_dir) #into each existing folder + + cur_dir = os.getcwd() + + source_list =["HCD", + "V1_MR", + "_SpinEchoFieldMap1_AP", + "_SpinEchoFieldMap1_PA", + "_SpinEchoFieldMap2_AP", + "_SpinEchoFieldMap2_PA", + "_SpinEchoFieldMap3_AP", + "_SpinEchoFieldMap3_PA", + "_SpinEchoFieldMap4_AP", + "_SpinEchoFieldMap4_PA", + "_SpinEchoFieldMap5_AP", + "_SpinEchoFieldMap5_PA", + "_SpinEchoFieldMap6_AP", + "_SpinEchoFieldMap6_PA", + "_SpinEchoFieldMap7_AP", + "_SpinEchoFieldMap7_PA", + "tfMRI_CARIT_AP_SBRef", + "tfMRI_CARIT_AP", + "tfMRI_CARIT_PA_SBRef", + "tfMRI_CARIT_PA", + "tfMRI_EMOTION_AP_SBRef", + "tfMRI_EMOTION_AP", + "tfMRI_EMOTION_PA_SBRef", + "tfMRI_EMOTION_PA", + "tfMRI_GUESSING_AP_SBRef", + "tfMRI_GUESSING_AP", + "tfMRI_GUESSING_PA_SBRef", + "tfMRI_GUESSING_PA", + "dMRI_dir98_AP_SBRef", + "dMRI_dir98_AP", + "dMRI_dir98_PA_SBRef", + "dMRI_dir98_PA", + "dMRI_dir99_AP_SBRef", + "dMRI_dir99_AP", + "dMRI_dir99_PA_SBRef", + "dMRI_dir99_PA", + "T1w_MPR_vNav_4e_e1e2_mean", + "T2w_SPC_vNav"] + dest_list = ["sub-", + "ses-V1", + "_dir-AP_run-01_epi", + "_dir-PA_run-01_epi", + "_dir-AP_run-02_epi", + "_dir-PA_run-02_epi", + "_dir-AP_run-03_epi", + "_dir-PA_run-03_epi", + "_dir-AP_run-04_epi", + "_dir-PA_run-04_epi", + "_dir-AP_run-05_epi", + "_dir-PA_run-05_epi", + "_dir-AP_run-06_epi", + "_dir-PA_run-06_epi", + "_dir-AP_run-07_epi", + "_dir-PA_run-07_epi", + "task-carit_dir-AP_run-01_sbref", + 
"task-carit_dir-AP_run-01_bold", + "task-carit_dir-PA_run-02_sbref", + "task-carit_dir-PA_run-02_bold", + "task-emotion_dir-AP_run-01_sbref", + "task-emotion_dir-AP_run-01_bold", + "task-emotion_dir-PA_run-02_sbref", + "task-emotion_dir-PA_run-02_bold", + "task-guessing_dir-AP_run-01_sbref", + "task-guessing_dir-AP_run-01_bold", + "task-guessing_dir-PA_run-02_sbref", + "task-guessing_dir-PA_run-02_bold", + "acq-dir98_dir-AP_run-01_sbref", + "acq-dir98_dir-AP_run-01_dwi", + "acq-dir98_dir-PA_run-02_sbref", + "acq-dir98_dir-PA_run-02_dwi", + "acq-dir99_dir-AP_run-03_sbref", + "acq-dir99_dir-AP_run-03_dwi", + "acq-dir99_dir-PA_run-04_sbref", + "acq-dir99_dir-PA_run-04_dwi", + "T1w", + "T2w"] + for index, item in enumerate(source_list): + rename_list_of_files(source_list[index], dest_list[index], cur_dir) + if glob.glob('*run-01_epi*'): + move_glob_files('/*','/../S1',cur_dir ) + if glob.glob('*run-02_epi*'): + move_glob_files('/*','/../S2',cur_dir) + if glob.glob('*run-03_epi*'): + move_glob_files('/*','/../S3',cur_dir) + if glob.glob('*run-04_epi*'): + move_glob_files('/*','/../S4',cur_dir) + if glob.glob('*run-05_epi*'): + move_glob_files('/*','/../S5',cur_dir) + if glob.glob('*run-06_epi*'): + move_glob_files('/*','/../S6',cur_dir) + if glob.glob('*run-07_epi*'): + move_glob_files('/*','/../S7',cur_dir) + os.chdir('..') #out of ses-V1 + + # add IntendedFor fields for EPI fieldmap jsons + fmap_poss = glob.glob('*/*epi.json') + folders = glob.glob('*/') + cur_dir = os.getcwd() + folders = [ os.path.join(cur_dir, ls) for ls in folders] # using list comprehension + fmap_poss = [ os.path.join(cur_dir, ls) for ls in fmap_poss] + for b in range (0, len(fmap_poss)): + intended_for = list() + for m in range (0, len(folders)): + folders[m] = os.path.join(cur_dir, folders[m]) + os.chdir(folders[m]) + folder_contents = glob.glob('*') + intended_subset = list(set(glob.glob('*.nii.gz')) - set(glob.glob('*epi.nii.gz'))) + basename = os.path.basename(fmap_poss[b]) + if basename in folder_contents: + intended_for.append(intended_subset) + else: + pass + os.chdir('..') + + + with open(fmap_poss[b]) as json_file: + data = json.load(json_file) + + [new_list] = intended_for + if intended_for == list(): + data['IntendedFor'] = str() + else: + data['IntendedFor'] = new_list + with open(fmap_poss[b], 'w') as json_file: + json.dump(data, json_file) + + os.chdir('..') + cur_dir = os.getcwd() + move_glob_files('/ses-V1/*/*epi*','/ses-V1/fmap',cur_dir ) + move_glob_files('/ses-V1/*/*task-*','/ses-V1/func',cur_dir) + move_glob_files('/ses-V1/*/*acq-dir*','/ses-V1/dwi',cur_dir) + move_glob_files('/ses-V1/*/*T1w*','/ses-V1/anat',cur_dir) + move_glob_files('/ses-V1/*/*T2w*','/ses-V1/anat',cur_dir) + #remove excess files / folders + remove_glob_files('/ses-V1/*fMRI*/',cur_dir) + remove_glob_files('/ses-V1/*vNav*/',cur_dir) + remove_glob_files('/ses-V1/*PCAS*/',cur_dir) + if os.path.isdir('./ses-V1/Diffusion'): + shutil.rmtree(os.path.join(cur_dir, 'ses-V1/Diffusion')) + if os.path.isdir('ses-V1/S1'): + shutil.rmtree(os.path.join(cur_dir,'ses-V1/S1')) + if os.path.isdir('ses-V1/S2'): + shutil.rmtree(os.path.join(cur_dir,'ses-V1/S2')) + if os.path.isdir('./ses-V1/S3'): + shutil.rmtree(os.path.join(cur_dir,'ses-V1/S3')) + if os.path.isdir('./ses-V1/S4'): + shutil.rmtree(os.path.join(cur_dir,'ses-V1/S4')) + if os.path.isdir('./ses-V1/S5'): + shutil.rmtree(os.path.join(cur_dir,'ses-V1/S5')) + if os.path.isdir('./ses-V1/S6'): + shutil.rmtree(os.path.join(cur_dir,'ses-V1/S6')) + if os.path.isdir('./ses-V1/S7'): + 
+        os.chdir('..')  # out of the subject directory
+
+    # create participants.tsv file
+    subjects = glob.glob('sub-*')
+
+    # create problem_fmapjsons.txt
+    fmap_json = glob.glob('sub*/*/fmap/*epi.json')
+    t1w = '_T1w'
+    t2w = '_T2w'
+    dir99 = 'dir99'
+    dir98 = 'dir98'
+
+    # prefix ses-V1/dwi, ses-V1/anat or ses-V1/func onto each IntendedFor entry
+    for i in range(0, len(fmap_json)):
+        with open(fmap_json[i]) as json_file:
+            data = json.load(json_file)
+        if 'IntendedFor' in data:
+            for j in range(0, len(data['IntendedFor'])):
+                if dir99 in data['IntendedFor'][j]:
+                    data['IntendedFor'][j] = 'ses-V1/dwi/' + str(data['IntendedFor'][j])
+                elif dir98 in data['IntendedFor'][j]:
+                    data['IntendedFor'][j] = 'ses-V1/dwi/' + str(data['IntendedFor'][j])
+                elif t1w in data['IntendedFor'][j]:
+                    data['IntendedFor'][j] = 'ses-V1/anat/' + str(data['IntendedFor'][j])
+                elif t2w in data['IntendedFor'][j]:
+                    data['IntendedFor'][j] = 'ses-V1/anat/' + str(data['IntendedFor'][j])
+                else:
+                    data['IntendedFor'][j] = 'ses-V1/func/' + str(data['IntendedFor'][j])
+
+        with open(fmap_json[i], 'w') as json_file:
+            json.dump(data, json_file)
+
+
+if __name__ == '__main__':
+    main()
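+
+# A minimal usage sketch (subject ID illustrative). Run from the directory
+# holding the downloaded HCD*_V1_MR folders, one subject at a time:
+#
+#     python hcpd_main.py HCD0000000_V1_MR
+#
+# which should leave a BIDS-style tree such as
+#     sub-0000000/ses-V1/{anat,dwi,fmap,func}/...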
diff --git a/scripts/cubic/bootstrap-mriqc-unzip-full-outputs.sh b/scripts/cubic/bootstrap-mriqc-unzip-full-outputs.sh
new file mode 100644
index 0000000..579a056
--- /dev/null
+++ b/scripts/cubic/bootstrap-mriqc-unzip-full-outputs.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+PROJECTROOT=/cbica/projects/RBC/production/PNC/mriqc
+cd ${PROJECTROOT}
+RIA=${PROJECTROOT}/output_ria
+datalad create -c yoda -D "extract mriqc results" unzipped-results-duplicate
+cd unzipped-results-duplicate
+datalad clone -d . --reckless ephemeral "ria+file://${RIA}#~data" inputs/data
+datalad clone -d . ../pennlinc-containers
+
+## the actual compute job specification
+cat > code/get_files.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+
+ZIP_FILE=$1
+
+#unzip -j $ZIP_FILE "mriqc/sub-4238772277/ses-PNC1/anat/sub-4238772277_ses-PNC1_acq-refaced_T1w.json" -d "mriqc_results"
+
+#subid=$(basename "${ZIP_FILE%.*}")
+#subid=${subid%_*}
+#echo $subid
+
+# Create a mriqc/ directory
+unzip -o $ZIP_FILE -x 'mriqc/*.html'
+
+EOT
+
+cat > code/mriqc-group.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+datalad get ${PWD}/pennlinc-containers/.datalad/environments/mriqc-0-16-1/image
+# create group reports for the anatomical T1w data
+singularity exec --cleanenv -B ${PWD} \
+    pennlinc-containers/.datalad/environments/mriqc-0-16-1/image \
+    python code/group_results.py
+
+EOT
+
+cat > code/group_results.py << "EOT"
+
+from pathlib import Path
+from mriqc.reports import group_html
+from mriqc.utils.bids import DEFAULT_TYPES
+from mriqc.utils.misc import generate_tsv
+
+if __name__ == '__main__':
+    output_dir = Path(".") / "mriqc"
+    # Generate reports
+    mod_group_reports = []
+    for mod in DEFAULT_TYPES:
+        dataframe, out_tsv = generate_tsv(output_dir, mod)
+        # If there are no iqm.json files, nothing to do.
+        if dataframe is None:
+            continue
+
+        print(f"Generated summary TSV table for the {mod} data ({out_tsv})")
+
+        # out_pred = generate_pred(derivatives_dir, settings['output_dir'], mod)
+        # if out_pred is not None:
+        #     log.info('Predicted QA CSV table for the %s data generated (%s)',
+        #              mod, out_pred)
+
+        out_html = output_dir / f"group_{mod}.html"
+        group_html(
+            out_tsv,
+            mod,
+            csv_failed=output_dir / f"group_variant-failed_{mod}.csv",
+            out_file=out_html,
+        )
+
+        print(f"Group-{mod} report generated ({out_html})")
+        mod_group_reports.append(mod)
+
+    if not mod_group_reports:
+        raise Exception("No data found. No group level reports were generated.")
+
+    print("Group level finished successfully.")
+EOT
+
+datalad save -m "Add data extraction code" code
+
+zip_files=$(find inputs/data/ -name '*.zip')
+for input_zip in ${zip_files}
+do
+    subid=$(basename "${input_zip%.*}")
+    subid=${subid%_*}
+    outdir=.
+
+    datalad run \
+        -i pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \
+        -i ${input_zip} \
+        -o ${outdir}/${subid} \
+        --explicit \
+        "bash code/get_files.sh ${input_zip}"
+done
+
+# CRITICAL: Don't uninstall the inputs - it will delete your data
+rm -rf inputs
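+
+# For reference, the two parameter expansions in the loop above turn a zip name
+# into a subject ID, e.g. (name illustrative):
+#   input_zip=inputs/data/sub-0000000_mriqc-0.16.1.zip
+#   basename "${input_zip%.*}"   ->  sub-0000000_mriqc-0.16.1
+#   ${subid%_*}                  ->  sub-0000000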
diff --git a/scripts/cubic/bootstrap-mriqc.sh b/scripts/cubic/bootstrap-mriqc.sh
new file mode 100644
index 0000000..ab450a9
--- /dev/null
+++ b/scripts/cubic/bootstrap-mriqc.sh
@@ -0,0 +1,282 @@
+## NOTE ##
+# This workflow is derived from the Datalad Handbook
+
+## Ensure the environment is ready to bootstrap the analysis workspace
+# Check that we have conda installed
+#conda activate
+#if [ $? -gt 0 ]; then
+#    echo "Error initializing conda. Exiting"
+#    exit $?
+#fi
+
+DATALAD_VERSION=$(datalad --version)
+
+if [ $? -gt 0 ]; then
+    echo "No datalad available in your conda environment."
+    echo "Try pip install datalad"
+    # exit 1
+fi
+
+echo USING DATALAD VERSION ${DATALAD_VERSION}
+
+set -e -u
+
+
+## Set up the directory that will contain the necessary directories
+PROJECTROOT=${PWD}/mriqc
+if [[ -d ${PROJECTROOT} ]]
+then
+    echo ${PROJECTROOT} already exists
+    # exit 1
+fi
+
+if [[ ! -w $(dirname ${PROJECTROOT}) ]]
+then
+    echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry
+    # exit 1
+fi
+
+
+## Check the BIDS input
+BIDSINPUT=$1
+if [[ -z ${BIDSINPUT} ]]
+then
+    echo "Required argument is an identifier of the BIDS source"
+    # exit 1
+fi
+
+# Is it a directory on the filesystem?
+BIDS_INPUT_METHOD=clone
+if [[ -d "${BIDSINPUT}" ]]
+then
+    # Check if it's datalad
+    BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \
+        dataset -d ${BIDSINPUT} 2> /dev/null || true)
+    [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy
+fi
+
+
+## Start making things
+mkdir -p ${PROJECTROOT}
+cd ${PROJECTROOT}
+
+# Jobs are set up to not require a shared filesystem (except for the lockfile)
+# ------------------------------------------------------------------------------
+# RIA-URL to a different RIA store from which the dataset will be cloned from.
+# Both RIA stores will be created
+input_store="ria+file://${PROJECTROOT}/input_ria"
+output_store="ria+file://${PROJECTROOT}/output_ria"
+
+# Create a source dataset with all analysis components as an analysis access
+# point.
+datalad create -c yoda analysis
+cd analysis
+
+# create dedicated input and output locations. Results will be pushed into the
+# output sibling and the analysis will start with a clone from the input sibling.
+datalad create-sibling-ria -s output "${output_store}"
+pushremote=$(git remote get-url --push output)
+datalad create-sibling-ria -s input --storage-sibling off "${input_store}"
+
+# register the input dataset
+if [[ "${BIDS_INPUT_METHOD}" == "clone" ]]
+then
+    echo "Cloning input dataset into analysis dataset"
+    datalad clone -d . ${BIDSINPUT} inputs/data
+    # amend the previous commit with a nicer commit message
+    git commit --amend -m 'Register input data dataset as a subdataset'
+else
+    echo "WARNING: copying input data into repository"
+    mkdir -p inputs/data
+    cp -r ${BIDSINPUT}/* inputs/data
+    datalad save -r -m "added input data"
+fi
+
+SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3)
+if [ -z "${SUBJECTS}" ]
+then
+    echo "No subjects found in input data"
+    # exit 1
+fi
+
+set +u
+CONTAINERDS=$2
+set -u
+#if [[ ! -z "${CONTAINERDS}" ]]; then
+cd ${PROJECTROOT}
+datalad clone ${CONTAINERDS} pennlinc-containers
+## Add the containers as a subdataset
+#datalad clone ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers pennlinc-containers
+# download the image so we don't ddos pmacs
+cd pennlinc-containers
+datalad get -r .
+# get rid of the references to pmacs
+set +e
+datalad siblings remove -s pmacs-ria-storage
+datalad siblings remove -s origin
+set -e
+
+cd ${PROJECTROOT}/analysis
+datalad install -d . --source ${PROJECTROOT}/pennlinc-containers
+
+## the actual compute job specification
+cat > code/participant_job.sh << "EOT"
+#!/bin/bash
+#$ -S /bin/bash
+#$ -l h_vmem=15G
+#$ -l tmpfree=100G
+# Set up the correct conda environment
+source ${CONDA_PREFIX}/bin/activate base
+echo I\'m in $PWD using `which python`
+# fail whenever something is fishy, use -x to get verbose logfiles
+set -e -u -x
+# Set up the remotes and get the subject id from the call
+dssource="$1"
+pushgitremote="$2"
+subid="$3"
+# change into the cluster-assigned temp directory. Not done by default in SGE
+cd ${CBICA_TMPDIR}
+# OR run it on a shared network drive
+# cd /cbica/comp_space/$(basename $HOME)
+# Used for the branch names and the temp dir
+BRANCH="job-${JOB_ID}-${subid}"
+mkdir ${BRANCH}
+cd ${BRANCH}
+# get the analysis dataset, which includes the inputs as well
+# importantly, we do not clone from the location that we want to push the
+# results to, in order to avoid too many jobs blocking access to
+# the same location and creating a throughput bottleneck
+datalad clone "${dssource}" ds
+# all following actions are performed in the context of the superdataset
+cd ds
+# in order to avoid accumulating temporary git-annex availability information
+# and to avoid a synchronization bottleneck by having to consolidate the
+# git-annex branch across jobs, we will only push the main tracking branch
+# back to the output store (plus the actual file content). Final availability
+# information can be established via an eventual `git annex fsck -f joc-storage`.
+# This remote is never fetched; it accumulates a large number of branches,
+# and we want to avoid progressive slowdown. Instead we only ever push
+# a unique branch per job (subject AND process specific name)
+git remote add outputstore "$pushgitremote"
+# all results of this job will be put into a dedicated branch
+git checkout -b "${BRANCH}"
+# we pull down the input subject manually in order to discover relevant
+# files. We do this outside the recorded call, because on a potential
+# re-run we want to be able to do fine-grained recomputing of individual
+# outputs. The recorded calls will have specific paths that will enable
+# recomputation outside the scope of the original setup
+datalad get -n "inputs/data/${subid}"
+# Remove all subjects we're not working on
+(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`)
+# ------------------------------------------------------------------------------
+# Do the run!
+datalad run \
+    -i code/mriqc_zip.sh \
+    -i inputs/data/${subid} \
+    -i inputs/data/*json \
+    -i pennlinc-containers/.datalad/environments/mriqc-0-16-1/image \
+    --explicit \
+    -o ${subid}_mriqc-0.16.1.zip \
+    -m "mriqc:0.16.1 ${subid}" \
+    "bash ./code/mriqc_zip.sh ${subid}"
+# file content first -- does not need a lock, no interaction with Git
+datalad push --to output-storage
+# and the output branch
+flock $DSLOCKFILE git push outputstore
+echo TMPDIR TO DELETE
+echo ${BRANCH}
+datalad uninstall --nocheck --if-dirty ignore -r inputs/data
+datalad drop -r . --nocheck
+git annex dead here
+cd ../..
+rm -rf $BRANCH
+echo SUCCESS
+# job handler should clean up workspace
+EOT
+
+chmod +x code/participant_job.sh
+
+cat > code/mriqc_zip.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+subid="$1"
+mkdir -p ${PWD}/.git/tmp/wkdir
+singularity run --cleanenv -B ${PWD} \
+    pennlinc-containers/.datalad/environments/mriqc-0-16-1/image \
+    inputs/data \
+    qc/mriqc \
+    participant \
+    -w ${PWD}/.git/tmp/wkdir \
+    --n_cpus $NSLOTS \
+    --ants-nthreads 2 \
+    --float32 \
+    -m T1w \
+    --participant-label "$subid" \
+    --verbose-reports --no-sub -v -v
+
+cd qc
+7z a ../${subid}_mriqc-0.16.1.zip mriqc
+cd ..
+
+rm -rf qc .git/tmp/wkdir
+EOT
+
+chmod +x code/mriqc_zip.sh
+#cp ${FREESURFER_HOME}/license.txt code/license.txt
+
+mkdir logs
+echo .SGE_datalad_lock >> .gitignore
+echo logs >> .gitignore
+
+datalad save -m "Participant compute job implementation"
+
+# Add a script for merging outputs
+MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh
+cat > code/merge_outputs.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+EOT
+echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \
+    >> code/merge_outputs.sh
+echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh
+wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh
+
+
+################################################################################
+# SGE SETUP START - remove or adjust to your needs
+################################################################################
+env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock"
+echo '#!/bin/bash' > code/qsub_calls.sh
+dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)"
+pushgitremote=$(git remote get-url --push output)
+eo_args="-e ${PWD}/logs -o ${PWD}/logs"
+for subject in ${SUBJECTS}; do
+    echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \
+        ${PWD}/code/participant_job.sh \
+        ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh
+done
+datalad save -m "SGE submission setup" code/ .gitignore
+
+################################################################################
+# SGE SETUP END
+################################################################################
+
+# cleanup - we have generated the job definitions, we do not need to keep a
+# massive input dataset around. Having it around wastes resources and makes many
+# git operations needlessly slow
+if [ "${BIDS_INPUT_METHOD}" = "clone" ]
+then
+    datalad uninstall -r --nocheck inputs/data
+fi
+
+# make sure the fully configured output dataset is available from the designated
+# store for initial cloning and pushing the results.
+datalad push --to input
+datalad push --to output
+
+# Add an alias to the data in the RIA store
+RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1)
+mkdir -p ${PROJECTROOT}/output_ria/alias
+ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data
+
+# if we get here, we are happy
+echo SUCCESS
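+
+# A minimal usage sketch (paths illustrative): point the bootstrap at a BIDS
+# datalad dataset and a clone-able container dataset, then submit the generated
+# job calls:
+#   bash bootstrap-mriqc.sh /path/to/bids ria+ssh://example.host/containers#~pennlinc-containers
+#   bash mriqc/analysis/code/qsub_calls.sh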
diff --git a/scripts/cubic/bootstrap_hcpd_bids.sh b/scripts/cubic/bootstrap_hcpd_bids.sh
new file mode 100644
index 0000000..1905e84
--- /dev/null
+++ b/scripts/cubic/bootstrap_hcpd_bids.sh
@@ -0,0 +1,262 @@
+DATALAD_VERSION=$(datalad --version)
+
+if [ $? -gt 0 ]; then
+    echo "No datalad available in your conda environment."
+    echo "Try pip install datalad"
+    # exit 1
+fi
+
+echo USING DATALAD VERSION ${DATALAD_VERSION}
+
+set -e -u
+
+
+## Set up the directory that will contain the necessary directories
+PROJECTROOT=${PWD}/bootstrap_hcpd_bids
+if [[ -d ${PROJECTROOT} ]]
+then
+    echo ${PROJECTROOT} already exists
+    # exit 1
+fi
+
+if [[ ! -w $(dirname ${PROJECTROOT}) ]]
+then
+    echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry
+    # exit 1
+fi
+
+SUBJECTIDCSV=$1
+HCPDCSV=$2
+if [[ -z ${SUBJECTIDCSV} ]]
+then
+    echo "Required argument is an identifier of the HCPD csv source"
+    # exit 1
+fi
+
+## Start making things
+mkdir -p ${PROJECTROOT}
+cd ${PROJECTROOT}
+
+# Jobs are set up to not require a shared filesystem (except for the lockfile)
+# ------------------------------------------------------------------------------
+# RIA-URL to a different RIA store from which the dataset will be cloned from.
+# Both RIA stores will be created
+input_store="ria+file://${PROJECTROOT}/input_ria"
+output_store="ria+file://${PROJECTROOT}/output_ria"
+
+# Create a source dataset with all analysis components as an analysis access
+# point.
+datalad create -c yoda analysis
+cd analysis
+
+# create dedicated input and output locations. Results will be pushed into the
+# output sibling and the analysis will start with a clone from the input sibling.
+datalad create-sibling-ria -s output "${output_store}"
+pushremote=$(git remote get-url --push output)
+datalad create-sibling-ria -s input --storage-sibling off "${input_store}"
+
+# read the list of subject IDs (first column of the csv)
+SUBJECTS=$(cut -d, -f1 ${SUBJECTIDCSV})
+
+git annex initremote datalad type=external externaltype=datalad encryption=none
+
+cat > code/participant_csv.py << "EOT"
+#!/usr/bin/env python
+"""
+USAGE:
+
+    python participant_csv.py subid hcpdcsv
+
+Run this inside of participant_job.sh.
+
+Creates the csv for one single participant.
+"""
+import pandas as pd
+import sys
+
+hcpdcsv = sys.argv[2]
+df = pd.read_csv(hcpdcsv)
+
+# the HCD* prefix identifying this participant
+prefix = sys.argv[1]
+
+df2 = df[df.filename.str.startswith(prefix)]
+
+df3 = df2.drop_duplicates(subset="filename", keep='first', ignore_index=True)
+
+df3.to_csv(f"{prefix}.csv", index=False)
+EOT
+
+chmod +x code/participant_csv.py
+
+datalad save -m "Participant csv implementation"
+
+cat > code/participant_job.sh << "EOT"
+#!/bin/bash
+#$ -S /bin/bash
+#$ -l h_vmem=25G
+#$ -l tmpfree=200G
+#$ -R y
+#$ -l h_rt=24:00:00
+# Set up the correct conda environment
+source ${CONDA_PREFIX}/bin/activate base
+echo I\'m in $PWD using `which python`
+
+# fail whenever something is fishy, use -x to get verbose logfiles
+set -e -u -x
+
+dssource="$1"
+pushgitremote="$2"
+subid="$3"
+hcpdcsv="$4"
+
+rename_subid="sub-${subid:3:7}"
+
+cd ${CBICA_TMPDIR}
+BRANCH="job-${JOB_ID}-${subid}"
+mkdir ${BRANCH}
+cd ${BRANCH}
+datalad clone "${dssource}" ds
+cd ds
+
+# create the csv for this one subject
+python code/participant_csv.py ${subid} ${hcpdcsv}
+
+SUBJECTCSV="${subid}.csv"
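+# ${SUBJECTCSV} is assumed to carry (at least) a 'filename' column and an
+# 'associated_file' column holding the download URL for each file; those are
+# the two fields consumed by the datalad addurls call below. Sketch of a row
+# (values illustrative):
+#   filename,associated_file
+#   HCD0000000_V1_MR/unprocessed/file.nii.gz,s3://nda-bucket/path/file.nii.gz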
+git annex enableremote datalad type=external externaltype=datalad encryption=none
+datalad addurls -d . ${SUBJECTCSV} '{associated_file}' '{filename}'
+rm ${SUBJECTCSV}
+
+git remote add outputstore "$pushgitremote"
+
+git checkout -b "${BRANCH}"
+
+# ------------------------------------------------------------------------------
+# Do the run! Pull the raw subject directory and rename it into BIDS
+datalad run \
+    -i ${subid}_V1_MR \
+    --explicit \
+    -o ${subid}_V1_MR \
+    -o ${rename_subid} \
+    -m "rename for ${subid}" \
+    "python /cbica/projects/RBC/mengjia_space/hcpd_main.py ${subid}_V1_MR"
+
+datalad save -m "Record the deletion of raw non-BIDS directories"
+
+# file content first -- does not need a lock, no interaction with Git
+datalad push --to output-storage ${rename_subid}
+# and the output branch
+flock $DSLOCKFILE git push outputstore
+
+echo TMPDIR TO DELETE
+echo ${BRANCH}
+
+datalad drop -r . --nocheck
+git annex dead here
+cd ../..
+
+chmod +w -R $BRANCH
+rm -rf $BRANCH
+
+echo SUCCESS
+EOT
+
+chmod +x code/participant_job.sh
+
+mkdir logs
+echo .SGE_datalad_lock >> .gitignore
+echo logs >> .gitignore
+
+datalad save -m "Participant compute job implementation"
+
+# Add a script for merging outputs
+MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh
+cat > code/merge_outputs.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+EOT
+echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \
+    >> code/merge_outputs.sh
+echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh
+wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh
+
+
+################################################################################
+# SGE SETUP START - remove or adjust to your needs
+################################################################################
+env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock"
+echo '#!/bin/bash' > code/qsub_calls.sh
+dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)"
+#dssource="~/mengjia_space/hcpd_single_subject.csv"
+pushgitremote=$(git remote get-url --push output)
+eo_args="-e ${PWD}/logs -o ${PWD}/logs"
+
+for subject in ${SUBJECTS}; do
+    echo "qsub -cwd ${env_flags} -N ${subject} ${eo_args} \
+        ${PWD}/code/participant_job.sh \
+        ${dssource} ${pushgitremote} ${subject} ${HCPDCSV}" >> code/qsub_calls.sh
+done
+datalad save -m "SGE submission setup" code/ .gitignore
+
+################################################################################
+# SGE SETUP END
+################################################################################
+
+# cleanup - we have generated the job definitions, we do not need to keep a
+# massive input dataset around. Having it around wastes resources and makes many
+# git operations needlessly slow
+#if [ "${BIDS_INPUT_METHOD}" = "clone" ]
+#then
+#    datalad uninstall -r --nocheck inputs/data
+#fi
+
+# make sure the fully configured output dataset is available from the designated
+# store for initial cloning and pushing the results.
+datalad push --to input
+datalad push --to output
+
+# Add an alias to the data in the RIA store
+RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1)
+mkdir -p ${PROJECTROOT}/output_ria/alias
+ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data
+
+# if we get here, we are happy
+echo SUCCESS
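+
+# A minimal usage sketch (file names illustrative): bootstrap with a csv whose
+# first column lists the HCD subject IDs plus the full HCP-D S3 manifest csv,
+# then submit the generated calls:
+#   bash bootstrap_hcpd_bids.sh hcpd_subject_ids.csv hcpd_s3_manifest.csv
+#   bash bootstrap_hcpd_bids/analysis/code/qsub_calls.sh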