diff --git a/1_CRUK.sh b/1_CRUK.sh index f32b9e1..c8c4ef1 100644 --- a/1_CRUK.sh +++ b/1_CRUK.sh @@ -7,7 +7,8 @@ cd $PBS_O_WORKDIR #Description: BaseSpace app pipeline (Illumina paired-end). Not for use with other library preps/ experimental conditions. #Mode: BY_SAMPLE -version="1.1.1" + +version="1.1.3" # Directory structure required for pipeline # @@ -95,9 +96,7 @@ if [ $(find .. -maxdepth 1 -mindepth 1 -type d | wc -l | sed 's/^[[:space:]]*//g #soft link sample sheet ln -s /data/archive/fastq/"$seqId"/SampleSheet.csv .. #launch second pipeline script, move out one directory level, save working directory to a variable - cp 2_CRUK.sh .. && cp 3_CRUK.sh .. && cp /data/diagnostics/pipelines/"$pipelineName"/"$pipelineName"-"$pipelineVersion"/config.json .. \ - && cp /data/diagnostics/pipelines/"$pipelineName"/"$pipelineName"-"$pipelineVersion"/baseSpace.js .. - cd .. + cp 2_CRUK.sh .. && cp 3_CRUK.sh .. && cd .. wd=$PWD ssh transfer@cvx-gen01 "cd '$wd'; bash ./2_CRUK.sh './' '$negative' >./2_CRUK.out 2>./2_CRUK.err;" -fi \ No newline at end of file +fi diff --git a/2_CRUK.sh b/2_CRUK.sh index 1892f27..2f69b8c 100644 --- a/2_CRUK.sh +++ b/2_CRUK.sh @@ -1,238 +1,243 @@ #!/bin/bash set -euo pipefail -#Description: CRUK BaseSpace app pipeline -#Author: Sara Rey -#Status: RELEASE -Version="1.1.1" +# Description: CRUK BaseSpace app pipeline +# Author: Sara Rey and Chris Medway +# Status: RELEASE +Version="1.1.3" -# Aliases for local python VE -alias python='/home/transfer/basespace_vm/venv/bin/python' -PATH="$PATH":/home/transfer/basespace_vm/venv/bin/ +# location of basespace CLI v2 binary +BS=/home/transfer/bs # How to use # bash 2_CRUK.sh -# Variables +# variables CONFIG="pmg-euc1" -APPID="123123" -#load variables +# load variables . variables -# Usage checking +# usage checking if [ "$#" -lt 2 ] - then - echo "Commandline args incorrect. Usage: $0 ." - exit -1 + then + echo "Commandline args incorrect. Usage: $0 ." 
+ exit -1 fi -# Variables dependent on command line arguments +# variables dependent on command line arguments INPUTFOLDER="$1" NEGATIVE="$2" NOTBASESPACE="$INPUTFOLDER""not_bs.txt" FASTQFOLDER="$INPUTFOLDER""/*/trimmed/" -#Check if the sample sheet indicates that a manual pairs file should be created +# check if the sample sheet indicates that a manual pairs file should be created if [ $pairs == 0 ] - then - SAMPLEPAIRS="$INPUTFOLDER""SamplePairs.txt" - makePairs=1 +then + SAMPLEPAIRS="$INPUTFOLDER""SamplePairs.txt" + makePairs=1 elif [ $pairs == 1 ] && [ "$#" -lt 3 ] - then - echo "SamplePairs file requires manual generation. Create in script directory and relaunch" \ - "2_CRUK.sh passing pairs file as the third command line argument." - exit 1 + then + echo "SamplePairs file requires manual generation. Create in script directory and relaunch" \ + "2_CRUK.sh passing pairs file as the third command line argument." + exit 1 elif [ $pairs == 1 ] && [ "$#" -eq 3 ] - then - SAMPLEPAIRS="$3" - # Skip generation of a SamplePairs.txt file - makePairs=-1 + then + SAMPLEPAIRS="$3" + # Skip generation of a SamplePairs.txt file + makePairs=-1 fi -# Check for the presence of the file with samples not to upload to BaseSpace in the same directory as the script +# check for the presence of the file with samples not to upload to BaseSpace in the same directory as the script if [[ -e $NOTBASESPACE ]] - then - samples_to_skip=1 - # Check that the provided file is not empty - if ! [[ -s $NOTBASESPACE ]] - then - echo "The file "$NOTBASESPACE" is empty. When this file exists, it must contain the names of samples that are in the SampleSheet.csv, but should not be uploaded to BaseSpace." - exit -1 - fi - else - samples_to_skip=-1 - # Notify the user that all samples in the sample sheet will be uploaded - echo "No "$NOTBASESPACE" file found in the same directory as the script. All samples on the SampleSheet.csv will be uploaded to BaseSpace." 
+then + samples_to_skip=1 + # check that the provided file is not empty + if ! [[ -s $NOTBASESPACE ]] + then + echo "The file "$NOTBASESPACE" is empty. When this file exists, it must contain the names of samples that are in the SampleSheet.csv, but should not be uploaded to BaseSpace." + exit -1 + fi +else + samples_to_skip=-1 + # notify the user that all samples in the sample sheet will be uploaded + echo "No "$NOTBASESPACE" file found in the same directory as the script. All samples on the SampleSheet.csv will be uploaded to BaseSpace." fi -# Declare an array to store the sample ids in order +# declare an array to store the sample ids in order declare -a samplesArr -# Initial entry created to avoid downstream error when appending to array +# initial entry created to avoid downstream error when appending to array samplesArr+=1 -# Parse SampleSheet +# parse SampleSheet function parseSampleSheet { - echo "Parsing sample sheet" + echo "Parsing sample sheet" - # Obtain project name from sample sheet - projectName=$(grep "Experiment Name" "$INPUTFOLDER""SampleSheet.csv" | cut -d, -f2 | tr -d " ") - - # Obtain list of samples from sample sheet - for line in $(sed "1,/Sample_ID/d" "$INPUTFOLDER""SampleSheet.csv" | tr -d " ") - do - # Obtain sample name and patient name - samplename=$(printf "$line" | cut -d, -f1 | sed 's/[^a-zA-Z0-9]+/-/g') - - # Skip any empty sample ids- both empty and whitespace characters (but not tabs at present) - if [[ "${#samplename}" = 0 ]] || [[ "$samplename" =~ [" "] ]] - then - continue - fi - - # Append information to list array- to retain order for sample pairing - samplesArr=("${samplesArr[@]}" "$samplename") - done + # obtain project name from sample sheet + projectName=$(grep "Experiment Name" "$INPUTFOLDER""SampleSheet.csv" | cut -d, -f2 | tr -d " ") + + echo $projectName + + # obtain list of samples from sample sheet + for line in $(sed "1,/Sample_ID/d" "$INPUTFOLDER""SampleSheet.csv" | tr -d " ") + do + # obtain sample name and 
patient name + samplename=$(printf "$line" | cut -d, -f1 | sed 's/[^a-zA-Z0-9]+/-/g') + + # skip any empty sample ids- both empty and whitespace characters (but not tabs at present) + if [[ "${#samplename}" = 0 ]] || [[ "$samplename" =~ [" "] ]] + then + continue + fi + + # append information to list array- to retain order for sample pairing + samplesArr=("${samplesArr[@]}" "$samplename") + done } function pairSamples { - echo "Pairing samples" + echo "Pairing samples" - # Create/clear file which holds the sample name and the patient identifiers - > "$SAMPLEPAIRS" + # create/clear file which holds the sample name and the patient identifiers + > "$SAMPLEPAIRS" - # Iterate through the samples and exclude any samples that are not for basespace - # Pair the samples assuming the order tumour then normal and create a file of these pairs - # Create array containing the samples that are not tumour-normal pairs - # Check if there are any samples on the run that are not for BaseSpace and so should not be paired - if [[ -e $NOTBASESPACE ]] - then - mapfile -t notPairs < $NOTBASESPACE - notPairs=("${notPairs[@]}" "$NEGATIVE") - else - notPairs+=("$NEGATIVE") - fi + # iterate through the samples and exclude any samples that are not for basespace + # pair the samples assuming the order tumour then normal and create a file of these pairs + # create array containing the samples that are not tumour-normal pairs + # check if there are any samples on the run that are not for BaseSpace and so should not be paired + if [[ -e $NOTBASESPACE ]] + then + mapfile -t notPairs < $NOTBASESPACE + notPairs=("${notPairs[@]}" "$NEGATIVE") + else + notPairs+=("$NEGATIVE") + fi - # Exclude non tumour-normal pairs from pair file creation - grep -f <(printf -- '%s\n' "${notPairs[@]}") -v <(printf '%s\n' "${samplesArr[@]:1}") | awk -F '\t' 'NR % 2 {printf "%s\t", $1;} !(NR % 2) {printf "%s\n", $1;}' >"$SAMPLEPAIRS" - + # exclude non tumour-normal pairs from pair file creation + grep -f <(printf -- 
'%s\n' "${notPairs[@]}") -v <(printf '%s\n' "${samplesArr[@]:1}") | awk -F '\t' 'NR % 2 {printf "%s\t", $1;} !(NR % 2) {printf "%s\n", $1;}' >"$SAMPLEPAIRS" } function locateFastqs { - echo "Uploading fastqs" + echo "Uploading fastqs" - if [[ "$samples_to_skip" == 1 ]] - then - fastqlist=$( printf -- '%s\n' "${samplesArr[@]:1}" | grep -f "$NOTBASESPACE" -v ) - else - fastqlist=$(printf -- '%s\n' "${samplesArr[@]:1}") - fi + if [[ "$samples_to_skip" == 1 ]] + then + fastqlist=$( printf -- '%s\n' "${samplesArr[@]:1}" | grep -f "$NOTBASESPACE" -v ) + else + fastqlist=$(printf -- '%s\n' "${samplesArr[@]:1}") + fi - for fastq in $(printf -- '%s\n' "$fastqlist") - do - f1=$FASTQFOLDER${fastq}*_R1_*.fastq.gz - f2=$FASTQFOLDER${fastq}*_R2_*.fastq.gz - - # Obtain basespace identifier for each sample - baseSpaceId=$(bs -c "$CONFIG" upload sample -p $projectName -i "$fastq" $f1 $f2 --terse) - done - + for fastq in $(printf -- '%s\n' "$fastqlist") + do + f1=$FASTQFOLDER${fastq}*_R1_*.fastq.gz + f2=$FASTQFOLDER${fastq}*_R2_*.fastq.gz + + # added in version 1.1.3. 
bscli v2 requires sample unicity + # therefore sample names are prefixed with project name + cp $f1 ./"$projectName"-`basename $f1` + cp $f2 ./"$projectName"-`basename $f2` + + f1=./"$projectName"-`basename $f1` + f2=./"$projectName"-`basename $f2` + + # upload fastq to biosample + $BS upload dataset --config "$CONFIG" --project $projectId $f1 $f2 + done } function launchApp { - # Launch app for each pair of samples in turn as tumour normal pairs then download analysis files + # launch app for each pair of samples in turn as tumour normal pairs then download analysis files - # Obtain basespace ID of negative control- this is not an optional input through the commandline app launch - negId=$(bs -c "$CONFIG" list samples --project "$projectName" --sample "$NEGATIVE" --terse) - - # Obtain the project identifier - projectId=$(bs -c "$CONFIG" list projects --project-name "$projectName" --terse) - - while read pair - do - # Stop iteration on first empty line of SamplePairs.txt file in case EOF marker is absent for any reason - if [[ -z $pair ]] - then - return 0 - fi - echo "Launching app for ""$pair" + # obtain basespace ID of negative control- this is not an optional input through the commandline app launch + negId=$($BS list biosample --config "$CONFIG" --filter-field BioSampleName --filter-term "$projectName"-"$NEGATIVE" --terse) + + while read pair + do + # stop iteration on first empty line of SamplePairs.txt file in case EOF marker is absent for any reason + if [[ -z $pair ]] + then + return 0 + fi + + echo "Launching app for ""$pair" - tum=$(printf "$pair" | cut -d$'\t' -f1) - nor=$(printf "$pair" | cut -d$'\t' -f2) - - # Obtain sample ids from basespace - tumId=$(bs -c "$CONFIG" list samples --project "$projectName" --sample "$tum" --terse) - norId=$(bs -c "$CONFIG" list samples --project "$projectName" --sample "$nor" --terse) - - # Launch app and store the appsession ID - appSessionId=$(bs -c "$CONFIG" launch app -i "$APPID" "$negId" "$norId" "$projectName" 
"$tumId" --terse) - done <"$SAMPLEPAIRS" - + tum=$(printf "$pair" | cut -d$'\t' -f1) + nor=$(printf "$pair" | cut -d$'\t' -f2) + + # obtain sample ids from basespace + tumId=$($BS list biosample --config "$CONFIG" --filter-field BioSampleName --filter-term "$projectName"-"$tum" --terse) + norId=$($BS list biosample --config "$CONFIG" --filter-field BioSampleName --filter-term "$projectName"-"$nor" --terse) + + # launch app and store the appsession ID + appSessionId=$($BS launch application \ + --config "$CONFIG" \ + --name "SMP2 v2" \ + --app-version "1.1.2" \ + --option tumour-sample-id:$tumId \ + --option normal-sample-id:$norId \ + --option negative-sample-id:$negId \ + --option project-id:$projectId \ + --option basespace-labs:1 \ + --terse ) + + # save file that will track the appsession IDs for each sample pair + echo -e $appSessionId $tum $nor $projectName >> ./appsessions.txt + + done <"$SAMPLEPAIRS" } - -# Call the functions - -# Check sample sheet exists at location provided +# call the functions +# check sample sheet exists at location provided if ! 
[[ -e "$INPUTFOLDER""SampleSheet.csv" ]] - then - echo "Sample Sheet not found at input folder location" - exit -1 +then + echo "Sample Sheet not found at input folder location" + exit -1 fi - -# Parse sample sheet to obtain required information +# parse sample sheet to obtain required information parseSampleSheet - -# Pair samples according to order in sample sheet if manually created pairs file has not been supplied +# pair samples according to order in sample sheet if manually created pairs file has not been supplied if [[ "$makePairs" == 1 ]] - then - pairSamples +then + pairSamples fi -# Count number of paired samples +# count number of paired samples numPairs=$(cat "$SAMPLEPAIRS" | cut -f2 | sed '/^\s*$/d' | wc -l) -# Read out the sample pairs in the order tumour blood with each pair on a new line +# read out the sample pairs in the order tumour blood with each pair on a new line echo "Displaying sample pairs:" cat "$SAMPLEPAIRS" printf $'\n' echo "Abort the script if the samples are paired incorrectly and create a file of the pairs (see README.MD for details about this file)." 
printf $'\n' - -# Create project in basespace +# create project in basespace echo "Creating project" -bs -c "$CONFIG" create project "$projectName" +$BS create project --name "$projectName" --config "$CONFIG" +# get project ID +projectId=$($BS get project --name $projectName --config $CONFIG --terse) # Get fastqs and upload to basespace locateFastqs - # Kick off the app for each pair in turn +if [ -e "appsessions.txt" ]; then rm appsessions.txt; fi launchApp -# Write config file for JavaScript script -printf '%s\n' "{" "\"projectID\": ""\"$projectId\"""," "\"projectName\": ""\"$projectName\"""," "\"numPairs\": ""\"$numPairs\"""," "\"negativeControl\": ""\"$NEGATIVE\"" "}" > runConfig.json - - -# Write name of sample pairs file name to file -printf "$SAMPLEPAIRS" >"pairFn.txt" - - -# Queue next script in the pipeline for half an hours time -at now +30 minutes -f ./3_CRUK.sh >3_CRUK.out 2>3_CRUK.err +# queue next script in the pipeline for 50 minutes' time +at now +50 minutes -f ./3_CRUK.sh >3_CRUK.out 2>3_CRUK.err diff --git a/3_CRUK.sh b/3_CRUK.sh index 4342c7c..9b1703f 100644 --- a/3_CRUK.sh +++ b/3_CRUK.sh @@ -1,66 +1,44 @@ #!/bin/bash set -euo pipefail -#Description: CRUK BaseSpace app pipeline -#Author: Sara Rey -#Status: RELEASE -Version="1.1.1" -# Name of file containing samples to not be uploaded to BaseSpace -NOTBASESPACE=./"not_bs.txt" +# Description: downloads data generated by basespace SMPv2 app +# using basespace command line interface (version 2) +# Author: Christopher Medway -# Load pair file name -SAMPLEPAIRS=$(cat "pairFn.txt") +version="1.1.3" +if [ ! 
-e "./appsessions.txt" ]; then echo "cannot locate appsessions.txt"; exit -1; fi -# Remove pairs that are not for analysis in BaseSpace -if [[ -e "$NOTBASESPACE" ]] - then - grep -f "$NOTBASESPACE" -v "$SAMPLEPAIRS">"analysed_pairs.txt" - else - cat "$SAMPLEPAIRS">"analysed_pairs.txt" -fi +while read session; +do + sessionId=$(echo $session | cut -d" " -f1) + tumSample=$(echo $session | cut -d" " -f2) + norSample=$(echo $session | cut -d" " -f3) + projectName=$(echo $session | cut -d" " -f4) -# Load any variables file to obtain worksheet id -. $(ls -d */ | tail -n 1)/*.variables + echo "downloading files for $sessionId" + # check app has completed + bs list appsessions --config "pmg-euc1" --filter-field Id --filter-term $sessionId -# Path to node bin directory (do not include trailing /) -NODE="/share/apps/node-distros/node-v6.11.3-linux-x64/bin" + # create directory for downloads + if [ ! -e $projectName ]; then mkdir $projectName; fi + # get dataset id from basespace using cli + datasetId=$(~/bs get properties appsession --terse --config pmg-euc1 --id $sessionId --property-name Output.Datasets) -# Path to location of node_modules -NODE_MOD=$(echo $NODE | awk -F '/' 'BEGIN {OFS = FS} NF{NF--; print $0}') + # download results + ~/bs download dataset --config pmg-euc1 --id $datasetId --output "$tumSample" + # move downloaded tar to project directory + if [ ! 
-e ./$projectName/$tumSample ]; then mkdir ./$projectName/$tumSample; fi + mv ./"$tumSample".tar.gz ./$projectName/$tumSample/ + tar -xvzf ./$projectName/$tumSample/$tumSample.tar.gz -C ./$projectName/$tumSample/ + +done < ./appsessions.txt -# Make directory for results -mkdir "$worklistId" +# clean data +rm $projectName*.fastq - -# Make directories with the tumour sample id name to put the results in -cut -f1 "analysed_pairs.txt" | xargs -L 1 -i mkdir "$worklistId""/"{} - - -# Launch node javascript file and pass node path to script -"$NODE"/node ./baseSpace.js "$NODE_MOD" >baseSpace.out 2>baseSpace.err - - -# Delete temporary file -if [[ -e "pairFn.txt" ]] - then - rm "pairFn.txt" -fi - - -# Move downloaded files into directory with tumour sample name -while read line - do - tum=$(printf "$line" | cut -d$'\t' -f1) - nor=$(printf "$line" | cut -d$'\t' -f2) - mv "$worklistId"/*"$tum"*.xlsx "$worklistId"/"$tum" - mv "$worklistId"/*"$tum"*.bam "$worklistId"/"$tum" - mv "$worklistId"/*"$tum"*.bai "$worklistId"/"$tum" - mv "$worklistId"/*"$nor"*.bam "$worklistId"/"$tum" - mv "$worklistId"/*"$nor"*.bai "$worklistId"/"$tum" -done < "analysed_pairs.txt" >3_CRUK_copy.out 2>3_CRUK_copy.err \ No newline at end of file diff --git a/SMP2v2/CRUK-1.1.3_SMP2v2.variables b/SMP2v2/CRUK-1.1.3_SMP2v2.variables new file mode 100755 index 0000000..179f70b --- /dev/null +++ b/SMP2v2/CRUK-1.1.3_SMP2v2.variables @@ -0,0 +1,2 @@ +read1Adapter=CTGTCTCTTATACACATCT #read 1 adapter to trim +read2Adapter=CTGTCTCTTATACACATCT #read 2 adapter to trim \ No newline at end of file diff --git a/baseSpace.js b/baseSpace.js index 98c7f82..4ad85b1 100644 --- a/baseSpace.js +++ b/baseSpace.js @@ -4,7 +4,7 @@ once the app has completed Description: CRUK BaseSpace app pipeline Author: Sara Rey Status: RELEASE -Version: "1.1.1" +Version: "1.1.2" */ "use strict"; @@ -38,7 +38,7 @@ var APPRES; // Variables which may need adjusting // Illumina named template for Excel results spreadsheet -var TEMPLATE = 
"SMP2_CRUK_V2_03.15.xlsx"; //Update manually if it changes +var TEMPLATE = "SMP2_CRUK_V2_03.17.xlsx"; //Update manually if it changes // Desired location of output files var OUTPATH = path.join(path.normalize('.'), PROJECTNAME); //Change if output location of files changes