Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 19 additions & 21 deletions dDocent
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ export LC_ALL=en_US.UTF-8
export SHELL=bash

##########dDocent##########
VERSION='2.9.5'; export VERSION
VERSION='2.9.6'; export VERSION
#This script serves as an interactive bash wrapper to QC, assemble, map, and call SNPs from double digest RAD (SE or PE), ezRAD (SE or PE) data, or SE RAD data.
#It requires that your raw data are split up by tagged individual and follow the naming convention of:

Expand Down Expand Up @@ -152,7 +152,7 @@ if [ "$TEST" -gt 0 ]; then
fi

#Count number of individuals in current directory
NumInd=$(ls *.F.fq.gz 2> /dev/null | wc -l)
NumInd=$(find . -maxdepth 1 -type f -name "*.F.fq.gz" | wc -l)
NumInd=$(($NumInd - 0))

#Test for file limits for current user and reset if necessary
Expand All @@ -168,7 +168,7 @@ if [ "$Flimit" != "unlimited" ]; then
fi

#Create list of sample names
ls *.F.fq.gz > namelist 2> /dev/null
find . -maxdepth 1 -type f -name "*.F.fq.gz" -printf "%f\n" > namelist
sed -i'' -e 's/.F.fq.gz//g' namelist
#Create an array of sample names
NUMNAMES=$(mawk '/_/' namelist | wc -l)
Expand Down Expand Up @@ -416,15 +416,15 @@ fi

if [ "$MAP" != "no" ]; then
echo -e "\nCreating alignment intervals"
ls *-RG.bam >bamlist.list
find . -maxdepth 1 -type f -name "*-RG.bam" -printf "%f\n" > bamlist.list
CreateIntervals
fi

##SNP Calling Section of code

if [ "$SNP" != "no" ]; then
#Create list of BAM files
ls *-RG.bam >bamlist.list
find . -maxdepth 1 -type f -name "*-RG.bam" -printf "%f\n" > bamlist.list
#If mapping is not being performed, but intervals do not exist they are created
if [[ "$MAP" == "no" && ! -f "cat-RRG.bam" ]]; then
CreateIntervals
Expand All @@ -435,12 +435,11 @@ if [ "$SNP" != "no" ]; then
fi
#Check to make sure interval files have been created
if [[ "$MAP" == "no" && ! -f "mapped.bed" ]]; then
#bedtools bamtobed -i cat-RRG.bam > cat-RRG.bed
#bedtools bamtobed -i cat-RRG.bam > cat-RRG.bed
#bedops --merge cat-RRG.bed > mapped.bed
cat namelist | parallel -j $FB2 "bedtools bamtobed -i {}-RG.bam | sort-bed - | bedops --merge - > {}-RG.bed"
bedops --merge *-RG.bed | bedtools sort -i - -faidx reference.fasta.fai > mapped.bed
rm *-RG.bed

find . -maxdepth 1 -type f -name "*-RG.bed" -delete
fi
#This code estimates the coverage of reference intervals and removes intervals in 0.01% of depth
#This allows genotyping to be more effecient and eliminates extreme copy number loci from the data
Expand Down Expand Up @@ -563,8 +562,9 @@ if [ "$SNP" != "no" ]; then
len=$3-$2;lc=len*$4;cov = cov + lc
if ( cov < cutoff) {x="mapped."i".bed";print $1"\t"$2"\t"$3 > x}
else {i=i+1; x="mapped."i".bed"; print $1"\t"$2"\t"$3 > x; cov=0}
}'
ls mapped.*.bed | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=1 --env call_genos2 -j 4 --no-notice "call_genos2 {} "
}'

find . -maxdepth 1 -type f -name "mapped.*.bed" -printf "%f\n" | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=1 --env call_genos2 -j 4 --no-notice "call_genos2 {} "

if [ -f "freebayes.error" ]; then
echo -e "\n\n\nFreeBayes has failed when trying to finish a previously failed instance. Memory and processor settings need to be drastically reconfigured"
Expand All @@ -588,8 +588,7 @@ if [ "$SNP" != "no" ]; then
export -f finish_genos

rm freebayes.error freebayes.log &> /dev/null

ls mapped.*.bed | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=10 --env call_genos -j $NUMProc --no-notice --joblog ./par.log "call_genos {} "
find . -maxdepth 1 -type f -name "mapped.*.bed" -printf "%f\n" | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=10 --env call_genos -j $NUMProc --no-notice --joblog ./par.log "call_genos {} "

if [ -f "freebayes.error" ]; then
FAILS=$( cat freebayes.error | wc -l )
Expand All @@ -601,7 +600,7 @@ if [ "$SNP" != "no" ]; then
rm freebayes.redos
else
echo -e "\nmultiple instances of freebayes failed. dDocent will now recalibrate run parameters to use less memory.\n"
rm mapped.*.bed
find . -maxdepth 1 -type f -name "mapped.*.bed" -delete
rm freebayes.error
LIM=$(( $SNPNUMProc * 4 ))
if head -1 reference.fasta | grep -e 'dDocent' 1>/dev/null; then
Expand All @@ -628,7 +627,7 @@ if [ "$SNP" != "no" ]; then
echo "Using FreeBayes to call SNPs again"
NumP=$(( $NUMProc / 4 ))
NumP=$(( $NumP * 3 ))
ls mapped.*.bed | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=10 --env call_genos -j $NumP --no-notice --joblog ./par2.log "call_genos {} "
find . -maxdepth 1 -type f -name "mapped.*.bed" -printf "%f\n" | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=10 --env call_genos -j $NumP --no-notice --joblog ./par2.log "call_genos {} "
fi
fi

Expand Down Expand Up @@ -666,7 +665,7 @@ if [ "$SNP" != "no" ]; then
NumP=$(( $NumP / 4 ))
NumP=$(( $NumP * 3 ))
echo "Using FreeBayes to call SNPs again"
ls mapped.*.bed | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=10 --env call_genos -j $NumP --joblog ./par3.log --no-notice "call_genos {}"
find . -maxdepth 1 -type f -name "mapped.*.bed" -printf "%f\n" | sed -e 's/mapped.//g' | sed -e 's/.bed//g' | shuf | parallel --bar --halt now,fail=10 --env call_genos -j $NumP --joblog ./par3.log --no-notice "call_genos {}"
fi
fi

Expand All @@ -677,9 +676,8 @@ if [ "$SNP" != "no" ]; then
else
ERROR3=0
export ERROR3
fi

rm mapped.*.bed
fi
find . -maxdepth 1 -type f -name "mapped.*.bed" -delete

mv raw.1.vcf raw.01.vcf 2>/dev/null
mv raw.2.vcf raw.02.vcf 2>/dev/null
Expand Down Expand Up @@ -768,7 +766,7 @@ TrimReads () {
if [ $STACKS -gt 0 ]; then
echo "Removing the _1 character and replacing with /1 in the name of every sequence"
cat namelist | parallel -j $FB1 --no-notice "gunzip -c {}.F.fq.gz | sed -e 's:_1$:/1:g' > {}.F.fq"
rm -f *.F.fq.gz
find . -maxdepth 1 -type f -name "*.F.fq.gz" -delete
cat namelist | parallel -j $FB1 --no-notice "gzip {}.F.fq"
fi

Expand All @@ -778,7 +776,7 @@ TrimReads () {
if [ $STACKS -gt 0 ]; then
echo "Removing the _2 character and replacing with /2 in the name of every sequence"
cat namelist | parallel -j $FB1 --no-notice "gunzip -c {}.R.fq.gz | sed -e 's:_2$:/2:g' > {}.R.fq"
rm -f *.R.fq.gz
find . -maxdepth 1 -type f -name "*.R.fq.gz" -delete
cat namelist | parallel -j $FB1 --no-notice "gzip {}.R.fq"
fi
fi
Expand Down Expand Up @@ -1205,7 +1203,7 @@ wait
#bedops --merge cat-RRG.bed > mapped.bed
cat namelist | parallel -j $FB2 "bedtools bamtobed -i {}-RG.bam | sort-bed - | bedops --merge - > {}-RG.bed"
bedops --merge *-RG.bed | bedtools sort -i - -faidx reference.fasta.fai > mapped.bed
rm *-RG.bed
find . -maxdepth 1 -type f -name "*-RG.bed" -delete
}

#This checks that dDocent has detected the proper number of individuals and exits if incorrect
Expand Down