diff --git a/Changelog b/Changelog new file mode 100644 index 0000000..dadfb63 --- /dev/null +++ b/Changelog @@ -0,0 +1,3 @@ +[feature] Added support for cram files. Requires samtools 1.10 and newer +[speedup] decodes only parts of cram files used by RetroSeq Perl script +[fix] Samtools can not extract regions with a negative start position. Negative values are set to zero. This may occur in HG38 reference with alt contigs diff --git a/README.md b/README.md index 183c39b..d3ce5a9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ RetroSeq ==================== What is RetroSeq? ------- -RetroSeq is a tool for discovery and genotyping of transposable element variants (TEVs) (also known as mobile element insertions) from next-gen sequencing reads aligned to a reference genome in BAM format. The goal is to call TEVs that are not present in the reference genome but present in the sample that has been sequenced. It should be noted that RetroSeq can be used to locate any class of viral insertion in any species where whole-genome sequencing data with a suitable reference genome is available. +RetroSeq is a tool for discovery and genotyping of transposable element variants (TEVs) (also known as mobile element insertions) from next-gen sequencing reads aligned to a reference genome in BAM or CRAM format. The goal is to call TEVs that are not present in the reference genome but present in the sample that has been sequenced. It should be noted that RetroSeq can be used to locate any class of viral insertion in any species where whole-genome sequencing data with a suitable reference genome is available. If you want to find if a transposable element that is present in the reference genome and not present in your sample (a deletion in structural variation terminology), then you should use a structural variation deletion caller such as [Breakdancer](http://gmt.genome.wustl.edu/breakdancer/current/). @@ -18,11 +18,11 @@ Using RetroSeq ================ 1 Discovery Phase ------------------ -The goal here is to pass through the BAM and identify discordant read pairs that might support a TE insertion. You can either supply a tab delimited file specifying a set of TE types (e.g. Alu, LINE etc.) and the corresponding BED file of locations where these are in the reference genome (-refTEs parameter). Alternatively, you can provide a tab delimited file specifying a set of viral/TE types and the corresponding fasta file with a set of consensus sequences for these (-eref parameter). +The goal here is to pass through the BAM (or CRAM) and identify discordant read pairs that might support a TE insertion. You can either supply a tab delimited file specifying a set of TE types (e.g. Alu, LINE etc.) and the corresponding BED file of locations where these are in the reference genome (-refTEs parameter). Alternatively, you can provide a tab delimited file specifying a set of viral/TE types and the corresponding fasta file with a set of consensus sequences for these (-eref parameter). Usage: retroseq.pl -discover -bam -eref -output [-srmode] [-q ] [-id ] [-len -noclean] - -bam BAM file of paired reads mapped to reference genome + -bam BAM or CRAM file of paired reads mapped to reference genome -output Output file to store candidate supporting reads (required for calling step) [-eref Tab file with list of transposon types and the corresponding fasta file of reference sequences (e.g. SINE /home/me/refs/SINE.fasta). Required when the -align option is used.] [-refTEs Tab file with TE type and BED file of reference elements. These will be used to quickly assign discordant reads the TE types and avoid alignment. Using this will speed up discovery dramatically.] @@ -42,7 +42,7 @@ The calling phase takes one or more outputs from the discovery phase, clusters t Usage: retroseq.pl -call -bam -input -ref -output [-srinput -filter -cleanup -reads -depth -hets] - -bam BAM file OR BAM fofn + -bam BAM file OR cram OR BAM fofn -input Either a single output file from the PE discover stage OR a prefix of a set of files from discovery to be combined for calling OR a fofn of discovery stage output files -ref Fasta of reference genome -output Output file name (VCF) diff --git a/RetroSeq/Utilities.pm b/RetroSeq/Utilities.pm index a0b1335..77488d4 100644 --- a/RetroSeq/Utilities.pm +++ b/RetroSeq/Utilities.pm @@ -164,8 +164,8 @@ sub testBreakPoint }else{die qq[Failed to extract region $chr:$refPos BAM];} } else - { - $cmdpre = qq[samtools view $bams[0] $chr:]; + { + $cmdpre = qq[samtools view --input-fmt-option required_fields=0x37F $bams[0] $chr:]; } #compute the distance from the breakpoint to the last fwd read and the first rev read (and then add this to the read window buffer) @@ -306,7 +306,9 @@ sub getCandidateBreakPointsDirVote my @bams = @{ $_[ 0 ] };shift; my $minQ = shift; my $soft = shift; #1 or 0. use soft clips or not - + #correct for negative values as start position (this happens in decoy chromosomes) + if( $start =~ /^-\d+$/ ){ $start="0"; } + if( $start !~ /^\d+$/ || $end !~ /^\d+$/ ){die qq[ERROR: Invalid parameters passed to getCandidateBreakPointsDir: $chr $start $end\n];} my %fwdCount; #pos->read count @@ -318,7 +320,7 @@ sub getCandidateBreakPointsDirVote { if( _mergeRegionBAMs( \@bams, $chr, $start, $end, qq[/tmp/$$.region.bam] ) ) { - $cmd = qq[samtools view /tmp/$$.region.bam |]; + $cmd = qq[samtools view --input-fmt-option required_fields=0x23B /tmp/$$.region.bam |]; }else{die qq[Failed to extract region $chr:$start-$end BAM];} } else diff --git a/bin/retroseq.pl b/bin/retroseq.pl index 1c29504..8d1246a 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -103,8 +103,8 @@ my $USAGE = < options - -discover Takes a BAM and a set of reference TE (fasta) and calls candidate supporting read pairs (BED output) - -call Takes multiple output of discovery stage and a BAM and outputs a VCF of TE calls + -discover Takes a BAM or CRAM and a set of reference TE (fasta) and calls candidate supporting read pairs (BED output) + -call Takes multiple output of discovery stage and a BAM or CRAM and outputs a VCF of TE calls NOTE: $0 requires samtools, bcftools, exonerate, unix sort, bedtools to be in the default path @@ -119,7 +119,7 @@ ( $bam && $output ) or die < -eref -output [-q ] [-id ] [-len -noclean] - -bam BAM file of paired reads mapped to reference genome + -bam BAM or CRAM file of paired reads mapped to reference genome -output Output file to store candidate supporting reads (required for calling step) [-refTEs Tab file with TE type and BED file of reference elements. These will be used to quickly assign discordant reads the TE types and avoid alignment. Using this will speed up discovery dramatically.] [-noclean Do not remove intermediate output files. Default is to cleanup.] @@ -178,8 +178,8 @@ print qq[\nMin anchor quality: $anchorQ\nMin percent identity: $id\nMin length for hit: $length\n\n]; - #test for samtools - RetroSeq::Utilities::checkBinary( q[samtools], qq[0.1.16], qq[0.1.19] ); + #test for samtools 1.10 at least,since required_fields is supported by 1.9 and newer (CheckBinary has problems with comparing 1.09 with 1.15) + RetroSeq::Utilities::checkBinary( q[samtools], qq[1.10] ); RetroSeq::Utilities::checkBinary( q[exonerate], qq[2.2.0] ) if( $doAlign ); RetroSeq::Utilities::checkBinary( q[bedtools] ); @@ -190,7 +190,7 @@ ( $bam && $input && $ref && $output ) or die < -input -ref -output [ -filter -cleanup -reads -depth ] - -bam BAM file OR BAM fofn + -bam BAM file OR CRAM file OR BAM fofn -input Either a single output file from the PE discover stage OR a prefix of a set of files from discovery to be combined for calling OR a fofn of discovery stage output files -ref Fasta of reference genome -output Output file name (VCF) @@ -205,7 +205,7 @@ USAGE - croak qq[Cant find BAM or BAM fofn: $bam] unless -f $bam || -l $bam; + croak qq[Cant find BAM or CRAM or BAM fofn: $bam] unless -f $bam || -l $bam; if( ! -f $input ) { @@ -233,8 +233,8 @@ } my $incsoft=defined($incsoftclips)?1:0; - #test for samtools - RetroSeq::Utilities::checkBinary( q[samtools], qq[0.1.16] ); + #test for samtools 1.10 at least,since required_fields is supported by 1.9 and newer (CheckBinary has problems with comparing 1.09 with 1.15) + RetroSeq::Utilities::checkBinary( q[samtools], qq[1.10] ); RetroSeq::Utilities::checkBinary( q[bcftools] ); RetroSeq::Utilities::checkBinary( q[bedtools] ); @@ -246,12 +246,12 @@ while(my $file = <$ifh> ) { chomp( $file ); - if( ( -l $file || -f $file ) && ( -l $file.qq[.bai] || -f $file.qq[.bai] ) ){push(@bams, $file);}else{die qq[Cant find BAM input file or BAM index file: $file\n];} + if( ( -l $file || -f $file ) && ( -l $file.qq[.bai] || -f $file.qq[.bai] || -f $file.qq[.crai] || -f $file.qq[cram.crai] || -l $file.qq[.crai] || -l $file.qq[cram.crai]) ){push(@bams, $file);}else{die qq[Cant find BAM input file or BAM index file: $file\n];} } print qq[Found ].scalar(@bams).qq[ BAM files\n\n]; close( $ifh ); } - else{if( ( -l $bam || -f $bam ) && ( -l $bam.qq[.bai] || -f $bam.qq[.bai] ) ){push( @bams, $bam );}else{die qq[Cant find BAM input file or BAM index file: $bam\n];}} + else{if( ( -l $bam || -f $bam ) && ( -l $bam.qq[.bai] || -f $bam.qq[.bai] || -f $bam.qq[.crai] || -f $bam.qq[cram.crai] || -l $bam.qq[.crai] || -l $bam.qq[cram.crai]) ){push( @bams, $bam );}else{die qq[Cant find BAM input file or BAM index file: $bam\n];}} my $sampleName = RetroSeq::Utilities::getBAMSampleName( \@bams ); print qq[Calling sample $sampleName\n]; @@ -307,7 +307,7 @@ sub _findCandidates #now go and get the reads from the bam (annoying have to traverse through the bam a second time - but required for reads where the mate is aligned distantly) #also dump out their mates as will need these later as anchors - open( my $bfh, qq[samtools view ].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; + open( my $bfh, qq[samtools view --input-fmt-option required_fields=0x23F].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; open( my $dfh, qq[>>$discordantMatesBed] ) or die $!; my $currentChr = ''; my $readsFound = 0; @@ -977,7 +977,7 @@ sub _getCandidateTEReadNames print qq[Opening BAM ($bam) and getting initial set of candidate mates....\n]; - open( my $bfh, qq[samtools view ].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; + open( my $bfh, qq[samtools view --input-fmt-option required_fields=0x3FF].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; my $currentChr = ''; while ( my $samLine = <$bfh> ) {