From 61ed187c50f9a9ca136f3fd3da7c7b339dfcbc7c Mon Sep 17 00:00:00 2001 From: kristinebilgrav <77359122+kristinebilgrav@users.noreply.github.com> Date: Thu, 28 Jan 2021 13:48:47 +0100 Subject: [PATCH 01/10] Update retroseq.pl --- bin/retroseq.pl | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/retroseq.pl b/bin/retroseq.pl index 1c29504..d3b6959 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -234,7 +234,6 @@ my $incsoft=defined($incsoftclips)?1:0; #test for samtools - RetroSeq::Utilities::checkBinary( q[samtools], qq[0.1.16] ); RetroSeq::Utilities::checkBinary( q[bcftools] ); RetroSeq::Utilities::checkBinary( q[bedtools] ); From c93693e9b592b9fd81decc5da707b390448abe4b Mon Sep 17 00:00:00 2001 From: kristinebilgrav <77359122+kristinebilgrav@users.noreply.github.com> Date: Thu, 28 Jan 2021 13:50:18 +0100 Subject: [PATCH 02/10] Update retroseq.pl --- bin/retroseq.pl | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/retroseq.pl b/bin/retroseq.pl index d3b6959..d055381 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -179,7 +179,6 @@ print qq[\nMin anchor quality: $anchorQ\nMin percent identity: $id\nMin length for hit: $length\n\n]; #test for samtools - RetroSeq::Utilities::checkBinary( q[samtools], qq[0.1.16], qq[0.1.19] ); RetroSeq::Utilities::checkBinary( q[exonerate], qq[2.2.0] ) if( $doAlign ); RetroSeq::Utilities::checkBinary( q[bedtools] ); From 2339ab74594e999218c1e97d9c51cb1463aa22d1 Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Wed, 20 Jul 2022 11:38:09 +0200 Subject: [PATCH 03/10] [feature] cram works as input: instead of looking only for the presents of bam indexes also crai indexes are checked for presents --- bin/retroseq.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/retroseq.pl b/bin/retroseq.pl index d055381..66bd7a1 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -244,12 +244,12 @@ while(my $file = <$ifh> ) { chomp( $file ); - if( ( -l $file || -f $file ) && ( -l $file.qq[.bai] || -f $file.qq[.bai] ) ){push(@bams, $file);}else{die qq[Cant find BAM input file or BAM index file: $file\n];} + if( ( -l $file || -f $file ) && ( -l $file.qq[.bai] || -f $file.qq[.bai] || -f $file.qq[.crai]) || -f $file.qq[cram.crai] || -l $file.qq[.crai] || -l $file.qq[cram.crai]) ){push(@bams, $file);}else{die qq[Cant find BAM input file or BAM index file: $file\n];} } print qq[Found ].scalar(@bams).qq[ BAM files\n\n]; close( $ifh ); } - else{if( ( -l $bam || -f $bam ) && ( -l $bam.qq[.bai] || -f $bam.qq[.bai] ) ){push( @bams, $bam );}else{die qq[Cant find BAM input file or BAM index file: $bam\n];}} + else{if( ( -l $bam || -f $bam ) && ( -l $bam.qq[.bai] || -f $bam.qq[.bai] || -f $bam.qq[.crai] || -f $file.qq[cram.crai] || -l $file.qq[.crai] || -l $file.qq[cram.crai]) ){push( @bams, $bam );}else{die qq[Cant find BAM input file or BAM index file: $bam\n];}} my $sampleName = RetroSeq::Utilities::getBAMSampleName( \@bams ); print qq[Calling sample $sampleName\n]; From 9101b44199b8b4c61ad0f7ea7938ba35a622cafc Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Wed, 20 Jul 2022 11:40:06 +0200 Subject: [PATCH 04/10] [fix] samtools tried to extract negative starting regions in small decoys contigs. Now the start is at least 0 --- RetroSeq/Utilities.pm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/RetroSeq/Utilities.pm b/RetroSeq/Utilities.pm index a0b1335..3f5c1dd 100644 --- a/RetroSeq/Utilities.pm +++ b/RetroSeq/Utilities.pm @@ -306,7 +306,9 @@ sub getCandidateBreakPointsDirVote my @bams = @{ $_[ 0 ] };shift; my $minQ = shift; my $soft = shift; #1 or 0. use soft clips or not - + #correct for negative values as start position (this happens in decoy chromosomes) + if( $start =~ /^-\d+$/ ){ $start="0"; } + if( $start !~ /^\d+$/ || $end !~ /^\d+$/ ){die qq[ERROR: Invalid parameters passed to getCandidateBreakPointsDir: $chr $start $end\n];} my %fwdCount; #pos->read count From 339a05c603974bc4e03963e103e162325e5a74da Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Wed, 20 Jul 2022 13:13:03 +0200 Subject: [PATCH 05/10] [fix] small typo and copy and paste error --- bin/retroseq.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/retroseq.pl b/bin/retroseq.pl index 66bd7a1..fbc82c0 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -244,12 +244,12 @@ while(my $file = <$ifh> ) { chomp( $file ); - if( ( -l $file || -f $file ) && ( -l $file.qq[.bai] || -f $file.qq[.bai] || -f $file.qq[.crai]) || -f $file.qq[cram.crai] || -l $file.qq[.crai] || -l $file.qq[cram.crai]) ){push(@bams, $file);}else{die qq[Cant find BAM input file or BAM index file: $file\n];} + if( ( -l $file || -f $file ) && ( -l $file.qq[.bai] || -f $file.qq[.bai] || -f $file.qq[.crai] || -f $file.qq[cram.crai] || -l $file.qq[.crai] || -l $file.qq[cram.crai]) ){push(@bams, $file);}else{die qq[Cant find BAM input file or BAM index file: $file\n];} } print qq[Found ].scalar(@bams).qq[ BAM files\n\n]; close( $ifh ); } - else{if( ( -l $bam || -f $bam ) && ( -l $bam.qq[.bai] || -f $bam.qq[.bai] || -f $bam.qq[.crai] || -f $file.qq[cram.crai] || -l $file.qq[.crai] || -l $file.qq[cram.crai]) ){push( @bams, $bam );}else{die qq[Cant find BAM input file or BAM index file: $bam\n];}} + else{if( ( -l $bam || -f $bam ) && ( -l $bam.qq[.bai] || -f $bam.qq[.bai] || -f $bam.qq[.crai] || -f $bam.qq[cram.crai] || -l $bam.qq[.crai] || -l $bam.qq[cram.crai]) ){push( @bams, $bam );}else{die qq[Cant find BAM input file or BAM index file: $bam\n];}} my $sampleName = RetroSeq::Utilities::getBAMSampleName( \@bams ); print qq[Calling sample $sampleName\n]; From def835ad7699c30ca84abec10776386d6caae120 Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Sun, 31 Jul 2022 22:34:03 +0200 Subject: [PATCH 06/10] When using cram files, only decode the field that is necessary. Benchmark with a small cram file a speedup of at least 2x in the discover phase. Bam files can still be used and do not benefit from this optimisation. --- RetroSeq/Utilities.pm | 6 +++--- bin/retroseq.pl | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/RetroSeq/Utilities.pm b/RetroSeq/Utilities.pm index 3f5c1dd..77488d4 100644 --- a/RetroSeq/Utilities.pm +++ b/RetroSeq/Utilities.pm @@ -164,8 +164,8 @@ sub testBreakPoint }else{die qq[Failed to extract region $chr:$refPos BAM];} } else - { - $cmdpre = qq[samtools view $bams[0] $chr:]; + { + $cmdpre = qq[samtools view --input-fmt-option required_fields=0x37F $bams[0] $chr:]; } #compute the distance from the breakpoint to the last fwd read and the first rev read (and then add this to the read window buffer) @@ -320,7 +320,7 @@ sub getCandidateBreakPointsDirVote { if( _mergeRegionBAMs( \@bams, $chr, $start, $end, qq[/tmp/$$.region.bam] ) ) { - $cmd = qq[samtools view /tmp/$$.region.bam |]; + $cmd = qq[samtools view --input-fmt-option required_fields=0x23B /tmp/$$.region.bam |]; }else{die qq[Failed to extract region $chr:$start-$end BAM];} } else diff --git a/bin/retroseq.pl b/bin/retroseq.pl index fbc82c0..2b474d3 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -305,7 +305,7 @@ sub _findCandidates #now go and get the reads from the bam (annoying have to traverse through the bam a second time - but required for reads where the mate is aligned distantly) #also dump out their mates as will need these later as anchors - open( my $bfh, qq[samtools view ].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; + open( my $bfh, qq[samtools view --input-fmt-option required_fields=0x23F].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; open( my $dfh, qq[>>$discordantMatesBed] ) or die $!; my $currentChr = ''; my $readsFound = 0; @@ -975,7 +975,7 @@ sub _getCandidateTEReadNames print qq[Opening BAM ($bam) and getting initial set of candidate mates....\n]; - open( my $bfh, qq[samtools view ].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; + open( my $bfh, qq[samtools view --input-fmt-option required_fields=0x3FF].( defined( $readgroups ) ? qq[-R $$.readgroups ] : qq[ ] ).qq[$bam |] ) or die $!; my $currentChr = ''; while ( my $samLine = <$bfh> ) { From 313253f032ccf322322bbb0146e8f57e1680a2af Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Fri, 5 Aug 2022 21:11:03 +0200 Subject: [PATCH 07/10] Set a minimum version of samtools This is set to 1 version higher than actually needed since the CheckBinary function has issues to see that 1.15 Is higher than 1.9 (it does a string compare and not a numeric compare) --- bin/retroseq.pl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/retroseq.pl b/bin/retroseq.pl index 2b474d3..8d240be 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -178,7 +178,8 @@ print qq[\nMin anchor quality: $anchorQ\nMin percent identity: $id\nMin length for hit: $length\n\n]; - #test for samtools + #test for samtools 1.10 at least,since required_fields is supported by 1.9 and newer (CheckBinary has problems with comparing 1.09 with 1.15) + RetroSeq::Utilities::checkBinary( q[samtools], qq[1.10] ); RetroSeq::Utilities::checkBinary( q[exonerate], qq[2.2.0] ) if( $doAlign ); RetroSeq::Utilities::checkBinary( q[bedtools] ); @@ -232,7 +233,8 @@ } my $incsoft=defined($incsoftclips)?1:0; - #test for samtools + #test for samtools 1.10 at least,since required_fields is supported by 1.9 and newer (CheckBinary has problems with comparing 1.09 with 1.15) + RetroSeq::Utilities::checkBinary( q[samtools], qq[1.10] ); RetroSeq::Utilities::checkBinary( q[bcftools] ); RetroSeq::Utilities::checkBinary( q[bedtools] ); From 7d56659d775223c8b9f3a436c687e8afc13db2c2 Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Wed, 19 Oct 2022 11:30:44 +0200 Subject: [PATCH 08/10] Create Changelog --- Changelog | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Changelog diff --git a/Changelog b/Changelog new file mode 100644 index 0000000..dadfb63 --- /dev/null +++ b/Changelog @@ -0,0 +1,3 @@ +[feature] Added support for cram files. Requires samtools 1.10 and newer +[speedup] decodes only parts of cram files used by RetroSeq Perl script +[fix] Samtools can not extract regions with a negative start position. Negative values are set to zero. This may occur in HG38 reference with alt contigs From 2b70a7ce752e2a34ea24b1c22bbe61b62b78cfae Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Wed, 19 Oct 2022 11:33:08 +0200 Subject: [PATCH 09/10] Added cram files as input --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 183c39b..d3ce5a9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ RetroSeq ==================== What is RetroSeq? ------- -RetroSeq is a tool for discovery and genotyping of transposable element variants (TEVs) (also known as mobile element insertions) from next-gen sequencing reads aligned to a reference genome in BAM format. The goal is to call TEVs that are not present in the reference genome but present in the sample that has been sequenced. It should be noted that RetroSeq can be used to locate any class of viral insertion in any species where whole-genome sequencing data with a suitable reference genome is available. +RetroSeq is a tool for discovery and genotyping of transposable element variants (TEVs) (also known as mobile element insertions) from next-gen sequencing reads aligned to a reference genome in BAM or CRAM format. The goal is to call TEVs that are not present in the reference genome but present in the sample that has been sequenced. It should be noted that RetroSeq can be used to locate any class of viral insertion in any species where whole-genome sequencing data with a suitable reference genome is available. If you want to find if a transposable element that is present in the reference genome and not present in your sample (a deletion in structural variation terminology), then you should use a structural variation deletion caller such as [Breakdancer](http://gmt.genome.wustl.edu/breakdancer/current/). @@ -18,11 +18,11 @@ Using RetroSeq ================ 1 Discovery Phase ------------------ -The goal here is to pass through the BAM and identify discordant read pairs that might support a TE insertion. You can either supply a tab delimited file specifying a set of TE types (e.g. Alu, LINE etc.) and the corresponding BED file of locations where these are in the reference genome (-refTEs parameter). Alternatively, you can provide a tab delimited file specifying a set of viral/TE types and the corresponding fasta file with a set of consensus sequences for these (-eref parameter). +The goal here is to pass through the BAM (or CRAM) and identify discordant read pairs that might support a TE insertion. You can either supply a tab delimited file specifying a set of TE types (e.g. Alu, LINE etc.) and the corresponding BED file of locations where these are in the reference genome (-refTEs parameter). Alternatively, you can provide a tab delimited file specifying a set of viral/TE types and the corresponding fasta file with a set of consensus sequences for these (-eref parameter). Usage: retroseq.pl -discover -bam -eref -output [-srmode] [-q ] [-id ] [-len -noclean] - -bam BAM file of paired reads mapped to reference genome + -bam BAM or CRAM file of paired reads mapped to reference genome -output Output file to store candidate supporting reads (required for calling step) [-eref Tab file with list of transposon types and the corresponding fasta file of reference sequences (e.g. SINE /home/me/refs/SINE.fasta). Required when the -align option is used.] [-refTEs Tab file with TE type and BED file of reference elements. These will be used to quickly assign discordant reads the TE types and avoid alignment. Using this will speed up discovery dramatically.] @@ -42,7 +42,7 @@ The calling phase takes one or more outputs from the discovery phase, clusters t Usage: retroseq.pl -call -bam -input -ref -output [-srinput -filter -cleanup -reads -depth -hets] - -bam BAM file OR BAM fofn + -bam BAM file OR cram OR BAM fofn -input Either a single output file from the PE discover stage OR a prefix of a set of files from discovery to be combined for calling OR a fofn of discovery stage output files -ref Fasta of reference genome -output Output file name (VCF) From 55254e7d31c2492de84e164d570b8086fd8d615e Mon Sep 17 00:00:00 2001 From: Maarten Kooyman Date: Wed, 19 Oct 2022 11:36:09 +0200 Subject: [PATCH 10/10] Added cram in messages next to bam --- bin/retroseq.pl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/retroseq.pl b/bin/retroseq.pl index 8d240be..8d1246a 100755 --- a/bin/retroseq.pl +++ b/bin/retroseq.pl @@ -103,8 +103,8 @@ my $USAGE = < options - -discover Takes a BAM and a set of reference TE (fasta) and calls candidate supporting read pairs (BED output) - -call Takes multiple output of discovery stage and a BAM and outputs a VCF of TE calls + -discover Takes a BAM or CRAM and a set of reference TE (fasta) and calls candidate supporting read pairs (BED output) + -call Takes multiple output of discovery stage and a BAM or CRAM and outputs a VCF of TE calls NOTE: $0 requires samtools, bcftools, exonerate, unix sort, bedtools to be in the default path @@ -119,7 +119,7 @@ ( $bam && $output ) or die < -eref -output [-q ] [-id ] [-len -noclean] - -bam BAM file of paired reads mapped to reference genome + -bam BAM or CRAM file of paired reads mapped to reference genome -output Output file to store candidate supporting reads (required for calling step) [-refTEs Tab file with TE type and BED file of reference elements. These will be used to quickly assign discordant reads the TE types and avoid alignment. Using this will speed up discovery dramatically.] [-noclean Do not remove intermediate output files. Default is to cleanup.] @@ -190,7 +190,7 @@ ( $bam && $input && $ref && $output ) or die < -input -ref -output [ -filter -cleanup -reads -depth ] - -bam BAM file OR BAM fofn + -bam BAM file OR CRAM file OR BAM fofn -input Either a single output file from the PE discover stage OR a prefix of a set of files from discovery to be combined for calling OR a fofn of discovery stage output files -ref Fasta of reference genome -output Output file name (VCF) @@ -205,7 +205,7 @@ USAGE - croak qq[Cant find BAM or BAM fofn: $bam] unless -f $bam || -l $bam; + croak qq[Cant find BAM or CRAM or BAM fofn: $bam] unless -f $bam || -l $bam; if( ! -f $input ) {