diff --git a/docs/source/databases.rst b/docs/source/databases.rst index e82d8041..d3edcfcd 100644 --- a/docs/source/databases.rst +++ b/docs/source/databases.rst @@ -50,7 +50,7 @@ Create databases directory structure .. code-block:: bash - mkdir -p ~/databases/{humandna,humanrna,ncbi} + mkdir -p ~/databases/{humandna,humanrna,ncbi,diamond} mkdir -p ~/databases/ncbi/blast/{nt,nr} mkdir -p ~/databases/ncbi/taxonomy @@ -91,15 +91,24 @@ Diamond Diamond corresponds to :doc:`stages/iterative_blast_phylo`'s ``blast_db_list`` -Download and index protein database for diamond blastx +Download and index protein database for diamond blastx. + +You should make sure to view the `makedb`_ command, specifically the part about +how the ``-b`` option controls how much memory diamond will take when it runs +using the database you build. In the example below we use ``-b 2`` which should +consume about 12GB of RAM so if you don't have that much RAM on your computer where +you will run diamond, or if you have way more RAM you may want to change that +number as the higher it is, the faster diamond will run but consume more memory. + +You can use the ``free -hg`` command which will show you how many GB you have(listed +under the Total column) .. code-block:: bash - mkdir -p ~/databases/diamond pushd ~/databases/diamond wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nr.gz gunzip nr.gz - diamond makedb -p 12 -d diamondnr -v --log --in nr -b 0.5 + diamond makedb -d diamondnr -v --log --in nr -b 2 popd Alternatively you can generate the diamond database from an already downloaded @@ -107,10 +116,9 @@ blast nr database .. code-block:: bash - mkdir -p ~/databases/diamond pushd ~/databases/diamond blastdbcmd -db ~/databases/ncbi/blast/nr/nr -entry all > blastnr.fasta - diamond makedb -d diamondnr --log --in blastnr.fasta -b 0.5 + diamond makedb -d diamondnr --log --in blastnr.fasta -b 2 rm blastnr.fasta Host Genome Setup @@ -249,3 +257,5 @@ Note: This command is only available after you install. Unfortunately at this po You will probably want to ensure that the pipeline can find all of your databases. There is now a handy script that you can use to do this prior to installing. :doc:`scripts/verifydatabases` + +.. _makedb: https://github.com/bbuchfink/diamond#makedb-options diff --git a/pathdiscov/Local_Module/Verbose_Sys.pm b/pathdiscov/Local_Module/Verbose_Sys.pm index e5b06c9d..c3c17b72 100755 --- a/pathdiscov/Local_Module/Verbose_Sys.pm +++ b/pathdiscov/Local_Module/Verbose_Sys.pm @@ -1,4 +1,5 @@ use strict; +use File::Basename; # run a system commmand "verbosely", echo-ing the command as well as giving the time sub verbose_system @@ -7,12 +8,13 @@ sub verbose_system # @_ exists magically - anything passed in my $cmd = shift; + my @parts = split(' ', $cmd); + my $exe = basename($parts[0]); - print "[cmd] ",$cmd,"\n"; my $start_time = time(); - system($cmd); + print_system($cmd); my $end_time = time(); - print "[deltat] ",$end_time-$start_time,"\n"; + print "[$exe deltat] ",$end_time-$start_time,"\n"; } # run a system commmand but print the command first diff --git a/pathdiscov/download/diamond64/diamond b/pathdiscov/download/diamond64/diamond deleted file mode 100755 index b2eb082e..00000000 Binary files a/pathdiscov/download/diamond64/diamond and /dev/null differ diff --git a/pathdiscov/files/sample.param.base b/pathdiscov/files/sample.param.base index 9d23b027..0e68f726 100755 --- a/pathdiscov/files/sample.param.base +++ b/pathdiscov/files/sample.param.base @@ -45,7 +45,7 @@ command iterative_blast_phylo blast_db_list BLAST_NT,BLAST_NT #,DIAMOND_NR blast db prefix blast_task_list megablast,dc-megablast # options are: megablast dc-megablast blastn, diamond # blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12,-evalue 1e-4 -word_size 7 # blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are -blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12 ##,--compress 0 -p 0 -v -k 10 -w 28 --id 0.7 -c 4 -t TEMPDIR blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are hardwired) +blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12 ##,-v -k 10 -w 28 --id 0.7 -c 4 -t TEMPDIR blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are hardwired) ninst_list NUMINST,NUMINST # the input file will be broken into chunks and blasted in parallel - this parameter is the number of instances of BLAST you want to run in parallel taxonomy_names TAX_NAMES # NCBI taxonomy names dump file taxonomy_nodes TAX_NODES # NCBI taxonomy nodes dump file diff --git a/pathdiscov/iterative_blast_phylo/blast_wrapper.pl b/pathdiscov/iterative_blast_phylo/blast_wrapper.pl index dbec7145..fbbf88b8 100755 --- a/pathdiscov/iterative_blast_phylo/blast_wrapper.pl +++ b/pathdiscov/iterative_blast_phylo/blast_wrapper.pl @@ -48,23 +48,25 @@ if ($type eq "diamond" || !(defined($task))) { $task_option="blastx"; - +} +elsif($type eq "blastx") +{ + $task_option = ""; } else { - $task_option="-task $task"; + $task_option="-task $task"; } -if ($type eq "blastn") +if ($type ne "diamond") { print "[start]\n"; my $cmd = "$type -query $query -db $db $task_option -out $out -outfmt $outfmt -max_target_seqs 10 $options"; verbose_system($cmd); print "[end]\n"; } - -if ($type eq "diamond") +else { print "[start]\n"; my $cmd = "$type $task_option $options -q $query -d $db -o $out"; diff --git a/pathdiscov/iterative_blast_phylo/iterative_blast_phylo.pl b/pathdiscov/iterative_blast_phylo/iterative_blast_phylo.pl index 3fc2bdf2..3eb75ac2 100755 --- a/pathdiscov/iterative_blast_phylo/iterative_blast_phylo.pl +++ b/pathdiscov/iterative_blast_phylo/iterative_blast_phylo.pl @@ -209,28 +209,50 @@ print_system($cmd); my $inputfasta = "$outputdir/$j.$mate.fasta"; - + my $blastexe = "blastn"; + my $blasttask = "--task $blast_task_list[$i]"; + my $outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"; + # Start building the --blast_options string + my $blastoptions = $blast_options_list[$i]; + # If blast type then we will force the format + if( $blast_task_list[$i] ne "diamond" ) { + # Must specify same format as diamond to keep things consistent + $blastoptions = $blastoptions . " -outfmt \\\"$outfmt\\\""; + } + # Ensure string is wrapped in quotes + $blastoptions = "$blastoptions"; + # Filter using get_orf if( $blast_task_list[$i] eq "diamond" || $blast_task_list[$i] eq "blastx" ) { + $blastexe = $blast_task_list[$i]; + $blasttask = ""; + if( $blast_task_list[$i] eq "diamond" ) { + $blasttask = "--task blastx"; + } print "[echo] orf filtering $mate prior to $blast_task_list[$i]\n"; my $odir = "tmp_${mate}_${j}/orf_filter"; my $logs = "$odir/logs"; + my $matename = $mate; print_system("mkdir -p $logs"); - my $cmd = "orf_filter.pl --outputdir $odir --logs $logs --paramfile $abs_pfile --R1 $outputdir/$j.$mate.fasta --sample $sample --timestamp $timestamp"; + if($contig) { + $matename = 'R1'; + } + my $cmd = "orf_filter.pl --outputdir $odir --logs $logs --paramfile $abs_pfile --$matename $outputdir/$j.$mate.fasta --sample $sample --timestamp $timestamp"; if($contig) { $cmd .= " --contig 1"; } verbose_system($cmd); # New input for diamond/blastx will be orf filtered fasta $inputfasta = "$outputdir/$odir/orf_filter.$mate"; - print "[debug] orf_filtered input $inputfasta\n"; + print "[debug] orf_filtered input for $blast_task_list[$i] will now be $inputfasta\n"; my $cmd = "linecount $inputfasta orf_filter $mate.count 2 1"; print_system($cmd); } my $blast_db_nr; - my $cmd = "$path_scripts/par_block_blast.pl --outputdir tmp_".$mate."_$j --inputfasta $inputfasta --db $blast_db_list[$i] --blast_type $blast_task_list[$i] --task $blast_task_list[$i] --ninst $ninst_list[$i] --outfile $outputdir/$j.$mate.blast --outheader $outputdir/blast.header --blast_options \"$blast_options_list[$i]\""; + my $cmd = "parallel_blast $inputfasta $outputdir/$j.$mate.blast --ninst $ninst_list[$i] --db $blast_db_list[$i] --blast_exe $blastexe $blasttask --blast_options \"$blastoptions\""; + verbose_system($cmd); print "[echo] get phylogeny counts\n"; @@ -287,7 +309,7 @@ # get reads that didnt blast # args: blast output, fasta input - my $cmd = "$path_scripts/get_unblast_reads.pl $j.$mate.blast $j.$mate.fasta > $j.$mate.noblast.fasta"; + my $cmd = "$path_scripts/get_unblast_reads.pl $j.$mate.blast $inputfasta > $j.$mate.noblast.fasta"; verbose_system($cmd); # args: input file, filtering_program_name, output file, 2->fasta, concat diff --git a/pathdiscov/orf_filter/orf_filter.pl b/pathdiscov/orf_filter/orf_filter.pl index 07e6f1b4..bfd8cce6 100755 --- a/pathdiscov/orf_filter/orf_filter.pl +++ b/pathdiscov/orf_filter/orf_filter.pl @@ -40,7 +40,8 @@ 'contig=i' => \$contig, # contig bool 'timestamp=s' => \$timestamp); # time stamp -die "[error] required input parameters not found" if (!( defined($outputdir) && defined($logs) && defined($pfile) && defined($r1) )); +die "[error] required input parameters not found" if (!( defined($outputdir) && defined($logs) && defined($pfile) )); +die "[error] required input parameters not found" if ( !defined($r1) && !defined($r2) ); @mates=("contig") if ($contig); # Reset mates to the name contig @@ -119,17 +120,18 @@ #my %h_fasta = map {/>(\w+)_(\w+)\s(.*)/; $1 => 1} split(/\n/, `cat $mate.orfout.fa`); # Should be >(anything)_\d\s(.* my %h_fasta = map {/>(.*?)_(\d+)\s(.*)/; $1 => 1} split(/\n/, `cat $mate.orfout.fa`); - #print Dumper \ %h_fasta; + # print Dumper \ %h_fasta; print "[echo] filter $hoh{$command}{$mate} by orfs into $command.$mate\n"; get_subset_by_fastaid($hoh{$command}{$mate}, "$command.$mate", \%h_fasta); verbose_system("linecount $command.$mate orf_filter $mate.count fastq 1"); } else { - print "Doing nothing because there are no getorf_options set\n"; + print "[info] Value of \$hoh{".$command."}{\"getorf_options\"}:'". $hoh{$command}{"getorf_options"} ."'\n"; + print "[error] there are no getorf_options set\n"; } } # defined - elsif ($mate eq "R1") + elsif (($mate eq "R1" || $mate eq "contig") && !defined($r2)) { if(! -s $hoh{$command}{$mate} ) { print("$hoh{$command}{$mate} is empty\n"); @@ -211,9 +213,9 @@ sub get_subset_by_fastaid $_ = <$fh>; if($_ =~ /^@/) { $format = "fastq"; - print("Detected $infile as fastq\n"); + print("[info] Detected $infile as fastq\n"); } else { - print("Detected $infile as fasta\n"); + print("[info] Detected $infile as fasta\n"); } close($fh); diff --git a/pavement.py b/pavement.py index 5446bd0b..ce49dffe 100644 --- a/pavement.py +++ b/pavement.py @@ -28,6 +28,10 @@ options(setup=setup_dict, + diamond=Bunch( + url='https://github.com/bbuchfink/diamond/releases/download/v0.7.9/diamond-linux64.tar.gz', + sdir=path('pathdiscov/download'), + ), bwa=Bunch( sdir=path('pathdiscov/download'), bindir=path('pathdiscov/bin') @@ -59,7 +63,10 @@ downloads=path('pathdiscov/download'), installdir=join(sys.prefix,'lib') ), - + parallel=Bunch( + downloads=path('pathdiscov/download'), + url='http://ftp.gnu.org/gnu/parallel/parallel-20150722.tar.bz2' + ), wkhtmltopdf=Bunch( sfile = path('pathdiscov/download/wkhtmltopdf'), olink =path('pathdiscov/bin') @@ -293,6 +300,19 @@ def download_install_fastqc(options): else: info("fastqc symlink already exists") +@task +def download_unpack_diamond(options): + diamondbin=join(sys.prefix,'bin','diamond') + if not exists(diamondbin): + info('Downloading diamond') + cmd = 'cd {0} && wget {1} -O- | tar xzvf -' + sh( + cmd.format( + options.diamond.sdir, + options.diamond.url + ) + ) + @task def download_compile_bwa(options): """installs the current package""" @@ -303,6 +323,19 @@ def download_compile_bwa(options): sdir = path(currwd) / options.bwa.sdir sh('(cd %s; wget https://github.com/lh3/bwa/archive/0.7.10.tar.gz -O- | tar xzf -; mv bwa-* bwa; cd bwa; make; cd %s)' % (sdir, sdir)) +@task +def download_compile_gnuparallel(options): + ''' Installs GNU Parallel ''' + prefix = sys.prefix + p_bin = join(sys.prefix,'bin','parallel') + options.url + if not exists(p_bin): + info("Installing GNU Parallel") + sh('cd {0}; wget {1} -O- | tar xjf -; cd parallel-*; ' \ + './configure --prefix={2}; make && make install;'.format( + options.downloads, options.url, prefix) + ) + @task def download_compile_samtools(options): """installs the current package""" @@ -421,7 +454,7 @@ def install_dependencies(): pass @task -@needs('download_compile_bwa','download_compile_samtools','refRay','getorf','download_install_fastqc','installSnap','perl_modules') +@needs('download_compile_bwa','download_compile_samtools','refRay','getorf','download_install_fastqc','installSnap','perl_modules','download_compile_gnuparallel','download_unpack_diamond') def install_other_dependencies(): pass diff --git a/setup.py b/setup.py index 2d3e0fb2..16cfda5d 100644 --- a/setup.py +++ b/setup.py @@ -282,6 +282,7 @@ def run_tests(self): 'sff2fastq = pathdiscov.sff2fastq:main', 'step1 = pathdiscov.stages.step1:main', 'get_blast_reads = pathdiscov.get_blast_reads:main', + 'parallel_blast = pathdiscov.parallel_blast:main', ], }, # These all get copied to our installation's bin folder for us @@ -294,7 +295,7 @@ def run_tests(self): 'pathdiscov/download/ray/Ray', 'pathdiscov/download/Ray2', 'pathdiscov/download/bowtie2/bowtie2', - 'pathdiscov/download/diamond64/diamond', + 'pathdiscov/download/diamond', 'pathdiscov/download/snap/snap', 'pathdiscov/quality_filter/quality_filter.pl', 'pathdiscov/host_map/host_map.pl', @@ -303,8 +304,8 @@ def run_tests(self): 'pathdiscov/ray2_assembly/joinlines.sh', 'pathdiscov/ray2_assembly/ray2_assembly.pl', 'pathdiscov/iterative_blast_phylo/annotate_blast.sh', - 'pathdiscov/iterative_blast_phylo/blast_wrapper.pl', - 'pathdiscov/iterative_blast_phylo/block_blast.sh', + #'pathdiscov/iterative_blast_phylo/blast_wrapper.pl', + #'pathdiscov/iterative_blast_phylo/block_blast.sh', 'pathdiscov/iterative_blast_phylo/fastq2fasta.awk', 'pathdiscov/iterative_blast_phylo/format_iterative_blast_phylo.pl', 'pathdiscov/iterative_blast_phylo/get_unblast_reads.pl', diff --git a/tests/common.py b/tests/common.py index 0169f7bb..799082df 100644 --- a/tests/common.py +++ b/tests/common.py @@ -6,6 +6,8 @@ import sys import glob +from nose.plugins.attrib import attr + import mock import unittest2 as unittest @@ -303,17 +305,27 @@ def _write_param(self, txt, parampath=None): fh.write(txt) return abspath(parampath) + def assertNotEmpty(self, path): + self.assertTrue(exists(path), '{0} does not exist'.format(path)) + self.assertNotEqual(0, os.stat(path).st_size, "{0} is empty".format(path)) + def _verify_countfile(self, expect, countpath): ''' expect is list((name,count),(name2,count)...) where each tuple should match a line in countpath file ''' fh = open(countpath) - for line, ex in zip(fh, expect): + lines = [line.strip() for line in fh] + print "Expected: {0}".format(expect) + print "Found: {0}".format(lines) + self.assertEqual(len(expect), len(lines), "More count entries than expected") + for line, ex in zip(lines, expect): name, count = line.strip().split() count = float(count) exc = float(ex[1]) self.assertEqual(ex[0], name) + print line + print ex self.assertAlmostEqual(exc, count, delta=2) def aexists(path, msg=None): diff --git a/tests/rikkcdna/test_built.py b/tests/rikkcdna/test_built.py new file mode 100644 index 00000000..c92917d3 --- /dev/null +++ b/tests/rikkcdna/test_built.py @@ -0,0 +1,4 @@ +from os.path import * + +def test_tst_databases_are_built(): + assert exists('rikkcdna.nin'), "You did not build databases. See rikkcdna/Readme.rst" diff --git a/tests/test_iterative_blast_phylo.py b/tests/test_iterative_blast_phylo.py index 4076438b..0696a294 100644 --- a/tests/test_iterative_blast_phylo.py +++ b/tests/test_iterative_blast_phylo.py @@ -10,7 +10,7 @@ from common import ( TESTDATA, SCRATCH, PATHDISCOV, RIKKDB, TESTDIR, - aexists + aexists, attr ) import common @@ -30,31 +30,39 @@ # Param paramtxt = ''' command iterative_blast_phylo -blast_db_list {0},{0} -blast_task_list megablast,dc-megablast -blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12 -ninst_list 1,1 -taxonomy_names {1} -taxonomy_nodes {2} -blast_pro_db {3} -'''.format(blastnt,taxnames,taxnodes,blastnr) +blast_db_list {0},{0},{1} +blast_task_list megablast,dc-megablast,blastx +blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12,-evalue 1e-4 -word_size 7 +ninst_list 1,1,1 +taxonomy_names {2} +taxonomy_nodes {3} +blast_pro_db {4} +command orf_filter +getorf_options -minsize 60 -find 0 +'''.format(blastnt,blastnr,taxnames,taxnodes,blastnr) expect_count_contig = [ ('input', '86'), ('megablast', '76'), - ('dc-megablast', '73') + ('dc-megablast', '73'), + ('orf_filter', '73'), + ('blastx', '72') ] expect_count_r1 = [ ('input', '250'), ('megablast', '216'), - ('dc-megablast', '215') + ('dc-megablast', '215'), + ('orf_filter','199'), + ('blastx', '199') ] expect_count_r2 = [ ('input', '250'), ('megablast', '222'), - ('dc-megablast', '218') + ('dc-megablast', '218'), + ('orf_filter', '201'), + ('blastx', '197') ] output_contig = 'iterative_blast_phylo_1.contig' @@ -93,10 +101,17 @@ def test_iterative_blast_phylo_r1r2(self): outdir = inspect.stack()[0][3] logs = join(outdir,'logs') os.makedirs(logs) + sh.step1( + sample='testsample', paramfile=param, + outputdir=outdir, logs=logs, timestamp='0', + R1=self.r1_fq, R2=self.r2_fq + ) + r1 = join(outdir,'R1.fastq') + r2 = join(outdir,'R2.fastq') sh.iterative_blast_phylo( sample='testsample', paramfile=param, outputdir=outdir, logs=logs, timestamp='0', - R1=self.r1_fq, R2=self.r2_fq, fastafile='no', contig='0' + R1=r1, R2=r2, fastafile='no', contig='0' ) aexists(join(outdir,output_r1)) aexists(join(outdir,output_r2)) @@ -107,12 +122,13 @@ def test_iterative_blast_phylo_r1r2(self): self._verify_countfile(expect_count_r1, join(outdir, 'R1.count')) self._verify_countfile(expect_count_r2, join(outdir, 'R2.count')) + @attr('slow') def test_iterative_blast_getorf_diamond(self): paramtxt = ''' command iterative_blast_phylo blast_db_list {0},{0},{1} blast_task_list megablast,dc-megablast,diamond - blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12,--compress 0 -p 1 -v -k 10 -w 28 --id 0.7 -c 4 + blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12,-v -k 10 -w 28 --id 0.7 -c 4 ninst_list 1,1,1 taxonomy_names {2} taxonomy_nodes {3} @@ -136,8 +152,8 @@ def test_iterative_blast_getorf_diamond(self): orf_filter_dir = join(outdir,'tmp_contig_3','orf_filter') aexists(join(orf_filter_dir, 'orf_filter.contig')) aexists(join(orf_filter_dir, 'contig.orfout.fa')) - # copy so we can modify - econtig = [x for x in expect_count_contig] - econtig.append(('orf_filter','73')) + # copy so we can modify and exclude orf_filter and blastx + # counts as we will have our own + econtig = expect_count_contig[:-1] econtig.append(('diamond','72')) self._verify_countfile(econtig, join(outdir, 'contig.count')) diff --git a/tests/test_orf_filter.py b/tests/test_orf_filter.py index cb9cd8af..b0a6b537 100644 --- a/tests/test_orf_filter.py +++ b/tests/test_orf_filter.py @@ -37,6 +37,20 @@ ] class TestOrfFilter(common.StageTestBase): + def test_can_supply_only_r2(self): + param = self._write_param(paramtxt) + outdir = inspect.stack()[0][3] + logs = join(outdir,'logs') + os.makedirs(logs) + sh.orf_filter( + sample='testsample', paramfile=param, + outputdir=outdir, logs=logs, timestamp='0', + R2=self.r2_fq + ) + self.assertTrue(exists(join(outdir,'orf_filter.R2'))) + self.assertTrue(exists(join(outdir,'R2.orfout.fa'))) + self._verify_countfile(expect_count_r2, join(outdir, 'R2.count')) + def test_orf_filter_fastq_r1r2(self): param = self._write_param(paramtxt) outdir = inspect.stack()[0][3] @@ -48,8 +62,8 @@ def test_orf_filter_fastq_r1r2(self): R1=self.r1_fq, R2=self.r2_fq ) - self.assertTrue(exists(join(outdir,'orf_filter.R1'))) - self.assertTrue(exists(join(outdir,'orf_filter.R2'))) + self.assertNotEmpty(join(outdir,'orf_filter.R1')) + self.assertNotEmpty(join(outdir,'orf_filter.R2')) self.assertTrue(exists(join(outdir,'R1.orfout.fa'))) self.assertTrue(exists(join(outdir,'R2.orfout.fa'))) self._verify_countfile(expect_count_r1, join(outdir, 'R1.count'))