diff --git a/pathdiscov/drop_mapped.py b/pathdiscov/drop_mapped.py new file mode 100644 index 00000000..9a03745f --- /dev/null +++ b/pathdiscov/drop_mapped.py @@ -0,0 +1,44 @@ + + +def _discard(_saved, _dropped): + with open(_saved, 'r'), open(_dropped, 'r') as saved, dropped: + try: + dropped_id = next(dropped) + except StopIteration: + warnings.warn("discard file %s was empty.") + return None + # just symlink + for read in SeqIO.parse(saved, 'fastq'): + if str(read.id) == dropped_id: + # read gets filtered because its ID was in discard so don't yield + dropped_id = next(dropped) + else: + yield read +def discard(saved, disc, outpath): + res = discard(saved, disc) + if res: + SeqIO.write(res, outpath, 'fastq') + else: + os.symlink(os.path.abspath(saved), os.path.abspath(outpath)) + +def main(): + parser = argparse.ArgumentParser( + description = 'Drop discarded reads from the other pair.' + ) + parser.add_argument( + '--saved', + help='The file with the unmapped reads.' + ) + parser.add_argument( + '--other-discard', + help='The discard file for the *opposite* member of the pair. + If --saved is R1, this should be R2.discard, etc.' + ) + parser.add_argument( + '--out', + help='The output path' + ) + args = parser.parse_args() + discard(args.saved, args.other_discard, args.out) + + diff --git a/pathdiscov/host_map/host_map.pl b/pathdiscov/host_map/host_map.pl index 0f0d62d7..1ecd3866 100755 --- a/pathdiscov/host_map/host_map.pl +++ b/pathdiscov/host_map/host_map.pl @@ -306,8 +306,7 @@ if ( defined($hoh{$command}{$mate}) && -s $hoh{$command}{$mate} ) { # update mate to new output - $hoh{$command}{$mate} = "$outputdir/map_$j/$mate.unmap.fastq"; - system("ln -sf map_$j/$mate.unmap.fastq $outputdir/host_map_$run_iteration.$mate"); + $hoh{$command}{$mate} = "$outputdir/map_$j/$mate.unmap.fastq.tmp"; my $cmd = "linecount $hoh{$command}{$mate} $mapper_name_list[$i] $mate.count 1 1"; print "[cmd] ",$cmd,"\n"; @@ -315,10 +314,23 @@ # (take this out of loop for efficiency - only needs to be done on the final iteration) # get discard IDs - i.e., the reads filtered in this step - my $cmd = "fastaq_tools_diff.exe --fastq $hoh{$command}{$mate.\"input\"} --fastq $hoh{$command}{$mate} > $mate.discard"; + my $cmd = "fastaq_tools_diff.exe --fastq $hoh{$command}{$mate.\"input\"} --fastq $hoh{$command}{$mate} | sort > $mate.discard"; verbose_system($cmd); } } + + my $cmd = "drop_mapped --saved $hoh{$command}{"R1"} --dropped "R2".discard --out $mapout" + + verbose_system($cmd); + + my $cmd = "drop_mapped --saved $hoh{$command}{"R2"} --dropped "R1".discard --out $mapout" + + verbose_system($cmd); + + foreach my $mate (@mates) + { + system("ln -sf map_$j/$mate.unmap.fastq $outputdir/host_map_$run_iteration.$mate"); + } } # -------------------------------------- diff --git a/setup.py b/setup.py index 2d3e0fb2..85cbafbb 100644 --- a/setup.py +++ b/setup.py @@ -282,6 +282,7 @@ def run_tests(self): 'sff2fastq = pathdiscov.sff2fastq:main', 'step1 = pathdiscov.stages.step1:main', 'get_blast_reads = pathdiscov.get_blast_reads:main', + 'drop_mapped = pathdiscov.drop_mapped:main' ], }, # These all get copied to our installation's bin folder for us