From a460d59b5d09680621e633d91a8a254d073340d5 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 13:53:52 +0200 Subject: [PATCH 01/10] Removed the map folder --- map/run.degner.sh | 47 ---------------------- map/run.tophat.sh | 66 ------------------------------- map/tophat.gensbatch.pl | 65 ------------------------------ map/tophat.singleend.gensbatch.pl | 66 ------------------------------- 4 files changed, 244 deletions(-) delete mode 100644 map/run.degner.sh delete mode 100644 map/run.tophat.sh delete mode 100644 map/tophat.gensbatch.pl delete mode 100644 map/tophat.singleend.gensbatch.pl diff --git a/map/run.degner.sh b/map/run.degner.sh deleted file mode 100644 index da210e7..0000000 --- a/map/run.degner.sh +++ /dev/null @@ -1,47 +0,0 @@ -#Map with tophat -#DEP: tophat.gensbatch.pl - - -### -#Tophat -### -#Longest run: 5h38min, 6G file for each readpair: 8_LPS.readp_1.2.ill.fastq -email='alva.rani@scilifelab.se' -projid='b2012046' -execdir='/bubo/home/h24/alvaj/glob/code/ASE/map' -fastqdir='/proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt' -fastqsuffix='.filter.fastq' -outdir='/proj/b2012046/rani/analysis/degner' -sbatchdir='/proj/b2012046/rani/scripts/degner' - -threads='8' -ref='/bubo/home/h24/alvaj/glob/code/ASE/degner/bowtie-0.12.8/indexes/ref/Ref.fa ' - -time='23:00:00' - -threads='8' -annotdir='/bubo/home/h26/edsgard/glob/annotation/human/' -gtf=${annotdir}'Homo_sapiens.GRCh37.59.gtf' -time='23:00:00' #Longest run: 7h10min: Input: ~7.3G per readpair fastq file -readlen='100' -isizefile='/proj/b2011075/data/rnaseq/bioanalyzer/isizes.only.tab' -isizedev='75' #very rough approx from the bioanalyzer plots. Also, we use trimming which further adds to the deviation. 
- -cd $fastqdir -## creating a sample -ls *.fastq | awk '{split($0,b,"."); print b[1]}' > ${sbatchdir}/samples.list -cd $sbatchdir -rm cmds.sh - -samples=(`cat samples.list`) -for sample in ${samples[@]} -do -find $fastqdir -maxdepth 1 -name ${sample}'*' | sed 's/[12].filter.fastq//' | grep -v '.S.filter.fastq' | sort -u >fastqfiles.prefix.list - cat fastqfiles.prefix.list | xargs -I% echo perl ${execdir}/'tophat.gensbatch.pl' $sample % $fastqsuffix $outdir $threads $gtf $ref $isizefile $sbatchdir $projid $email $time $readlen $isizedev >>cmds.sh -done -sh cmds.sh -find . -name '*.tophat.sbatch' | xargs -I% sbatch % - -## Submitted again - - diff --git a/map/run.tophat.sh b/map/run.tophat.sh deleted file mode 100644 index dcfda5e..0000000 --- a/map/run.tophat.sh +++ /dev/null @@ -1,66 +0,0 @@ -#! /bin/bash -l -#Map with tophat -#DEP: tophat.gensbatch.pl - - -### -#Tophat -### -#Longest run: 5h38min, 6G file for each readpair: 8_LPS.readp_1.2.ill.fastq -email='alva.rani@scilifelab.se' -projid='b2012046' -#execdir='/bubo/home/h24/alvaj/glob/code/ASE/map' -fastqdir='/proj/b2012046/edsgard/ase/sim/data/degner' -#fastqdir='/proj/b2012046/rani/data/fastq' -#fastqfile='testsingle.bqB.fastq' -outdir='/proj/b2012046/rani/analysis/monte' -sbatchdir='/proj/b2012046/rani/scripts/monte' - -#Vars. -#threads='8' -annotdir='/bubo/home/h26/edsgard/glob/annotation/human/' -gtf=${annotdir}/'Homo_sapiens.GRCh37.59.gtf' -ref='/bubo/home/h26/edsgard/glob/annotation/bowtie/concat.fa' -#time='01:00:00' - -#Longest run: 7h10min: Input: ~7.3G per readpair fastq file - -#readlen='100' -#isize='175' -#isizedev='75' - -#very rough approx from the bioanalyzer plots. Also, we use trimming which further adds to the deviation. 
- -cd $sbatchdir -#rm cmds.sh -#samples=(`cat samples.list`) - -#for sample in ${samples[@]} -#do -#find $fastqdir -maxdepth 1 -name ${samples}'*' | sort -u >fastqfiles.list - -(cat <${outdir}/.tophat.sbatch -EOF -) > sbatch.tophat - -cat sbatch.tophat >tophat.sbatch -sbatch tophat.sbatch - -#find . -name 'sbatch.tophat' | xargs -I% sbatch % - - - -#submitted diff --git a/map/tophat.gensbatch.pl b/map/tophat.gensbatch.pl deleted file mode 100644 index ba15215..0000000 --- a/map/tophat.gensbatch.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl - -use strict; -use File::Basename; -use File::Spec::Functions; - - -my ($sample, $fastqprefix, $fastqsuffix, $outdir, $threads, $gtf, $ref, $isizefile, $ods, $projid, $email, $time, $readlen, $isizedev) = @ARGV; - -#set vars -my $fastqfile_1 = $fastqprefix . '1' . $fastqsuffix; -my $fastqfile_2 = $fastqprefix . '2' . $fastqsuffix; -my $fastqprefix_base = basename($fastqprefix); -my $outdir = catfile($outdir, $fastqprefix_base); -$outdir =~ s/\.$//; - -#set sbatch file -my $sbatch_file = catfile($ods, $fastqprefix_base . $fastqsuffix . '.tophat.sbatch'); -$sbatch_file =~ s/\.\./\./; - -#set insert size (NB: actually inner distance) -my $isize; -open(ISIZE, '<', $isizefile) or die("Couldnt open $isizefile, $!\n"); -while(defined(my $line = )){ - chomp($line); -# print 'DEBUG: ' . $line . "\n"; - my ($jsample, $isize_jsample) = split(/\t/, $line); -# print 'DEBUG: jsample: ' . $jsample . "\n"; -# print 'DEBUG: sample: ' . $sample . "\n"; -# my ($isize_jsample, $isize_jsample, $isizedev_jsample) = split(/ /, $line); - if($jsample =~ $sample){ - $isize = $isize_jsample; - $isize = $isize - 2 * $readlen; #Tophat uses the inner distance: isize - 2*readlen -# $isizedev = $isizedev_jsample; - } -} -close(ISIZE); - -#create outdirs -my $cmd = 'mkdir -p ' . catfile($outdir, 'log'); -system($cmd); -my $cmd = 'mkdir -p ' . 
$ods; -system($cmd); - -#print sbatch file -open(SBATCH, '>', $sbatch_file) or die("Couldn't open $sbatch_file"); -print SBATCH '#!/bin/bash -l' . "\n"; -print SBATCH '#SBATCH -A ' . $projid . "\n"; -print SBATCH '#SBATCH -t ' . $time . "\n"; -print SBATCH '#SBATCH -J tophat' . "\n"; -print SBATCH '#SBATCH -p node -n ' . $threads . "\n"; -print SBATCH '#SBATCH -e ' . $outdir . '/log/tophat.jid_%j.stderr' . "\n"; -print SBATCH '#SBATCH -o ' . $outdir . '/log/tophat.jid_%j.stdout' . "\n"; -print SBATCH '#SBATCH --mail-type=All' . "\n"; -print SBATCH '#SBATCH --mail-user=' . $email . "\n"; - -#set vars -print SBATCH 'module load bioinfo-tools' . "\n"; -print SBATCH 'module load samtools/0.1.18' . "\n"; -print SBATCH 'module load tophat/1.4.0' . "\n"; - -#print cmd -print SBATCH 'tophat --solexa1.3-quals -p ' . $threads . ' -o ' . $outdir . ' --GTF ' . $gtf . " -r '" . $isize . "' --mate-std-dev " . $isizedev . ' ' . $ref . ' ' . $fastqfile_1 . ' ' . $fastqfile_2 . "\n"; -close(SBATCH); - diff --git a/map/tophat.singleend.gensbatch.pl b/map/tophat.singleend.gensbatch.pl deleted file mode 100644 index ff4976c..0000000 --- a/map/tophat.singleend.gensbatch.pl +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/perl - -use strict; -use File::Basename; -use File::Spec::Functions; - - -my ($fastqprefix, $outdir, $threads, $gtf, $ref,$ods, $projid, $email, $time, $readlen) = @ARGV; - -#set vars -my $fastqfile_1 = $fastqprefix . '1'; -#my $fastqfile_2 = $fastqprefix . '2' . $fastqsuffix; -my $fastqprefix_base = basename($fastqprefix); -my $outdir = catfile($outdir, $fastqprefix_base); -$outdir =~ s/\.$//; - -#set sbatch file -my $sbatch_file = catfile($ods, $fastqprefix_base . '.tophat.sbatch'); -$sbatch_file =~ s/\.\./\./; - -#set insert size (NB: actually inner distance) -my $isize; -open(ISIZE, '<', $isizefile) or die("Couldnt open $isizefile, $!\n"); -#while(defined(my $line = )){ -# chomp($line); -# print 'DEBUG: ' . $line . 
"\n"; - # my ($jsample, $isize_jsample) = split(/\t/, $line); -# print 'DEBUG: jsample: ' . $jsample . "\n"; -# print 'DEBUG: sample: ' . $sample . "\n"; -# my ($isize_jsample, $isize_jsample, $isizedev_jsample) = split(/ /, $line); - #if($jsample =~ $sample){ - # $isize = $isize_jsample; - # $isize = $isize - 2 * $readlen; #Tophat uses the inner distance: isize - 2*readlen -# $isizedev = $isizedev_jsample; - #} -#} -#close(ISIZE); - -#create outdirs -my $cmd = 'mkdir -p ' . catfile($outdir, 'log'); -system($cmd); -my $cmd = 'mkdir -p ' . $ods; -system($cmd); - -#print sbatch file -open(SBATCH, '>', $sbatch_file) or die("Couldn't open $sbatch_file"); -print SBATCH '#!/bin/bash -l' . "\n"; -print SBATCH '#SBATCH -A ' . $projid . "\n"; -print SBATCH '#SBATCH -t ' . $time . "\n"; -print SBATCH '#SBATCH -J tophat' . "\n"; -print SBATCH '#SBATCH -p node -n ' . $threads . "\n"; -print SBATCH '#SBATCH -e ' . $outdir . '/log/tophat.jid_%j.stderr' . "\n"; -print SBATCH '#SBATCH -o ' . $outdir . '/log/tophat.jid_%j.stdout' . "\n"; -print SBATCH '#SBATCH --mail-type=All' . "\n"; -print SBATCH '#SBATCH --mail-user=' . $email . "\n"; - -#set vars -print SBATCH 'module load bioinfo-tools' . "\n"; -print SBATCH 'module load samtools/0.1.18' . "\n"; -print SBATCH 'module load tophat/1.4.0' . "\n"; - -#print cmd -print SBATCH 'tophat --solexa1.3-quals -p ' . $threads . ' -o ' . $outdir . ' --GTF ' . $gtf . " -r '" . $isize . "' --mate-std-dev " . $isizedev . ' ' . $ref . ' ' . $fastqfile_1 . ' ' . 
"\n"; -close(SBATCH); - - From eb535acd2f8bdad418b73e43b89a5300656d9277 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 13:54:58 +0200 Subject: [PATCH 02/10] Removed degner from GSNAP --- degner/get.overlaps.v3.perl | 92 ------------------------------------- degner/run.degner.sh | 71 ---------------------------- 2 files changed, 163 deletions(-) delete mode 100644 degner/get.overlaps.v3.perl delete mode 100644 degner/run.degner.sh diff --git a/degner/get.overlaps.v3.perl b/degner/get.overlaps.v3.perl deleted file mode 100644 index adead9b..0000000 --- a/degner/get.overlaps.v3.perl +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/perl - -use strict; - -my $snpfile = shift @ARGV; -my $genomepath = shift @ARGV; -my $readlength = shift @ARGV; -my $readquality = shift @ARGV; -my $seqfile; -my @chroms = (1..22, 'X', 'Y'); -foreach my $chr (@chroms){ - read_files(($chr, $snpfile, $seqfile)); - print_overlaps($chr); -} - -sub read_files { - my $chr = shift @_; - my $snpfile = shift @_; - my $seqfile = shift @_; - open(SNP, '<', $snpfile) or die "cannot open snpfile"; - open(SEQUENCE, "cat $genomepath/chr.$chr.fa |") or die "cannot open seqfile"; -#DE: chr-formatted ref (UCSC): open(SEQUENCE, "zcat $genomepath/chr$chr.fa.gz |") or die "cannot open seqfile"; -} - -sub print_overlaps { - - my $chr = shift @_; - my @sequence_arr = ; - chomp(@sequence_arr); - shift(@sequence_arr); #remove fasta header - my $sequence = join("", @sequence_arr); -#DE: UCSC formatted: $sequence =~ s/>chr$chr//g; - close(SEQUENCE); - - #DE: removes header: ; - while (defined(my $line = )) { - chomp($line); - my ($snpid, $thischr, $pos, $strand, $ref, $alt) = split(/ /, $line); - - my $plusbase; -#DE: chr-formatted ref: if ($thischr eq "chr$chr") { - if ($thischr eq $chr) { - - ### - #Print short read identical to reference seq - ### - if ($strand eq "+") { - $plusbase = $ref - } elsif ($strand eq "-") { - $plusbase = $ref; - $plusbase =~ tr/ATGCYRKMBDHVatgcyrkmbdhv/TACGRYMKVHDBtacgrymkvhdb/; - 
} - - for (my $jbase=0; $jbase<$readlength; $jbase++) { - my $short = substr($sequence, ($pos-($readlength-1-$jbase)-1), $readlength-1-$jbase) . $plusbase . substr($sequence, ($pos), $jbase); - my $RC = $short; - $RC =~ tr/ATGCYRKMBDHVatgcyrkmbdhv/TACGRYMKVHDBtacgrymkvhdb/; - $RC = reverse($RC); - my $readname = join('_', ($snpid, $thischr, $pos, 'REF_+', $plusbase, $jbase)); - print('@', $readname, "\n", $short, "\n", '+', $readname, "\n", $readquality, "\n"); - $readname = join('_', ($snpid, $thischr, $pos, 'REF_-', $plusbase, $jbase)); - print('@', $readname, "\n", $RC, "\n", '+', $readname, "\n", $readquality, "\n"); -# print('@', "$snpid_$thischr_$pos_REF_+_$plusbase", "_$jbase", "\n$short\n", '+', "$snpid_$thischr_$pos_REF_+_$plusbase", "_$jbase", "\n$readquality\n"); -# print('@', "$snpid_$thischr_$pos_REF_-_$plusbase", "_$jbase", "\n$RC\n", '+', "$snpid_$thischr_$pos_REF_-_$plusbase", "_$jbase", "\n$readquality\n"); - } - - ### - #Print short read with alternate allele - ### - if ($strand eq "+") { - $plusbase = $alt - } elsif ($strand eq "-") { - $plusbase = $alt; - $plusbase =~ tr/ATGCYRKMBDHVatgcyrkmbdhv/TACGRYMKVHDBtacgrymkvhdb/; - } - - for (my $jbase=0; $jbase<$readlength; $jbase++) { - my $short = substr($sequence, ($pos-($readlength-1-$jbase)-1), $readlength-1-$jbase) . $plusbase . 
substr($sequence, ($pos), $jbase); - my $RC = $short; - $RC =~ tr/ATGCYRKMBDHVatgcyrkmbdhv/TACGRYMKVHDBtacgrymkvhdb/; - $RC = reverse($RC); - my $readname = join('_', ($snpid, $thischr, $pos, 'NONREF_+', $plusbase, $jbase)); - print('@', $readname, "\n", $short, "\n", '+', $readname, "\n", $readquality, "\n"); - $readname = join('_', ($snpid, $thischr, $pos, 'NONREF_-', $plusbase, $jbase)); - print('@', $readname, "\n", $RC, "\n", '+', $readname, "\n", $readquality, "\n"); -# print('@', "$snpid_$thischr_$pos_NONREF_+_$plusbase", "_$jbase", "\n$short\n", '+', "$snpid_$thischr_$pos_NONREF_+_$plusbase", "_$jbase", "\n$readquality\n"); -# print('@', "$snpid_$thischr_$pos_NONREF_-_$plusbase", "_$jbase", "\n$RC\n", '+', "$snpid_$thischr_$pos_NONREF_-_$plusbase", "_$jbase", "\n$readquality\n"); - } - } - } - close(SNP); -} diff --git a/degner/run.degner.sh b/degner/run.degner.sh deleted file mode 100644 index e42cb86..0000000 --- a/degner/run.degner.sh +++ /dev/null @@ -1,71 +0,0 @@ - -#Simulate data a la Degner -#DEP: get.overlaps.v3.perl (from Degner) - - -### -#Format input variants -### -#id chr pos strand ref alt -#EX: 10.100000020 10 100000020 + A G - -vcfdir='/proj/b2012046/edsgard/ase/sim/data/varcalls' -vars=${vcfdir}/vars2degner.dp10.txt -cat ${vcfdir}/*.hetvars.alleles | sort -u | awk -F'\t' '{print $1"."$2, $1, $2, "+", $3, $4;}' >$vars - -#Check: -wc -l vars2degner.dp10.txt #112901 - - -### -#Create simulated read set -### -#TBD: Add noise. -#TBD: NB: Disregards if >1 SNP covering a read! -#TBD: PE reads not simulated, only SEs! 
-degneroutdir='/proj/b2012046/edsgard/ase/sim/data/degner' -fastqoutfile=${degneroutdir}/mpileup.dp10.len100.bqB.fastq -vcfdir='/proj/b2012046/edsgard/ase/sim/data/varcalls' -vars=${vcfdir}/vars2degner.dp10.txt -projid='b2010035' -time='4:57:00' -logdir=${degneroutdir}/log -exedir='/bubo/home/h26/edsgard/glob/code/ase/degner' -refdir='/bubo/home/h26/edsgard/glob/annotation/bowtie' -readlen='100' -readqual='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB' - -mkdir -p $logdir -srun -A $projid -p node -t $time -e ${logdir}/sim.err -o $fastqoutfile perl ${exedir}/get.overlaps.v3.perl $vars $refdir $readlen $readqual & -#Status: submitted - - -### -#Map with Tophat -### -#See map/run.tophat.sh -#You'll need to amend it such that it takes single end reads instead of paired end reads! - - -### -#rmDups -### -#Skip rmdups - - -### -#Merge and sort bam files -### -#TBD: Not sure that you need this. But in that case see bam/bamproc.sh - - -### -#MPILEUP (no varcall) -### -#See basecount/basecount.sh - - -### -#Mapping bias modification a la Montgomery -### -#I'll get back to you with code for that once you've reached this point:) From 2a409b24008bea70f5be601c38a930cc634974aa Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 13:58:49 +0200 Subject: [PATCH 03/10] Removed all files --- bam/bamproc.sh | 87 -------------- bam/bamstats.sh | 44 ------- bam/bedtoolshist2pdf.R | 158 ------------------------- bam/coverage.pl | 68 ----------- bam/mergebams.pl | 64 ---------- bam/rmdups.pl | 60 ---------- basecount/basecount.sh | 70 ----------- basecount/dphetfilter.R | 79 ------------- basecount/mpileup.allelecount.pl | 75 ------------ basecount/vcfI162basecount.sh | 3 - test/run.gunzip.sh | 29 ----- test/test.sh | 193 ------------------------------- test/testgsnap for smallseq.sh | 22 ---- varcall/mpileup_multisample.pl | 73 ------------ varcall/varcall.sh | 57 --------- 15 files changed, 1082 deletions(-) delete mode 
100644 bam/bamproc.sh delete mode 100644 bam/bamstats.sh delete mode 100644 bam/bedtoolshist2pdf.R delete mode 100644 bam/coverage.pl delete mode 100644 bam/mergebams.pl delete mode 100644 bam/rmdups.pl delete mode 100644 basecount/basecount.sh delete mode 100644 basecount/dphetfilter.R delete mode 100644 basecount/mpileup.allelecount.pl delete mode 100644 basecount/vcfI162basecount.sh delete mode 100644 test/run.gunzip.sh delete mode 100644 test/test.sh delete mode 100644 test/testgsnap for smallseq.sh delete mode 100644 varcall/mpileup_multisample.pl delete mode 100644 varcall/varcall.sh diff --git a/bam/bamproc.sh b/bam/bamproc.sh deleted file mode 100644 index 17b97b5..0000000 --- a/bam/bamproc.sh +++ /dev/null @@ -1,87 +0,0 @@ -#Processing bam files -#DEP: rmdups.pl -#DEP: mergebams.pl - - -### -#rmDups -### -email='alva.rani@scilifelab.se' -projid='b2012046' -execdir='/bubo/home/h24/alvaj/glob/code/ASE/bam' -sbatchdir='/proj/b2012046/rani/scripts/gsnap/unmapped' -bamfile='/proj/b2012046/rani/analysis/gsnap/bamsort/' -cd $sbatchdir -find $bamfile -name '*.uniqmapped.bam' | awk -F'/' -v execdir=$execdir -v sbatchdir=$sbatchdir -v projid=$projid -v email=$email '{print "perl", execdir"/rmdups.pl", $0, $NF, sbatchdir, projid, email;}' >rmdups.sh -sh rmdups.sh -find . 
-name '*.sbatch' | xargs -I% sbatch % -#### - - -#Check: -find $bamfile -name '*.bam.sorted.nodup' | xargs -I% ls -lrth % -find $bamfile -name '*.rmdups.stderr' | xargs -I% grep -i 'err' -find $bamfile -name '*.rmdups.stderr' | xargs -I% grep -i 'warn' - -#rm unsorted bams -find $bamfile -name '*.bam' | xargs -I% rm % - -#rename bamfiles -find -name '*.bam.sorted' | xargs -I% echo rename '.bam.sorted' '.sorted.bam' % >cmds.sh -sh cmds.sh - - -#rename bamfiles -find $bamdir -name '*.bam.sorted.nodup' | xargs -I% echo rename '.bam.sorted.nodup' '.sorted.nodup.bam' % >cmds.sh -sh cmds.sh -#### - -### -#Merge and sort bam files -### -projid='b2012046' -email='alva.rani@scilifelab.se' -sbatchdir='/proj/b2012046/rani/scripts/gsnap/mergebams' -execdir='/bubo/home/h24/alvaj/glob/code/ASE/bam' -bamsort='/proj/b2012046/rani/analysis/gsnap/bamsort' -outdir='/proj/b2012046/rani/analysis/gsnap/mergebam' -time='11:00:00' -cd $sbatchdir -find $bamsort -name '*.sorted.nodup.bam' | sed 's/\..*//' | sort -u >samples.list -cat samples.list | xargs -I% basename % | xargs -I% echo perl ${execdir}'/mergebams.pl' % $bamsort $outdir $sbatchdir $projid $email $time >cmds.sh -sh cmds.sh -find $sbatchdir -name '*.mergesams.sbatch' | xargs -I% sbatch % - - -## Submitted -#Check: -find ${bamfile}/info/*.stderr | xargs -I% grep -i 'err' % -find ${bamfile}/info/*.mergesams.*.stderr | xargs -I% grep -i 'warn' % -#### - -### -#Create indexes -### -bamsort='/proj/b2012046/rani/analysis/gsnap/mergebam' -find $bamsort -maxdepth 1 -name '*.bam' | xargs -I% echo samtools index % > cmd.sh -#Manually add sbatch header to the cmds.sh script -(cat <bamindex.sbatchheader -cat bamindex.sbatchheader cmd.sh >bamindex.sbatch -sbatch bamindex.sbatch - -############################ -## Done all steps here without errors - diff --git a/bam/bamstats.sh b/bam/bamstats.sh deleted file mode 100644 index 871c599..0000000 --- a/bam/bamstats.sh +++ /dev/null @@ -1,44 +0,0 @@ -#Stats on bam files -#DEP: coverage.pl -#DEP: 
bedtoolshist2pdf.R - - -### -#Calculate number of mapped reads (More than the input number of reads, due to non-unique mappings!) -### -module load bioinfo-tools -module load samtools/0.1.16 -find '/proj/b2012046/rani/analysis/gsnap' -name 'accepted_hits.bam.sorted.nodup' -exec sh -c "samtools flagstat {} >{}.samtools.flagstat.out" \; & -find '/proj/b2012046/rani/analysis/gsnap' -name '*.samtools.flagstat.out' | xargs -I% awk 'NR == 1 {print FILENAME, $1;}' % >nmapped.reads - - -### -#Calculate coverage -### - -#BEDtools (see coverage.pl)--IMP -prg='/proj/b2012046/rani/scripts/gsnap/coverage.pl' -find '/proj/b2012046/rani/analysis/gsnap' -name 'merged.bam' | awk -F'/' -v ods="$ods" -v projid="$projid" -v em="$email" -v prg="$prg" '{print "perl", prg, $0, $8, ods, projid, em;}' >bedtoolscov.sh - - -### -#Generate pdf of the coverage--NOT needed now -### -find . -name 'ccds.bedtools.out2' -exec sh -c 'grep ^all {} >{}.all' \; -find . -name 'ensembl.bedtools.out' -exec sh -c 'grep ^all {} >{}.all' \; -find . -name 'genome.bedtools.out2' -exec sh -c 'grep ^genome {} >{}.all' \; -find . -name 'ccds.*2.all' | zip ccds.bedtools.all.zip -@ -find . -name 'genome.*2.all' | zip genome.bedtools.all.zip -@ -find . -name 'ccds*2.all' | awk -F'/' '{print "mv", $0, "coverage/"$2"."$4;}' >mv.sh -find . -name 'genome*2.all' | awk -F'/' '{print "mv", $0, "coverage/"$2"."$4;}' >mv.sh -find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*ccds.*2.all' | sed 's/.*\///' | awk -F'.' '{print $1;}' >samples -find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*ccds.*2.all' >files -paste samples files >ccds.histfiles -find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*genome.*2.all' | sed 's/.*\///' | awk -F'.' 
'{print $1;}' >samples -find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*genome.*2.all' >files -paste samples files >genome.histfiles - -bedtoolshist2pdf.R ccds.histfiles - -#NB: no pair after read-QC, rmdups, but not: unique, MQ, properly paired. What is M (mismatches)set to? - diff --git a/bam/bedtoolshist2pdf.R b/bam/bedtoolshist2pdf.R deleted file mode 100644 index 9ec9e85..0000000 --- a/bam/bedtoolshist2pdf.R +++ /dev/null @@ -1,158 +0,0 @@ - -#USAGE: Rscript bedtoolshist2pdf.R histfilelist.file - -#argv = commandArgs(trailingOnly=TRUE) -#histfilelist.file = argv - -annot = 'ccds' -histfilelist.file = '../res/hs_pipe/run1/coverage/ccds.histfiles' -resdir = dirname(histfilelist.file) -tab.outfile = file.path(resdir, 'ccds.cumsum.tab') -hist.pdf.file = file.path(resdir, 'ccds.hist.pdf') - -annot = 'genome' -histfilelist.file = '../res/hs_pipe/run1/coverage/genome.histfiles' -resdir = dirname(histfilelist.file) -tab.outfile = file.path(resdir, 'genome.cumsum.tab') -hist.pdf.file = file.path(resdir, 'genome.hist.pdf') - - -#read data -histfile2id = read.table(histfilelist.file, stringsAsFactors=FALSE, row.names=1) -colnames(histfile2id) = 'filename' -#hist.files = c('../res/hs_pipe/test/bedtools.genome.all.hist', '../res/hs_pipe/test/bedtools.ensembl.all.hist', '../res/hs_pipe/test/bedtools.ccds.all.hist') - -#set linetype (dashed for unstim) -samples = rownames(histfile2id) -n.samples = length(samples) -ltypes = rep(1, n.samples) -names(ltypes) = samples -ltypes[grep('unstim', names(ltypes))] = 2 - -#set color -library('RColorBrewer') -individual = gsub('^(\\d+)_.*', '\\1', samples, perl = TRUE) -ind2color = brewer.pal(8, 'Dark2') -names(ind2color) = unique(individual) -colors = ind2color[individual] - -histfile2id = cbind(histfile2id, ltypes, colors, stringsAsFactors=FALSE) - -main <- function(histfile2id, tab.outfile, hist.pdf.file){ - hist.tabs = read.files(histfile2id) - dp2frac = get.dp2frac(hist.tabs) - dp2bases = 
get.dp2bases(hist.tabs) - plot.dp2frac(dp2frac, ltypes = histfile2id[names(dp2frac), 'ltypes'], colors = histfile2id[names(dp2frac), 'colors']) - plot.dp2bases(dp2bases) - - - #Plot - if(annot == 'genome'){ - ylab = 'bases covered' - pdf(file = hist.pdf.file, width = 10, height = 7) - par(mfrow = c(1, 2)) - unstim.ind = grep('unstim', names(dp2bases)) - dp2bases.unstim = dp2bases[unstim.ind] - plot.dp2y(dp2bases.unstim, ltypes = histfile2id[names(dp2bases.unstim), 'ltypes'], colors = histfile2id[names(dp2bases.unstim), 'colors'], ylab = ylab, min.plot.dp=2) - - lps.ind = grep('LPS', names(dp2bases)) - dp2bases.lps = dp2bases[lps.ind] - plot.dp2y(dp2bases.lps, ltypes = histfile2id[names(dp2bases.lps), 'ltypes'], colors = histfile2id[names(dp2bases.lps), 'colors'], ylab = ylab, min.plot.dp = 2) - dev.off() - } - else{ - #Plot - pdf(file = hist.pdf.file, width = 10, height = 7) - par(mfrow = c(1, 2)) - unstim.ind = grep('unstim', names(dp2frac)) - dp2frac.unstim = dp2frac[unstim.ind] - plot.dp2frac(dp2frac.unstim, ltypes = histfile2id[names(dp2frac.unstim), 'ltypes'], colors = histfile2id[names(dp2frac.unstim), 'colors']) - - lps.ind = grep('LPS', names(dp2frac)) - dp2frac.lps = dp2frac[lps.ind] - plot.dp2frac(dp2frac.lps, ltypes = histfile2id[names(dp2frac.lps), 'ltypes'], colors = histfile2id[names(dp2frac.lps), 'colors']) - dev.off() - } - - #Print table - max.dp = 101 - dp2frac.mat = dp2frac[[1]][1:max.dp] - for(jfile in 2:length(dp2frac)){ - dp2frac.mat = cbind(dp2frac.mat, dp2frac[[jfile]][1:max.dp]) - } - colnames(dp2frac.mat) = names(dp2frac) - - rownames(dp2frac.mat) = 0:(max.dp - 1) - write.table(dp2frac.mat, sep = '\t', quote = FALSE, col.names = NA, file = tab.outfile) - - #avg cov - avg.cov = unlist(lapply(hist.tabs, function(hist.tab){sum(as.numeric(hist.tab[, 'dp'] * hist.tab[, 'bases.cov'])) / hist.tab[1, 'feat.len']})) - write.table(round(avg.cov), sep='\t', quote = F, col.names = F, file = file.path(resdir, 'ccds.avgcov.tab')) - - #y-axis: n.genes, 
x-axis: avg.cov -} - -read.files <- function(histfile2id){ - - samples = rownames(histfile2id) - n.hist = length(samples) - - hist.tabs = list() - length(hist.tabs) = n.hist - names(hist.tabs) = samples - for(jsample in samples){ - hist.tab = read.table(histfile2id[jsample, 'filename'], sep='\t', stringsAsFactors=FALSE) - colnames(hist.tab) = c('feat', 'dp', 'bases.cov', 'feat.len', 'frac.cov') - hist.tabs[[jsample]] = hist.tab - } - - return(hist.tabs) -} - -get.dp2frac <- function(hist.tabs){ -#fraction of bases covered against dp - - n.hist = length(hist.tabs) - dp2frac = list() - length(dp2frac) = n.hist - hist.files = names(hist.tabs) - names(dp2frac) = hist.files - for(hist.file in hist.files){ - hist.tab = hist.tabs[[hist.file]] - dp2frac[[hist.file]] = rev(cumsum(hist.tab[rev(1:nrow(hist.tab)), 'frac.cov'])) - } - - return(dp2frac) -} - -get.dp2bases <- function(hist.tabs){ -#number of bases covered against dp - n.hist = length(hist.tabs) - dp2bases = list() - length(dp2bases) = n.hist - hist.files = names(hist.tabs) - names(dp2bases) = hist.files - for(hist.file in hist.files){ - hist.tab = hist.tabs[[hist.file]] - dp2bases[[hist.file]] = rev(cumsum(hist.tab[rev(1:nrow(hist.tab)), 'bases.cov'])) - } - - return(dp2bases) -} - -plot.dp2y <- function(dp2frac, max.plot.dp = 100, ylab = 'fraction covered', ltypes, colors, min.plot.dp = 1){ - - n.hist = length(dp2frac) - - ylim = range(unlist(dp2frac)) - ylim = c(0, 5e8) - jhist = 1 - plot(dp2frac[[jhist]][min.plot.dp:max.plot.dp], xlab = 'depth', ylab = ylab, col=colors[jhist], type = 'l', ylim = ylim, lty = ltypes[jhist]) - for(jhist in 2:n.hist){ - lines(dp2frac[[jhist]][min.plot.dp:max.plot.dp], col=colors[jhist], lty = ltypes[jhist]) - } - legend('topright', legend = names(dp2frac), lty = ltypes, col = colors) -} - -#execute -#main(histfile2id) diff --git a/bam/coverage.pl b/bam/coverage.pl deleted file mode 100644 index 4484e7e..0000000 --- a/bam/coverage.pl +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/perl 
- -#mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select chrom, size from hg19.chromInfo" | sed 's/^chr//' | sed 's/^M/MT/' > $chrsizes -#prg='/bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/coverage.pl' -#find '/proj/b2011075/analysis/hs_pipe/outdata/run1' -name 'merged.bam' | awk -F'/' -v ods="$ods" -v projid="$projid" -v em="$em" -v prg="$prg" '{print "perl", prg, $0, $8, ods, projid, em;}' >bedtoolscov.sh - -use strict; - -#my ($bamfile, $sampleid, $ods, $aid, $em) = @ARGV; - -my $bed='/bubo/home/h26/edsgard/glob/annotation/human/CCDS-hg19.chrstripped.bed'; -my $gtf='/bubo/home/h26/edsgard/glob/annotation/human/Homo_sapiens.GRCh37.59.gtf'; -my $genome='/bubo/home/h26/edsgard/glob/annotation/human/hg19.chrsizes.chrstripped'; - -coverage((@ARGV, $bed, $gtf, $genome)); - -sub coverage { - - use File::Spec::Functions; - use File::Basename; - - my $bamfile = shift @_; - my $sampleid = shift @_; - my $ods = shift @_; - my $aid = shift @_; - my $em = shift @_; - my $bed = shift @_; - my $gtf = shift @_; - my $genome = shift @_; - my $time = '3:00:00'; - - my $sbatch_dir = catfile($ods, $sampleid, 'coverage'); - my $sbatch_file = catfile($sbatch_dir, "$sampleid.coverage.sbatch2"); - my $outdata_dir = catfile(dirname(dirname($bamfile)), 'coverage'); - my $outdata_infodir = catfile($outdata_dir, 'info'); - - #make dirs - `mkdir -p $outdata_infodir;`; #Creates the data and info dir - `mkdir -p $sbatch_dir;`; - - open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); - - #print sbatch vars - print SBATCH "#!/bin/bash -l", "\n"; - print SBATCH "#SBATCH -A ", $ods, "\n"; - print SBATCH "#SBATCH -t ", $time, "\n"; - print SBATCH "#SBATCH -J Cov_", $sampleid, "\n"; - print SBATCH "#SBATCH -p node -n 1", "\n"; - print SBATCH "#SBATCH -e $outdata_infodir/$sampleid.bedtools.cov.stderr2", "\n"; - print SBATCH "#SBATCH -o $outdata_infodir/$sampleid.bedtools.cov.stdout2", "\n"; - - unless ($em eq 0) { - print SBATCH "#SBATCH --mail-type=All", 
"\n"; - print SBATCH "#SBATCH --mail-user=$em", "\n\n"; - } - - #print vars - print SBATCH 'module load bioinfo-tools' . "\n"; - print SBATCH 'module load BEDTools/2.11.2' . "\n"; - print SBATCH 'echo "Running on: $(hostname)"',"\n\n"; - - #print cmd - print SBATCH 'coverageBed -abam ' . $bamfile . ' -b ' . $bed . ' -hist -split >' . catfile($outdata_dir, 'ccds.bedtools.out2') . "\n"; - print SBATCH 'coverageBed -abam ' . $bamfile . ' -b ' . $gtf . ' -hist -split >' . catfile($outdata_dir, 'ensembl.bedtools.out2') . "\n"; - print SBATCH 'genomeCoverageBed -ibam ' . $bamfile . ' -g ' . $genome . ' -split -max 1000 >' . catfile($outdata_dir, 'genome.bedtools.out2') . "\n"; - - close(SBATCH); -} diff --git a/bam/mergebams.pl b/bam/mergebams.pl deleted file mode 100644 index 175ed9a..0000000 --- a/bam/mergebams.pl +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/perl - -#find '/proj/b2011075/analysis/hs_pipe/outdata/run1' -name 'accepted_hits.bam' | awk -F'/' '{print "perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/rmdups.pl", $0, $10, "/proj/b2011075/analysis/hs_pipe/outscripts/run1 b2010035 daniel.edsgard@scilifelab.se";}' >rmdups.sh - -use strict; - -#my ($sampleid, $bamdir, $outdir, $sbatch_dir, $aid, $em) = @ARGV; - -mergesams(@ARGV); - -sub mergesams { - - use File::Spec::Functions; - use File::Basename; - - my $sampleid = shift @_; - my $bamdir = shift @_; - my $outdata_dir = shift @_; - my $sbatch_dir = shift @_; - my $aid = shift @_; - my $em = shift @_; - my $time = shift @_; - - if(!defined($time)){ - $time = '23:59:00'; - } - - my $sbatch_file = catfile($sbatch_dir, "$sampleid.mergesams.sbatch"); - my $outdata_infodir = catfile($outdata_dir, 'info'); - - #get bamfiles to be merged - my @bamfiles = `find $bamdir -name '*.bam' | grep $sampleid`; - chomp(@bamfiles); - - #set outfile - my $bamfile_merged = catfile($outdata_dir, $sampleid . 
'.bam'); - - #make dirs - `mkdir -p $outdata_infodir;`; #Creates the data and info dir - `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir - - open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); - - #print sbatch vars - print SBATCH "#!/bin/bash -l", "\n"; - print SBATCH "#SBATCH -A ", $aid, "\n"; - print SBATCH "#SBATCH -t ", $time, "\n"; - print SBATCH "#SBATCH -J mergesams_", $sampleid, "\n"; - print SBATCH "#SBATCH -p core -n 1", "\n"; - print SBATCH "#SBATCH -e $outdata_infodir/$sampleid" . '.mergesams.jid_%.stderr' . "\n"; - print SBATCH "#SBATCH -o $outdata_infodir/$sampleid" . '.mergesams.jid_%.stdout' . "\n"; - unless ($em eq 0) { - print SBATCH "#SBATCH --mail-type=All", "\n"; - print SBATCH "#SBATCH --mail-user=$em", "\n\n"; - } - - #print vars - print SBATCH 'module load bioinfo-tools' . "\n"; - print SBATCH 'module load picard/1.41' . "\n"; - - #print cmd - print SBATCH 'java -Xmx2g -jar /bubo/sw/apps/bioinfo/picard/1.41/MergeSamFiles.jar INPUT=' . join(' INPUT=', @bamfiles) . ' OUTPUT=' . $bamfile_merged . ' USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT' . 
"\n"; - close(SBATCH); -} diff --git a/bam/rmdups.pl b/bam/rmdups.pl deleted file mode 100644 index d3b90a5..0000000 --- a/bam/rmdups.pl +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/perl - -#perl rmdups.pl /proj/b2011075/analysis/hs_pipe/outdata/run1/1_unstim_12h/tophat/1_unstim_12h.lane_1.index_8.readp_1.fastq.1.filter.fastq/accepted_hits.bam 1_unstim_12h.lane_1.index_8 '/proj/b2011075/analysis/hs_pipe/outscripts/run1' 'b2010035' 'daniel.edsgard@scilifelab.se' -#find '/proj/b2011075/analysis/hs_pipe/outdata/run1' -name 'accepted_hits.bam' | awk -F'/' '{print "perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/rmdups.pl", $0, $10, "/proj/b2011075/analysis/hs_pipe/outscripts/run1 b2010035 daniel.edsgard@scilifelab.se";}' >rmdups.sh - -use strict; - -#my ($bamfile, $sampleid, $ods, $aid, $em) = @ARGV; - -rmDups(@ARGV); - -sub rmDups { - - use File::Spec::Functions; - use File::Basename; - - my $bamfile = shift @_; - my $sampleid = shift @_; - my $ods = shift @_; - my $aid = shift @_; - my $em = shift @_; - my $time = '9:00:00'; - - my $sbatch_dir = catfile($ods, $sampleid); - my $sbatch_file = catfile($sbatch_dir, "$sampleid.rmdups.sbatch"); - my $outdata_dir = dirname($bamfile); - my $outdata_infodir = catfile($outdata_dir, 'log'); - - #make dirs - `mkdir -p $outdata_infodir;`; #Creates the data and info dir - `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir - - open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); - - #print sbatch vars - print SBATCH "#!/bin/bash -l", "\n"; - print SBATCH "#SBATCH -A ", $aid, "\n"; - print SBATCH "#SBATCH -t ", $time, "\n"; - print SBATCH "#SBATCH -J rmDups_", $sampleid, "\n"; - print SBATCH "#SBATCH -p core -n 1", "\n"; - print SBATCH "#SBATCH -e $outdata_infodir/$sampleid.rmdups.stderr", "\n"; - print SBATCH "#SBATCH -o $outdata_infodir/$sampleid.rmdups.stdout", "\n"; - unless ($em eq 0) { - print SBATCH "#SBATCH --mail-type=All", "\n"; - print SBATCH "#SBATCH --mail-user=$em", "\n\n"; - } - - 
#print vars - print SBATCH 'module load bioinfo-tools' . "\n"; - print SBATCH 'module load picard/1.41' . "\n"; - print SBATCH 'echo "Running on: $(hostname)"',"\n\n"; - - #print cmd - my $bamfile_sorted = $bamfile . '.sorted'; - my $bamfile_nodup = $bamfile_sorted . '.nodup'; - print SBATCH 'java -Xmx2g -jar /bubo/sw/apps/bioinfo/picard/1.41/SortSam.jar INPUT=' . $bamfile . ' OUTPUT=' . $bamfile_sorted . ' SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT' . "\n"; - print SBATCH 'java -Xmx2g -jar /bubo/sw/apps/bioinfo/picard/1.41/MarkDuplicates.jar INPUT=' . $bamfile_sorted . ' OUTPUT=' . $bamfile_nodup . ' ASSUME_SORTED=true REMOVE_DUPLICATES=true METRICS_FILE=' . $sampleid . '_picardDup_metrics VALIDATION_STRINGENCY=LENIENT' . "\n"; - - close(SBATCH); -} diff --git a/basecount/basecount.sh b/basecount/basecount.sh deleted file mode 100644 index e9ed246..0000000 --- a/basecount/basecount.sh +++ /dev/null @@ -1,70 +0,0 @@ -#Basecount (allelic count of each allele) -#DEP: dphetfilter.R -#DEP: mpileup.allelecount.pl -#DEP: vcfI162basecount.sh - - -######## -#DP and HET filter -######## - -vcfdir='/proj/b2012046/edsgard/ase/sim/data/varcalls' - -## -#VCF to tab format -## -PATH=${PATH}:/bubo/home/h26/edsgard/opt/vcftools_0.1.7/bin; export PATH -PERL5LIB=/bubo/home/h26/edsgard/opt/vcftools_0.1.7/lib; export PERL5LIB -vcftools --vcf allbams.vcf --extract-FORMAT-info DP --out genome -vcftools --vcf allbams.vcf --extract-FORMAT-info GT --out genome - -## -#Filter -## -#OUT: *.het.pos -mindp=10 -execdir='/bubo/home/h26/edsgard/glob/code/ase/sim' -Rscript ${execdir}/dphetfilter.R genome.DP.FORMAT genome.GT.FORMAT $mindp & -ls -1 *.het.pos | xargs -I% echo cat % '|' sed "'s/\./ /' >"%.hetvars >cmds.sh -rename genome.DP.FORMAT.het.pos. . *.hetvars - -#Dump "chr,pos,ref,alt" for these vars. (Used as input in snv.ase.R since the alt allele is seldom biallelic when running mpileup.allelecount.pl below.) 
-find $vcdir -maxdepth 1 -name '*.vcf' | grep -v 'allbams' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh -sh cmds.sh -ls -1 *.het.pos | xargs -I% echo sort % '>'%.sorted >cmds.sh -sh cmds.sh -ls -1 *.het.pos.sorted | sed 's/.genome.DP.FORMAT.het.pos.sorted//' >samples.list -cat samples.list | xargs -I% echo join -j1 %.genome.DP.FORMAT.het.pos.sorted %.vcf.pos '>' %.hetvars.alleles >cmds.sh -srun -p devel -t 1:00:00 -A b2010035 sh cmds.sh & -ls -1 *.hetvars.alleles | xargs -I% echo cut -d"' '" -f2-5 % "| tr ' ' '\t' >"%.tmp >cmds.sh -rename .hetvars.alleles.tmp .hetvars.alleles *.hetvars.alleles.tmp - - -################################################# -#BASECOUNT by MPILEUP (no varcall). -#First four of I16 == DP4 -################################################# -bamdir='/proj/b2012046/rani/analysis/gsnap' -vcfdir='/proj/b2012046/edsgard/ase/sim/data/varcalls' -bcdir='/proj/b2012046/rani/data/basecount' -execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' -scriptdir='/proj/b2012046/rani/scripts/basecount' -projid='b2012046' -email='alva.rani@scilifelab.se' -time='5:00:00' -ref='/bubo/home/h26/edsgard/glob/annotation/human/Homo_sapiens.GRCh37.57.dna.concat.fa' - -cd $scriptdir -find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list -cat samples.list | xargs -I% echo 'perl' ${execdir}'/mpileup.allelecount.pl' ${bamdir}/%.bam12 ${vcfdir}/%.hetvars $ref $scriptdir $projid $email $time $bcdir >cmds.sh -sh cmds.sh -find $scriptdir -name '*mpileup.basecount*' | xargs -I% sbatch % - -#Extract basecounts from vcf files (I16 field) -bcdir='/proj/b2012046/rani/data/basecount' -execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' -find $bcdir -name '*nocall.vcf' >nocall.vcf.list -cat nocall.vcf.list | xargs -I% echo 'sh' ${execdir}/vcfI162basecount.sh % '>' %.basecount >cmds.sh -srun -p devel -t 1:00:00 -A b2010035 sh cmds.sh & - - diff --git 
a/basecount/dphetfilter.R b/basecount/dphetfilter.R deleted file mode 100644 index a782a5f..0000000 --- a/basecount/dphetfilter.R +++ /dev/null @@ -1,79 +0,0 @@ - -#USAGE: Rscript dp.file gt.file 10 - -#get args -argv = commandArgs(trailingOnly=TRUE) -dp.file = argv[1] -gt.file = argv[2] -dp.min = as.integer(argv[3]) - -#Test-suite -if(0){ - datadir = '~/Dropbox/postdoc/projects/ase/data/codetest' - dp.file = file.path(datadir, 'head.dp') - gt.file = file.path(datadir, 'head.gt') - dp.min = 10 -} - -main <- function(dp.file, gt.file, dp.min){ - - #read - dp.mat = read.dp(dp.file) - gt.mat = read.gt(gt.file) - - #sort - dp.mat = dp.mat[rownames(gt.mat), ] - - #filter - rseq.pos.pass = dp.het.filter(dp.mat, gt.mat, dp.min) - - #dump - samples = names(rseq.pos.pass) - for(jsample in samples){ - hetvars.file = paste(jsample, dp.file, 'het', 'pos', sep='.') - write.table(rseq.pos.pass[[jsample]], quote = F, sep = '\t', col.names=F, row.names = F, file = hetvars.file) - } -} - -dp.het.filter <- function(dp.mat, gt.mat, dp.min){ - - dp.pass = lapply(dp.mat, function(ind.vals, minval){which(ind.vals >= minval)}, minval = dp.min) - het.pass = lapply(gt.mat, function(ind.vals, filt.val){which(ind.vals == filt.val)}, filt.val = 1) - samples = names(dp.pass) - sample2pass = lapply(samples, function(sample, dp.pass, het.pass){intersect(dp.pass[[sample]], het.pass[[sample]]);}, dp.pass = dp.pass, het.pass = het.pass) - rseq.pos.pass = lapply(sample2pass, function(ind.pass, pos){pos[ind.pass]}, pos = rownames(dp.mat)) - names(rseq.pos.pass) = samples - - return(rseq.pos.pass) -} - -read.gt <- function(gt.file){ - - gt.mat = read.table(gt.file, sep='\t', stringsAsFactors=F, header=T) - colnames(gt.mat) = gsub('.*\\.run1\\.(.*)_12h.*', '\\1', colnames(gt.mat)) - id.num = paste(gt.mat[, 'CHROM'], gt.mat[, 'POS'], sep='.') - rownames(gt.mat) = id.num - gt.mat = gt.mat[, setdiff(colnames(gt.mat), c('CHROM', 'POS'))] - - #sub 00, 01, 11 -> 0,1,2 - gt.num.mat = 
as.data.frame(lapply(gt.mat, function(ind.gt){ind.gt = gsub('0/0', '0', ind.gt); ind.gt = gsub('0/1', '1', ind.gt); ind.gt = gsub('1/1', '2', ind.gt); ind.gt = as.integer(ind.gt); return(ind.gt);})) - rownames(gt.num.mat) = rownames(gt.mat) - colnames(gt.num.mat) = gsub('^X', '', colnames(gt.num.mat)) - gt.mat = gt.num.mat - - return(gt.mat) -} - -read.dp <- function(dp.file){ - dp.mat = read.table(dp.file, sep='\t', stringsAsFactors=F, header=T) - colnames(dp.mat) = gsub('.*\\.run1\\.(.*)_12h.*', '\\1', colnames(dp.mat)) - colnames(dp.mat) = gsub('^X', '', colnames(dp.mat)) - id.num = paste(dp.mat[, 'CHROM'], dp.mat[, 'POS'], sep='.') - rownames(dp.mat) = id.num - dp.mat = dp.mat[, setdiff(colnames(dp.mat), c('CHROM', 'POS'))] - - return(dp.mat) -} - -#Execute -main(dp.file, gt.file, dp.min) diff --git a/basecount/mpileup.allelecount.pl b/basecount/mpileup.allelecount.pl deleted file mode 100644 index 7ddcd31..0000000 --- a/basecount/mpileup.allelecount.pl +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/perl - - -#-q1 mapQ filter to get "unique" alignments. - -use strict; -use File::Basename; -use File::Spec; - -my ($bamfile, $varsfile, $ref, $ods, $aid, $em, $time, $outdir) = @ARGV; - -if(!(defined($time))){ - $time = '2:00:00'; -} -if(!(defined($outdir))){ - $outdir = dirname($bamfile); -} - - -mpileup(($bamfile, $varsfile, $ref, $ods, $aid, $em, $time, $outdir)); - -sub mpileup { - - use File::Spec::Functions; - use File::Basename; - - my $bamfile = shift @_; - my $varsfile = shift @_; - my $ref = shift @_; - my $ods = shift @_; - my $aid = shift @_; - my $em = shift @_; - my $time = shift @_; - my $outdir = shift @_; - - my $bamfile_base = basename($bamfile); - my $sbatch_base = $bamfile; - $sbatch_base =~ s/\///; - $sbatch_base =~ s/\//\./g; - my $sbatch_dir = $ods; - my $sbatch_file = catfile($sbatch_dir, $sbatch_base . '.mpileup.basecount.sbatch'); - my $outdata_infodir = catfile($outdir, 'info'); - -# print $sbatch_file . 
"\n"; - - #make dirs - `mkdir -p $outdata_infodir;`; #Creates the data and info dir - `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir - - open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); - - #print sbatch varsfile - print SBATCH "#!/bin/bash -l", "\n"; - print SBATCH "#SBATCH -A ", $aid, "\n"; - print SBATCH "#SBATCH -t ", $time, "\n"; - print SBATCH "#SBATCH -J multimpileup", "\n"; - print SBATCH "#SBATCH -p core -n 1", "\n"; - print SBATCH "#SBATCH -e $outdata_infodir/$sbatch_base.mpileup.basecount" . '.jid_%j.stderr', "\n"; - print SBATCH "#SBATCH -o $outdata_infodir/$sbatch_base.mpileup.basecount" . '.jid_%j.stdout', "\n"; - unless ($em eq 0) { - print SBATCH "#SBATCH --mail-type=All", "\n"; - print SBATCH "#SBATCH --mail-user=$em", "\n\n"; - } - - #print varsfile - print SBATCH 'module load bioinfo-tools' . "\n"; - print SBATCH 'module load samtools/0.1.18' . "\n"; - - #print cmd - my $basecountfile = catfile($outdir, $bamfile_base . '.mpileup.nocall.vcf'); - - print SBATCH 'samtools mpileup -q1 -d10000 -L10000 -DSugf ' . $ref . ' -l ' . $varsfile . ' ' . $bamfile . q( | bcftools view - >) . $basecountfile . "\n"; - - close(SBATCH); -} diff --git a/basecount/vcfI162basecount.sh b/basecount/vcfI162basecount.sh deleted file mode 100644 index 0d6076b..0000000 --- a/basecount/vcfI162basecount.sh +++ /dev/null @@ -1,3 +0,0 @@ - - -cat $1 | awk '{print $1, $2, $4, $5, $8;}' | perl -e 'while(){chomp; my ($chr, $pos, $ref, $alt, $info) = split; if($info =~ m/I16=(\d+,\d+,\d+,\d+)/){ $dp4=$1; @dp4arr=split(",", $dp4); $nref = $dp4arr[0] + $dp4arr[1]; $nalt = $dp4arr[2] + $dp4arr[3]; $tot = $nref + $nalt; if($nalt != 0){$frac = $nalt / $tot}else{$frac = 0;}; print join("\t", ($chr, $pos, $ref, $alt, $tot, $nref, $nalt, $frac)) . 
"\n";}}' diff --git a/test/run.gunzip.sh b/test/run.gunzip.sh deleted file mode 100644 index b7897f1..0000000 --- a/test/run.gunzip.sh +++ /dev/null @@ -1,29 +0,0 @@ -#GSNAP -# I have downloaded the gmap from the homepage and then unziped and extracted -wget http://research-pub.gene.com/gmap/src/gmap-gsnap-2012-05-15.tar.gz -gunzip gmap-gsnap-2012-05-15.tar.gz -tar xvf./* gmap-gsnap-2012-05-15.tar -#Made some changes in the config file -#Specified the paths -./configure -make -make install -# I have downloaded the database and extracted it -wget http://research-pub.gene.com/gmap/genomes/hg19.tar -tar xvf hg19.tar -##optional if need to run with older versions - ln -s hg19.ref153positions hg19.ref12153.positions -#the fasta file used here -wget 'http://bgx.org.uk/software/Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz' -gunzip Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz - -#How to process a genome for GMAP...Started with an example file 'g' -# Ran an example ,downloaded a fasta file and saved as g in utils -#Ran coords.txt -./fa_coords g -#then Ran gmap_setup -./gmap_setup -d hg19 g -# tried the file from gmapdb -./md_coords /bubo/home/h24/alvaj/gmap-2012-05-15/gmapdb/hg19/hg19.contig - - diff --git a/test/test.sh b/test/test.sh deleted file mode 100644 index f3107c3..0000000 --- a/test/test.sh +++ /dev/null @@ -1,193 +0,0 @@ -#GSNAP -# I have downloaded the gmap from the homepage and then unziped and extracted -wget 'http://research-pub.gene.com/gmap/src/gmap-gsnap-2012-05-15.tar.gz' -gunzip gmap-gsnap-2012-06-12.tar.gz -tar xvf./* gmap-gsnap-2012-06-12.tar -tar -xvf gmap-gsnap-2012-06-12.tar -#Made some changes in the config file -#Specified the paths -./configure -make -make installg -# I have downloaded the database and extracted it -wget 'http://research-pub.gene.com/gmap/genomes/hg19.tar' -tar xvf hg19.tar -##optional if need to run with older versions - ln -s hg19.ref153positions hg19.ref12153.positions -#the fasta file used here -wget 
'http://bgx.org.uk/software/Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz' -gunzip Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz - -wget 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp135.txt.gz' - -#How to process a genome for GMAP...Started with an example file 'g' -# Ran an example ,downloaded a fasta file and saved as g in utils -#Ran coords.txt -./fa_coords g -#then Ran gmap_setup -./gmap_setup -d hg19 g -# tried the file from gmapdb -./md_coords /bubo/home/h24/alvaj/gmap-2012-05-15/gmapdb/hg19/hg19.contig - - -#Building map files -# here I used an example file 'Ailuropoda_melanoleuca.ailMel1.67.gtf',first I concatinated this file with a utility program gtf_splicesites by his command and stored in a folder at.splicesites -cat Ailuropoda_melanoleuca.ailMel1.67.gtf | /bubo/home/h24/alvaj/gsnap-2012-06-12/util/gtf_splicesites > at.splicesites -## then rename at.splicesites to at1, the folowwing comand -cat at.splicesites |./iit_store -o at1 - - -## Runing GSANP, the following command helps to run the fastaq file towards the genome hg19 -./gsnap -d hg19 x - -# GSNAP gives the difference between site level and intron level splicing based on the format of input file, to perform site level splicing the file should be in a special format which can be created by the folowwing commnad - - - cat Ailuropoda_melanoleuca.ailMel1.67.gtf |/bubo/home/h24/alvaj/gmap-2012-05-15/util/gtf_splicesites >foo.splicesites -# for intron level the format can be created by, -cat Ailuropoda_melanoleuca.ailMel1.67.gtf |/bubo/home/h24/alvaj/gmap-2012-05-15/util/gtf_introns >foo.introns -## once it is created you can process for map files by the following command, -cat foo.splicesites | ./iit_store -o splicesitefile - - -## Example for SNP tolerant alignment in GSNAP -# wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz -# unzipped by, -gunzip -c snp130.txt.gz | ./dbsnp_iit -w 3 -e snp130Exceptions.txt.gz > snpfile.txt -## changing the snpfile to .iit format 
file by, -cat snpfile.txt | /bubo/home/h24/alvaj/gmap-2012-05-15/src/iit_store -o snpfile -## after cpying the itt file into hg19.maps folder , and the programm to find the indictaed file from the .maps folder by the folowwing command(used at1.iit file) -/bubo/home/h24/alvaj/gmap-2012-05-15/src/snpindex -d hg19 -v at1 - -# for intron sites -cat Homo_sapiens.GRCh37.59.gtf |/bubo/home/h24/alvaj/glob/annotation/gsnap/gmap-2012-05-15/util/gtf_introns > snp.introns - -(cat < sample.fa -EOF -) >sbatch.template -cat sample.list | xargs -I% echo cat sbatch.template "| sed 's/sample/"%"/g' >" %.haploref.sbatch >cmds.sh -sh cmds.sh -find . -name '*.haploref.sbatch' | xargs -I% sbatch % - -##****Kalkyl**** -# assigning the job for some other interactive level -interactive -p devel -t 1:00:00 -A b2012046 -# to see om whic node I am runing -srun hostname -s |sort -u -# to create same directory for all the nodes which I am working with - srun -N 4 -n 4 mkdir /scratch/$SLURM_JOB_ID/indata -# Copy indata for my_program to the local directories - srun -N 4 -n 4 cp -r ~/glob/indata/* /scratch/$SLURM_JOB_ID/indata -# to see the queue line of the job -squeue -u alvaj - - -# to rename the file -cat snp.splicesite | awk '{print $1, "chr"$2, $3, $4;}' >snp.splicesite.uscschr -# tried for a short seqeunce from the fastq -/bubo/home/h24/alvaj/gmap-2012-05-15/src/gsnap -d hg19 ./fastq >shortfastaseq -# to copy a .sh file from the local computer to the cloud... 
-scp g.sh alvaj@kalkyl.uppmax.uu.se:/bubo/home/h24/alvaj/glob/annotation/gsnap/g.sh - -/bubo/home/h24/alvaj/gmap-2012-05-15/src/gsnap -d hg19 -s./splicesitesfile /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/1_unstim.readp_1.2.ill.fastq.1.filter.fastq >output1 - -# for paired end rna seq reads -/bubo/home/h24/alvaj/gmap-2012-05-15/src/gsnap -d hg19 -s./splicesitesfile /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/1_unstim.readp_1.2.ill.fastq.1.filter.fastq /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/ 9_unstim.readp_1.2.ill.fastq.1.filter.fastq >output2 - - -(nohup prg args >my.out) >& my.err & -sbatch .script -srun -A b21010 -t 1:00:00 -p devel prg args >my.out - -## -#RAN for small seq and is working fine!!! -annotdir='/bubo/home/h24/alvaj/glob/annotation/gsnap' -outdir='/proj/b2012046/rani/analysis/gsnap' -datadir='/proj/b2012046/rani/data/fastq' -splicefile=${annotdir}/'splicesiteschro' -fastqfile1=${datadir}/test.n100.rp1.fastq -fastqfile2=${datadir}/test.n100.rp2.fastq -refdir=${annotdir}/gmapdb -ref=hg19 -snpfile=dbsnp135 -snpdir=${annotdir}/dbsnp -samdir='/bubo/home/h24/alvaj/glob/annotation/gsnap/samtools/samtools-0.1.6' - -cd $outdir -export PATH=$PATH:/bubo/home/h24/alvaj/opt0/gmap-2012-06-12/bin - -(gsnap -D $refdir -d $ref -A sam -s $splicefile -V $snpdir -v $snpfile --quality-protocol=illumina $fastqfile1 $fastqfile2 | ${samdir}/samtools view -Sb - > ${outdir}/test.n100.bam12) >& test.err12 & - -| ${samdir}/samtools view -Sb test.sam > test.sam -| ${samdir}/samtools view -S sam -b bam -samtools view -bT hg.fa -o xxx.bam xxx.sam -samdir='/bubo/home/h24/alvaj/glob/annotation/gsnap/samtools/samtools-0.1.6' - -## github commit// to push the changes from my computer to github -#when a fatal error occurs such as index lock already running - rm -f ./.git/index.lock - -# github to add file from computer towards repo -git add run.tophat.sh - git commit -m 'The edited tophat.sh file' run.tophat.sh -git remote add origin 
https:https://github.com/alvarani/ASE.git -git push origin master - -#kill all slurm jobs--if you want to specify the number of jobs eg: 'NR >32 {print $1;}' -squeue -u alvaj | grep -v JOB | awk '{print $1;}' | xargs -I% scancel % - - -# stream editor SED an example:to print the lines containg s/1.filter.fastq// and 2.filter.fastq'.... -cat fastq.files | sed 's/1.filter.fastq//' | grep -v '2.filter.fastq' >fastqfiles.prefix - - -# to delete a file with only a particular format within a directory,here to delete the files with .sam format -find . -type f -name "*.sam" -exec rm -f {} \; - - - -# it helps to change the suffix from fastqfiles10.prefix to sample1 and sample2 -cat fastqfiles10.prefix | xargs -I% basename % | xargs -I% echo cat sbatch10.template "| sed 's/sample/"%"/g' >" %gsnap.sbatch >cmds10.sh - - - -# to count the number of files in a directory, where targetdir is the name of directory -ls -1 targetdir | wc -l -# which is more advanced -find Downloads -maxdepth 1 -type f | wc -l - - - -# listing the jobs in human readable form -ls -ltrh -#to convet from sam to bam -samdir= '/bubo/home/h24/alvaj/glob/annotation/gsnap/samtools/samtools-0.1.6/samtools' -outdir='/proj/b2012046/rani/analysis/gsnap' -find . 
-type f -name "*.sam" | ${samdir}samtools view -bS > ${outdir}/samplebam - - -#example sed -sed 's/day/night/' new - -s Substitute command -/../../ Delimiter -day Regular Expression Pattern Search Pattern -night Replacement string , ie replace day with night -# anothe rexample with echno -echo Sunday | sed 's/day/night/' - -# the codes for bam file is in this one ASE -/bubo/home/h24/alvaj/glob/code/ASE - diff --git a/test/testgsnap for smallseq.sh b/test/testgsnap for smallseq.sh deleted file mode 100644 index 6316b04..0000000 --- a/test/testgsnap for smallseq.sh +++ /dev/null @@ -1,22 +0,0 @@ - - - -#RUN!for small test seq - -annotdir='/bubo/home/h24/alvaj/glob/annotation/gsnap' -splicefile=${annotdir}/'splicesiteschro' -snpfile=dbsnp135 -snpdir=${annotdir}/dbsnp -outdir='/proj/b2012046/rani/analysis/gsnap' -datadir='/proj/b2012046/rani/data/fastq' -fastqfile1=${datadir}/test.n100.rp1.fastq -fastqfile2=${datadir}/test.n100.rp2.fastq -refdir=${annotdir}/gmapdb -ref=hg19 - -cd $outdir -export PATH=$PATH:/bubo/home/h24/alvaj/opt/gmap-2012-06-02/bin -(gsnap -D $refdir -d $ref -A sam -s $splicefile -V $snpdir -v $snpfile --quality-protocol=illumina $fastqfile1 $fastqfile2 >${outdir}/test.n100.sam) >& test.err & - -### worked and given output - diff --git a/varcall/mpileup_multisample.pl b/varcall/mpileup_multisample.pl deleted file mode 100644 index 821e8d0..0000000 --- a/varcall/mpileup_multisample.pl +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/perl - -#find '/proj/b2011075/analysis/hs_pipe/outdata/run1/varcalls' -name 'allbams.chr_*' | xargs -I% echo 'perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/mpileup_multisample.pl' % '/proj/b2011075/analysis/hs_pipe/outscripts/run1/varcalls b2010035 daniel.edsgard@scilifelab.se' >cmds.sh - -#-q1 mapQ filter to get "unique" alignments. -#No max dp varFilter since RNA-seq data. 
- -use strict; - -my ($bamfilelist, $region, $ods, $aid, $em, $ref, $time, $partition) = @ARGV; - -if(!(defined($ref))){ - $ref = '/bubo/home/h26/edsgard/glob/annotation/human/Homo_sapiens.GRCh37.57.dna.concat.fa'; -} -if(!(defined($time))){ - $time = '23:00:00'; -} -if(!(defined($partition))){ - $partition = 'node'; -} - -mpileup(($bamfilelist, $region, $ods, $aid, $em, $ref, $time, $partition)); - -sub mpileup { - - use File::Spec::Functions; - use File::Basename; - - my $bamfilelist = shift @_; - my $region = shift @_; - my $ods = shift @_; - my $aid = shift @_; - my $em = shift @_; - my $ref = shift @_; - my $time = shift @_; - my $partition = shift @_; - - my $bamfiles_base = basename($bamfilelist); - my $sbatch_dir = $ods; - my $sbatch_file = catfile($sbatch_dir, $bamfiles_base . '.' . $region . '.mpileup.sbatch'); - my $outdata_dir = dirname($bamfilelist); - my $outdata_infodir = catfile($outdata_dir, 'info'); - - #make dirs - `mkdir -p $outdata_infodir;`; #Creates the data and info dir - `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir - - open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); - - #print sbatch vars - print SBATCH "#!/bin/bash -l", "\n"; - print SBATCH "#SBATCH -A ", $aid, "\n"; - print SBATCH "#SBATCH -t ", $time, "\n"; - print SBATCH "#SBATCH -J multimpileup", "\n"; - print SBATCH '#SBATCH -p ', $partition, ' -n 1', "\n"; - print SBATCH "#SBATCH -e $outdata_infodir/$bamfiles_base.$region.multimpileup" . '.jid_%j.stderr', "\n"; - print SBATCH "#SBATCH -o $outdata_infodir/$bamfiles_base.$region.multimpileup" . '.jid_%j.stdout', "\n"; - unless ($em eq 0) { - print SBATCH "#SBATCH --mail-type=All", "\n"; - print SBATCH "#SBATCH --mail-user=$em", "\n\n"; - } - - #print vars - print SBATCH 'module load bioinfo-tools' . "\n"; - print SBATCH 'module load samtools/0.1.18' . "\n"; - print SBATCH 'echo "Running on: $(hostname)"',"\n\n"; - - #print cmd - my $vcffile = $bamfilelist . '.' . $region . 
'.vcf'; - print SBATCH 'samtools mpileup -q1 -d10000 -L10000 -DSugf ' . $ref . ' -r ' . $region . ' -b ' . $bamfilelist . ' | bcftools view -vcg - | vcfutils.pl varFilter >' . $vcffile . "\n"; - - close(SBATCH); -} diff --git a/varcall/varcall.sh b/varcall/varcall.sh deleted file mode 100644 index 750a34b..0000000 --- a/varcall/varcall.sh +++ /dev/null @@ -1,57 +0,0 @@ -#Varcalling with SAMtools -#DEP: mpileup_multisample.pl - - -### -#VarCall (SAMtools) -### - -projid='b2012046' -email='alva.rani@scilifelab.se' -sbatchdir='/proj/b2012046/rani/scripts/gsnap/varscripts' -bamfile='/proj/b2012046/rani/analysis/gsnap/mergebam' -vcfdir='/proj/b2012046/rani/data/varcalls' -ref='/bubo/home/h24/alvaj/glob/annotation/gsnap/reference/reference.genome' -execdir='/bubo/home/h24/alvaj/glob/code/ASE/varcall' - -cd $sbatchdir -# need to redo it again with merged bam files -#create file listing the bam files -allbamsfile=${vcfdir}/allbams.list -find $bamfile -maxdepth 1 -name '*.bam' >$allbamsfile - -#create region files -module load bioinfo-tools -module load samtools/0.1.18 -srun -p devel -A b2012046 -t 1:00:00 samtools faidx $ref & -vcfutils.pl splitchr -l 6600000 ${ref}.fai >${vcfdir}/genome.480.regs - - -#Run mpileup -cat ${vcfdir}/genome.480.regs | xargs -I% echo 'perl' ${execdir}/mpileup_multisample.pl $allbamsfile % $sbatchdir $projid $email $ref >cmds.sh -sh cmds.sh -find $sbatchdir -name '*.mpileup.sbatch' | xargs -I% sbatch % - - -#check status: -cd $vcfdir -lt *.stderr >allerrfiles.tmp -squeue -u alvaj | grep -v JOBID | awk '{print $1;}' | xargs -I% grep % ${vcfdir}/info/allerrfiles.tmp - -#Catenate all region-specific vcfs -find $vcfdir -name 'allbams.*.vcf' | xargs cat | grep -v '^#' >${vcfdir}/allbams.vcf.tmp -cat allbams.list.MT:1-16569.vcf | grep '^#' >header.tmp -cat header.tmp allbams.vcf.tmp >allbams.vcf -rm header.tmp allbams.vcf.tmp -wc -l allbams.vcf -#26 - -#Rm slashes ilon vcf sample header -vcfdir='/proj/b2012046/rani/data/varcalls' -cd ${vcfdir} -cat 
allbams.vcf | grep '^#' >header.tmp -cat header.tmp | sed 's/\/proj\/b2012046\/rani\/analysis\/gsnap\///g' | sed 's/\.bam//g' >file.tmp -mv file.tmp header.tmp -grep -v '^#' allbams.vcf >allbams.vcf.noheader -cat header.tmp allbams.vcf.noheader >allbams.vcf -rm allbams.vcf.noheader header.tmp From 11b5e075c1c6fa6bc8d28256bd54bc55b8f75e94 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 14:00:16 +0200 Subject: [PATCH 04/10] Added new GSNAP --- GSNAP/bam/bamerror.run.sh | 31 ++++ GSNAP/bam/bamproc.sh | 87 ++++++++++ GSNAP/bam/bamstats.sh | 44 ++++++ GSNAP/bam/bedtoolshist2pdf.R | 158 +++++++++++++++++++ GSNAP/bam/coverage.pl | 68 ++++++++ GSNAP/bam/mergebams.pl | 64 ++++++++ GSNAP/bam/rmdups.pl | 60 +++++++ GSNAP/basecount/basecount.sh | 145 +++++++++++++++++ GSNAP/basecount/dphetfilter.R | 79 ++++++++++ GSNAP/basecount/mpileup.allelecount.pl | 75 +++++++++ GSNAP/basecount/vcfI162basecount.sh | 3 + GSNAP/map/run.gsnap.sh | 104 ++++++++++++ GSNAP/varcall/varcall/mpileup_multisample.pl | 73 +++++++++ GSNAP/varcall/varcall/varcall.sh | 57 +++++++ 14 files changed, 1048 insertions(+) create mode 100644 GSNAP/bam/bamerror.run.sh create mode 100644 GSNAP/bam/bamproc.sh create mode 100644 GSNAP/bam/bamstats.sh create mode 100644 GSNAP/bam/bedtoolshist2pdf.R create mode 100644 GSNAP/bam/coverage.pl create mode 100644 GSNAP/bam/mergebams.pl create mode 100644 GSNAP/bam/rmdups.pl create mode 100644 GSNAP/basecount/basecount.sh create mode 100644 GSNAP/basecount/dphetfilter.R create mode 100644 GSNAP/basecount/mpileup.allelecount.pl create mode 100644 GSNAP/basecount/vcfI162basecount.sh create mode 100644 GSNAP/map/run.gsnap.sh create mode 100644 GSNAP/varcall/varcall/mpileup_multisample.pl create mode 100644 GSNAP/varcall/varcall/varcall.sh diff --git a/GSNAP/bam/bamerror.run.sh b/GSNAP/bam/bamerror.run.sh new file mode 100644 index 0000000..ef60e50 --- /dev/null +++ b/GSNAP/bam/bamerror.run.sh @@ -0,0 +1,31 @@ + +###Test for 19 sam file +#created a errorlist of 
19 files +# taking the the reads appreaed more than twice and taking it in a order +cat errorfiles.list | xargs -I% echo "samtools view "%" | awk '{print \$1;}' | sort | uniq -c >"%.reads2nmapped >cmds.sh + + + +errordir='/proj/b2012046/rani/analysis/gsnap/bamsort/errortest' + +ls -1 ${errordir}/*.reads2nmapped >reads2nmapped.files + +cat reads2nmapped.files | xargs -I% basename % | xargs -I% echo "awk '\$1 == 2 {print \$0;}'" ${errordir}/% " | wc -l >" %.n.uniq >cmds.sh +cat reads2nmapped.files | xargs -I% basename % | xargs -I% echo "awk '\$1 != 2 {print \$0;}'" ${errordir}/% " | wc -l >" %.n.nonuniq >cmds2.sh + + +bamfiles='/proj/b2012046/rani/analysis/gsnap/bamsort/errortest/errorbamfiles.list' +errordir='/proj/b2012046/rani/analysis/gsnap/bamsort/errortest' + +#Extract multimapped reads +ls -1 ${errordir}/*.reads2nmapped >reads2nmapped.files +cat reads2nmapped.files | xargs -I% basename % | xargs -I% echo "awk '\$1 != 2 {print \$0;}'" ${errordir}/% "| awk '{print \$2;}'>" %.nonuniq >cmds.sh +sh cmds.sh & + +#TBD: Convert all bams to sams +bamdir='/proj/b2012046/rani/analysis/gsnap/bamindex' +sam=${bamdir}/2_LPS.part_2ab.sam + +#Rm multimapped reads +ls -1 *d.nonuniq | awk -F'.' -v OFS='.' 
'{print $1, $2;}' >errorfiles.prefix.list +cat errorfiles.prefix.list | xargs -I% echo "perl rm.multimappedreads.pl " %.sam %.fastq.bam.reads2nmapped.nonuniq ">"%.uniqmapped.sam >cmds.sh diff --git a/GSNAP/bam/bamproc.sh b/GSNAP/bam/bamproc.sh new file mode 100644 index 0000000..17b97b5 --- /dev/null +++ b/GSNAP/bam/bamproc.sh @@ -0,0 +1,87 @@ +#Processing bam files +#DEP: rmdups.pl +#DEP: mergebams.pl + + +### +#rmDups +### +email='alva.rani@scilifelab.se' +projid='b2012046' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/bam' +sbatchdir='/proj/b2012046/rani/scripts/gsnap/unmapped' +bamfile='/proj/b2012046/rani/analysis/gsnap/bamsort/' +cd $sbatchdir +find $bamfile -name '*.uniqmapped.bam' | awk -F'/' -v execdir=$execdir -v sbatchdir=$sbatchdir -v projid=$projid -v email=$email '{print "perl", execdir"/rmdups.pl", $0, $NF, sbatchdir, projid, email;}' >rmdups.sh +sh rmdups.sh +find . -name '*.sbatch' | xargs -I% sbatch % +#### + + +#Check: +find $bamfile -name '*.bam.sorted.nodup' | xargs -I% ls -lrth % +find $bamfile -name '*.rmdups.stderr' | xargs -I% grep -i 'err' +find $bamfile -name '*.rmdups.stderr' | xargs -I% grep -i 'warn' + +#rm unsorted bams +find $bamfile -name '*.bam' | xargs -I% rm % + +#rename bamfiles +find -name '*.bam.sorted' | xargs -I% echo rename '.bam.sorted' '.sorted.bam' % >cmds.sh +sh cmds.sh + + +#rename bamfiles +find $bamdir -name '*.bam.sorted.nodup' | xargs -I% echo rename '.bam.sorted.nodup' '.sorted.nodup.bam' % >cmds.sh +sh cmds.sh +#### + +### +#Merge and sort bam files +### +projid='b2012046' +email='alva.rani@scilifelab.se' +sbatchdir='/proj/b2012046/rani/scripts/gsnap/mergebams' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/bam' +bamsort='/proj/b2012046/rani/analysis/gsnap/bamsort' +outdir='/proj/b2012046/rani/analysis/gsnap/mergebam' +time='11:00:00' +cd $sbatchdir +find $bamsort -name '*.sorted.nodup.bam' | sed 's/\..*//' | sort -u >samples.list +cat samples.list | xargs -I% basename % | xargs -I% echo perl 
${execdir}'/mergebams.pl' % $bamsort $outdir $sbatchdir $projid $email $time >cmds.sh +sh cmds.sh +find $sbatchdir -name '*.mergesams.sbatch' | xargs -I% sbatch % + + +## Submitted +#Check: +find ${bamfile}/info/*.stderr | xargs -I% grep -i 'err' % +find ${bamfile}/info/*.mergesams.*.stderr | xargs -I% grep -i 'warn' % +#### + +### +#Create indexes +### +bamsort='/proj/b2012046/rani/analysis/gsnap/mergebam' +find $bamsort -maxdepth 1 -name '*.bam' | xargs -I% echo samtools index % > cmd.sh +#Manually add sbatch header to the cmds.sh script +(cat <bamindex.sbatchheader +cat bamindex.sbatchheader cmd.sh >bamindex.sbatch +sbatch bamindex.sbatch + +############################ +## Done all steps here without errors + diff --git a/GSNAP/bam/bamstats.sh b/GSNAP/bam/bamstats.sh new file mode 100644 index 0000000..871c599 --- /dev/null +++ b/GSNAP/bam/bamstats.sh @@ -0,0 +1,44 @@ +#Stats on bam files +#DEP: coverage.pl +#DEP: bedtoolshist2pdf.R + + +### +#Calculate number of mapped reads (More than the input number of reads, due to non-unique mappings!) +### +module load bioinfo-tools +module load samtools/0.1.16 +find '/proj/b2012046/rani/analysis/gsnap' -name 'accepted_hits.bam.sorted.nodup' -exec sh -c "samtools flagstat {} >{}.samtools.flagstat.out" \; & +find '/proj/b2012046/rani/analysis/gsnap' -name '*.samtools.flagstat.out' | xargs -I% awk 'NR == 1 {print FILENAME, $1;}' % >nmapped.reads + + +### +#Calculate coverage +### + +#BEDtools (see coverage.pl)--IMP +prg='/proj/b2012046/rani/scripts/gsnap/coverage.pl' +find '/proj/b2012046/rani/analysis/gsnap' -name 'merged.bam' | awk -F'/' -v ods="$ods" -v projid="$projid" -v em="$email" -v prg="$prg" '{print "perl", prg, $0, $8, ods, projid, em;}' >bedtoolscov.sh + + +### +#Generate pdf of the coverage--NOT needed now +### +find . -name 'ccds.bedtools.out2' -exec sh -c 'grep ^all {} >{}.all' \; +find . -name 'ensembl.bedtools.out' -exec sh -c 'grep ^all {} >{}.all' \; +find . 
-name 'genome.bedtools.out2' -exec sh -c 'grep ^genome {} >{}.all' \; +find . -name 'ccds.*2.all' | zip ccds.bedtools.all.zip -@ +find . -name 'genome.*2.all' | zip genome.bedtools.all.zip -@ +find . -name 'ccds*2.all' | awk -F'/' '{print "mv", $0, "coverage/"$2"."$4;}' >mv.sh +find . -name 'genome*2.all' | awk -F'/' '{print "mv", $0, "coverage/"$2"."$4;}' >mv.sh +find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*ccds.*2.all' | sed 's/.*\///' | awk -F'.' '{print $1;}' >samples +find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*ccds.*2.all' >files +paste samples files >ccds.histfiles +find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*genome.*2.all' | sed 's/.*\///' | awk -F'.' '{print $1;}' >samples +find '/Users/danieledsgard/Dropbox/postdoc/projects/ase/res/hs_pipe/run1' -name '*genome.*2.all' >files +paste samples files >genome.histfiles + +bedtoolshist2pdf.R ccds.histfiles + +#NB: no pair after read-QC, rmdups, but not: unique, MQ, properly paired. What is M (mismatches)set to? 
+ diff --git a/GSNAP/bam/bedtoolshist2pdf.R b/GSNAP/bam/bedtoolshist2pdf.R new file mode 100644 index 0000000..9ec9e85 --- /dev/null +++ b/GSNAP/bam/bedtoolshist2pdf.R @@ -0,0 +1,158 @@ + +#USAGE: Rscript bedtoolshist2pdf.R histfilelist.file + +#argv = commandArgs(trailingOnly=TRUE) +#histfilelist.file = argv + +annot = 'ccds' +histfilelist.file = '../res/hs_pipe/run1/coverage/ccds.histfiles' +resdir = dirname(histfilelist.file) +tab.outfile = file.path(resdir, 'ccds.cumsum.tab') +hist.pdf.file = file.path(resdir, 'ccds.hist.pdf') + +annot = 'genome' +histfilelist.file = '../res/hs_pipe/run1/coverage/genome.histfiles' +resdir = dirname(histfilelist.file) +tab.outfile = file.path(resdir, 'genome.cumsum.tab') +hist.pdf.file = file.path(resdir, 'genome.hist.pdf') + + +#read data +histfile2id = read.table(histfilelist.file, stringsAsFactors=FALSE, row.names=1) +colnames(histfile2id) = 'filename' +#hist.files = c('../res/hs_pipe/test/bedtools.genome.all.hist', '../res/hs_pipe/test/bedtools.ensembl.all.hist', '../res/hs_pipe/test/bedtools.ccds.all.hist') + +#set linetype (dashed for unstim) +samples = rownames(histfile2id) +n.samples = length(samples) +ltypes = rep(1, n.samples) +names(ltypes) = samples +ltypes[grep('unstim', names(ltypes))] = 2 + +#set color +library('RColorBrewer') +individual = gsub('^(\\d+)_.*', '\\1', samples, perl = TRUE) +ind2color = brewer.pal(8, 'Dark2') +names(ind2color) = unique(individual) +colors = ind2color[individual] + +histfile2id = cbind(histfile2id, ltypes, colors, stringsAsFactors=FALSE) + +main <- function(histfile2id, tab.outfile, hist.pdf.file){ + hist.tabs = read.files(histfile2id) + dp2frac = get.dp2frac(hist.tabs) + dp2bases = get.dp2bases(hist.tabs) + plot.dp2frac(dp2frac, ltypes = histfile2id[names(dp2frac), 'ltypes'], colors = histfile2id[names(dp2frac), 'colors']) + plot.dp2bases(dp2bases) + + + #Plot + if(annot == 'genome'){ + ylab = 'bases covered' + pdf(file = hist.pdf.file, width = 10, height = 7) + par(mfrow = c(1, 2)) 
+ unstim.ind = grep('unstim', names(dp2bases)) + dp2bases.unstim = dp2bases[unstim.ind] + plot.dp2y(dp2bases.unstim, ltypes = histfile2id[names(dp2bases.unstim), 'ltypes'], colors = histfile2id[names(dp2bases.unstim), 'colors'], ylab = ylab, min.plot.dp=2) + + lps.ind = grep('LPS', names(dp2bases)) + dp2bases.lps = dp2bases[lps.ind] + plot.dp2y(dp2bases.lps, ltypes = histfile2id[names(dp2bases.lps), 'ltypes'], colors = histfile2id[names(dp2bases.lps), 'colors'], ylab = ylab, min.plot.dp = 2) + dev.off() + } + else{ + #Plot + pdf(file = hist.pdf.file, width = 10, height = 7) + par(mfrow = c(1, 2)) + unstim.ind = grep('unstim', names(dp2frac)) + dp2frac.unstim = dp2frac[unstim.ind] + plot.dp2frac(dp2frac.unstim, ltypes = histfile2id[names(dp2frac.unstim), 'ltypes'], colors = histfile2id[names(dp2frac.unstim), 'colors']) + + lps.ind = grep('LPS', names(dp2frac)) + dp2frac.lps = dp2frac[lps.ind] + plot.dp2frac(dp2frac.lps, ltypes = histfile2id[names(dp2frac.lps), 'ltypes'], colors = histfile2id[names(dp2frac.lps), 'colors']) + dev.off() + } + + #Print table + max.dp = 101 + dp2frac.mat = dp2frac[[1]][1:max.dp] + for(jfile in 2:length(dp2frac)){ + dp2frac.mat = cbind(dp2frac.mat, dp2frac[[jfile]][1:max.dp]) + } + colnames(dp2frac.mat) = names(dp2frac) + + rownames(dp2frac.mat) = 0:(max.dp - 1) + write.table(dp2frac.mat, sep = '\t', quote = FALSE, col.names = NA, file = tab.outfile) + + #avg cov + avg.cov = unlist(lapply(hist.tabs, function(hist.tab){sum(as.numeric(hist.tab[, 'dp'] * hist.tab[, 'bases.cov'])) / hist.tab[1, 'feat.len']})) + write.table(round(avg.cov), sep='\t', quote = F, col.names = F, file = file.path(resdir, 'ccds.avgcov.tab')) + + #y-axis: n.genes, x-axis: avg.cov +} + +read.files <- function(histfile2id){ + + samples = rownames(histfile2id) + n.hist = length(samples) + + hist.tabs = list() + length(hist.tabs) = n.hist + names(hist.tabs) = samples + for(jsample in samples){ + hist.tab = read.table(histfile2id[jsample, 'filename'], sep='\t', 
stringsAsFactors=FALSE) + colnames(hist.tab) = c('feat', 'dp', 'bases.cov', 'feat.len', 'frac.cov') + hist.tabs[[jsample]] = hist.tab + } + + return(hist.tabs) +} + +get.dp2frac <- function(hist.tabs){ +#fraction of bases covered against dp + + n.hist = length(hist.tabs) + dp2frac = list() + length(dp2frac) = n.hist + hist.files = names(hist.tabs) + names(dp2frac) = hist.files + for(hist.file in hist.files){ + hist.tab = hist.tabs[[hist.file]] + dp2frac[[hist.file]] = rev(cumsum(hist.tab[rev(1:nrow(hist.tab)), 'frac.cov'])) + } + + return(dp2frac) +} + +get.dp2bases <- function(hist.tabs){ +#number of bases covered against dp + n.hist = length(hist.tabs) + dp2bases = list() + length(dp2bases) = n.hist + hist.files = names(hist.tabs) + names(dp2bases) = hist.files + for(hist.file in hist.files){ + hist.tab = hist.tabs[[hist.file]] + dp2bases[[hist.file]] = rev(cumsum(hist.tab[rev(1:nrow(hist.tab)), 'bases.cov'])) + } + + return(dp2bases) +} + +plot.dp2y <- function(dp2frac, max.plot.dp = 100, ylab = 'fraction covered', ltypes, colors, min.plot.dp = 1){ + + n.hist = length(dp2frac) + + ylim = range(unlist(dp2frac)) + ylim = c(0, 5e8) + jhist = 1 + plot(dp2frac[[jhist]][min.plot.dp:max.plot.dp], xlab = 'depth', ylab = ylab, col=colors[jhist], type = 'l', ylim = ylim, lty = ltypes[jhist]) + for(jhist in 2:n.hist){ + lines(dp2frac[[jhist]][min.plot.dp:max.plot.dp], col=colors[jhist], lty = ltypes[jhist]) + } + legend('topright', legend = names(dp2frac), lty = ltypes, col = colors) +} + +#execute +#main(histfile2id) diff --git a/GSNAP/bam/coverage.pl b/GSNAP/bam/coverage.pl new file mode 100644 index 0000000..4484e7e --- /dev/null +++ b/GSNAP/bam/coverage.pl @@ -0,0 +1,68 @@ +#!/usr/bin/perl + +#mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select chrom, size from hg19.chromInfo" | sed 's/^chr//' | sed 's/^M/MT/' > $chrsizes +#prg='/bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/coverage.pl' +#find '/proj/b2011075/analysis/hs_pipe/outdata/run1' -name 
'merged.bam' | awk -F'/' -v ods="$ods" -v projid="$projid" -v em="$em" -v prg="$prg" '{print "perl", prg, $0, $8, ods, projid, em;}' >bedtoolscov.sh + +use strict; + +#my ($bamfile, $sampleid, $ods, $aid, $em) = @ARGV; + +my $bed='/bubo/home/h26/edsgard/glob/annotation/human/CCDS-hg19.chrstripped.bed'; +my $gtf='/bubo/home/h26/edsgard/glob/annotation/human/Homo_sapiens.GRCh37.59.gtf'; +my $genome='/bubo/home/h26/edsgard/glob/annotation/human/hg19.chrsizes.chrstripped'; + +coverage((@ARGV, $bed, $gtf, $genome)); + +sub coverage { + + use File::Spec::Functions; + use File::Basename; + + my $bamfile = shift @_; + my $sampleid = shift @_; + my $ods = shift @_; + my $aid = shift @_; + my $em = shift @_; + my $bed = shift @_; + my $gtf = shift @_; + my $genome = shift @_; + my $time = '3:00:00'; + + my $sbatch_dir = catfile($ods, $sampleid, 'coverage'); + my $sbatch_file = catfile($sbatch_dir, "$sampleid.coverage.sbatch2"); + my $outdata_dir = catfile(dirname(dirname($bamfile)), 'coverage'); + my $outdata_infodir = catfile($outdata_dir, 'info'); + + #make dirs + `mkdir -p $outdata_infodir;`; #Creates the data and info dir + `mkdir -p $sbatch_dir;`; + + open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); + + #print sbatch vars + print SBATCH "#!/bin/bash -l", "\n"; + print SBATCH "#SBATCH -A ", $ods, "\n"; + print SBATCH "#SBATCH -t ", $time, "\n"; + print SBATCH "#SBATCH -J Cov_", $sampleid, "\n"; + print SBATCH "#SBATCH -p node -n 1", "\n"; + print SBATCH "#SBATCH -e $outdata_infodir/$sampleid.bedtools.cov.stderr2", "\n"; + print SBATCH "#SBATCH -o $outdata_infodir/$sampleid.bedtools.cov.stdout2", "\n"; + + unless ($em eq 0) { + print SBATCH "#SBATCH --mail-type=All", "\n"; + print SBATCH "#SBATCH --mail-user=$em", "\n\n"; + } + + #print vars + print SBATCH 'module load bioinfo-tools' . "\n"; + print SBATCH 'module load BEDTools/2.11.2' . 
"\n"; + print SBATCH 'echo "Running on: $(hostname)"',"\n\n"; + + #print cmd + print SBATCH 'coverageBed -abam ' . $bamfile . ' -b ' . $bed . ' -hist -split >' . catfile($outdata_dir, 'ccds.bedtools.out2') . "\n"; + print SBATCH 'coverageBed -abam ' . $bamfile . ' -b ' . $gtf . ' -hist -split >' . catfile($outdata_dir, 'ensembl.bedtools.out2') . "\n"; + print SBATCH 'genomeCoverageBed -ibam ' . $bamfile . ' -g ' . $genome . ' -split -max 1000 >' . catfile($outdata_dir, 'genome.bedtools.out2') . "\n"; + + close(SBATCH); +} diff --git a/GSNAP/bam/mergebams.pl b/GSNAP/bam/mergebams.pl new file mode 100644 index 0000000..175ed9a --- /dev/null +++ b/GSNAP/bam/mergebams.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl + +#find '/proj/b2011075/analysis/hs_pipe/outdata/run1' -name 'accepted_hits.bam' | awk -F'/' '{print "perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/rmdups.pl", $0, $10, "/proj/b2011075/analysis/hs_pipe/outscripts/run1 b2010035 daniel.edsgard@scilifelab.se";}' >rmdups.sh + +use strict; + +#my ($sampleid, $bamdir, $outdir, $sbatch_dir, $aid, $em) = @ARGV; + +mergesams(@ARGV); + +sub mergesams { + + use File::Spec::Functions; + use File::Basename; + + my $sampleid = shift @_; + my $bamdir = shift @_; + my $outdata_dir = shift @_; + my $sbatch_dir = shift @_; + my $aid = shift @_; + my $em = shift @_; + my $time = shift @_; + + if(!defined($time)){ + $time = '23:59:00'; + } + + my $sbatch_file = catfile($sbatch_dir, "$sampleid.mergesams.sbatch"); + my $outdata_infodir = catfile($outdata_dir, 'info'); + + #get bamfiles to be merged + my @bamfiles = `find $bamdir -name '*.bam' | grep $sampleid`; + chomp(@bamfiles); + + #set outfile + my $bamfile_merged = catfile($outdata_dir, $sampleid . 
'.bam'); + + #make dirs + `mkdir -p $outdata_infodir;`; #Creates the data and info dir + `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir + + open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); + + #print sbatch vars + print SBATCH "#!/bin/bash -l", "\n"; + print SBATCH "#SBATCH -A ", $aid, "\n"; + print SBATCH "#SBATCH -t ", $time, "\n"; + print SBATCH "#SBATCH -J mergesams_", $sampleid, "\n"; + print SBATCH "#SBATCH -p core -n 1", "\n"; + print SBATCH "#SBATCH -e $outdata_infodir/$sampleid" . '.mergesams.jid_%.stderr' . "\n"; + print SBATCH "#SBATCH -o $outdata_infodir/$sampleid" . '.mergesams.jid_%.stdout' . "\n"; + unless ($em eq 0) { + print SBATCH "#SBATCH --mail-type=All", "\n"; + print SBATCH "#SBATCH --mail-user=$em", "\n\n"; + } + + #print vars + print SBATCH 'module load bioinfo-tools' . "\n"; + print SBATCH 'module load picard/1.41' . "\n"; + + #print cmd + print SBATCH 'java -Xmx2g -jar /bubo/sw/apps/bioinfo/picard/1.41/MergeSamFiles.jar INPUT=' . join(' INPUT=', @bamfiles) . ' OUTPUT=' . $bamfile_merged . ' USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT' . 
"\n"; + close(SBATCH); +} diff --git a/GSNAP/bam/rmdups.pl b/GSNAP/bam/rmdups.pl new file mode 100644 index 0000000..d3b90a5 --- /dev/null +++ b/GSNAP/bam/rmdups.pl @@ -0,0 +1,60 @@ +#!/usr/bin/perl + +#perl rmdups.pl /proj/b2011075/analysis/hs_pipe/outdata/run1/1_unstim_12h/tophat/1_unstim_12h.lane_1.index_8.readp_1.fastq.1.filter.fastq/accepted_hits.bam 1_unstim_12h.lane_1.index_8 '/proj/b2011075/analysis/hs_pipe/outscripts/run1' 'b2010035' 'daniel.edsgard@scilifelab.se' +#find '/proj/b2011075/analysis/hs_pipe/outdata/run1' -name 'accepted_hits.bam' | awk -F'/' '{print "perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/rmdups.pl", $0, $10, "/proj/b2011075/analysis/hs_pipe/outscripts/run1 b2010035 daniel.edsgard@scilifelab.se";}' >rmdups.sh + +use strict; + +#my ($bamfile, $sampleid, $ods, $aid, $em) = @ARGV; + +rmDups(@ARGV); + +sub rmDups { + + use File::Spec::Functions; + use File::Basename; + + my $bamfile = shift @_; + my $sampleid = shift @_; + my $ods = shift @_; + my $aid = shift @_; + my $em = shift @_; + my $time = '9:00:00'; + + my $sbatch_dir = catfile($ods, $sampleid); + my $sbatch_file = catfile($sbatch_dir, "$sampleid.rmdups.sbatch"); + my $outdata_dir = dirname($bamfile); + my $outdata_infodir = catfile($outdata_dir, 'log'); + + #make dirs + `mkdir -p $outdata_infodir;`; #Creates the data and info dir + `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir + + open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); + + #print sbatch vars + print SBATCH "#!/bin/bash -l", "\n"; + print SBATCH "#SBATCH -A ", $aid, "\n"; + print SBATCH "#SBATCH -t ", $time, "\n"; + print SBATCH "#SBATCH -J rmDups_", $sampleid, "\n"; + print SBATCH "#SBATCH -p core -n 1", "\n"; + print SBATCH "#SBATCH -e $outdata_infodir/$sampleid.rmdups.stderr", "\n"; + print SBATCH "#SBATCH -o $outdata_infodir/$sampleid.rmdups.stdout", "\n"; + unless ($em eq 0) { + print SBATCH "#SBATCH --mail-type=All", "\n"; + print SBATCH "#SBATCH --mail-user=$em", 
"\n\n"; + } + + #print vars + print SBATCH 'module load bioinfo-tools' . "\n"; + print SBATCH 'module load picard/1.41' . "\n"; + print SBATCH 'echo "Running on: $(hostname)"',"\n\n"; + + #print cmd + my $bamfile_sorted = $bamfile . '.sorted'; + my $bamfile_nodup = $bamfile_sorted . '.nodup'; + print SBATCH 'java -Xmx2g -jar /bubo/sw/apps/bioinfo/picard/1.41/SortSam.jar INPUT=' . $bamfile . ' OUTPUT=' . $bamfile_sorted . ' SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT' . "\n"; + print SBATCH 'java -Xmx2g -jar /bubo/sw/apps/bioinfo/picard/1.41/MarkDuplicates.jar INPUT=' . $bamfile_sorted . ' OUTPUT=' . $bamfile_nodup . ' ASSUME_SORTED=true REMOVE_DUPLICATES=true METRICS_FILE=' . $sampleid . '_picardDup_metrics VALIDATION_STRINGENCY=LENIENT' . "\n"; + + close(SBATCH); +} diff --git a/GSNAP/basecount/basecount.sh b/GSNAP/basecount/basecount.sh new file mode 100644 index 0000000..2b162e4 --- /dev/null +++ b/GSNAP/basecount/basecount.sh @@ -0,0 +1,145 @@ +#Basecount (allelic count of each allele) +#DEP: dphetfilter.R +#DEP: mpileup.allelecount.pl +#DEP: vcfI162basecount.sh + + +######## +#DP and HET filter +######## + +vcfdir='/proj/b2012046/rani/data/varcalls' + +## +#VCF to tab format +## +PATH=${PATH}:/bubo/home/h26/edsgard/opt/vcftools_0.1.7/bin; export PATH +PERL5LIB=/bubo/home/h26/edsgard/opt/vcftools_0.1.7/lib; export PERL5LIB +vcftools --vcf allbams.vcf --extract-FORMAT-info DP --out genome +vcftools --vcf allbams.vcf --extract-FORMAT-info GT --out genome + +## +#Filter +## +#OUT: *.het.pos +mindp=10 +execdir='/bubo/home/h26/edsgard/glob/code/ase/sim' +Rscript ${execdir}/dphetfilter.R genome.DP.FORMAT genome.GT.FORMAT $mindp & +ls -1 *.het.pos | xargs -I% echo cat % '|' sed "'s/\./ /' >"%.hetvars >cmds.sh +rename genome.DP.FORMAT.het.pos. . *.hetvars + +#Dump "chr,pos,ref,alt" for these vars. (Used as input in snv.ase.R since the alt allele is seldom biallelic when running mpileup.allelecount.pl below.) 
+find $vcdir -maxdepth 1 -name '*.vcf' | grep -v 'allbams' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh +sh cmds.sh +ls -1 *.het.pos | xargs -I% echo sort % '>'%.sorted >cmds.sh +sh cmds.sh +ls -1 *.het.pos.sorted | sed 's/.genome.DP.FORMAT.het.pos.sorted//' >samples.list +cat samples.list | xargs -I% echo join -j1 %.genome.DP.FORMAT.het.pos.sorted %.genome.DP.FORMAT.het.pos '>' %.hetvars.alleles >cmds.sh +srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & +ls -1 *.hetvars.alleles | xargs -I% echo cut -d"' '" -f2-5 % "| tr ' ' '\t' >"%.tmp >cmds.sh +rename .hetvars.alleles.tmp .hetvars.alleles *.hetvars.alleles.tmp + + +################################################# +#BASECOUNT by MPILEUP (no varcall). +#First four of I16 == DP4 +################################################# +bamdir='/proj/b2012046/rani/analysis/gsnap/mergebam' +vcfdir='/proj/b2012046/rani/data/varcalls' +bcdir='/proj/b2012046/rani/data/gsnapbasecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +scriptdir='/proj/b2012046/rani/scripts/basecount' +projid='b2012046' +email='alva.rani@scilifelab.se' +time='4:00:00' +ref='/bubo/home/h24/alvaj/glob/annotation/gsnap/reference/reference.genome' + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list +cat samples.list | xargs -I% echo 'perl' ${execdir}'/mpileup.allelecount.pl' ${bamdir}/%.bam ${vcfdir}/%.hetvars $ref $scriptdir $projid $email $time $bcdir >cmds.sh +sh cmds.sh +find $scriptdir -name '*mpileup.basecount*' | xargs -I% sbatch % + +bcdir='/proj/b2012046/rani/data/basecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +find $bcdir -name '*nocall.vcf' >nocall.vcf.list +cat nocall.vcf.list | xargs -I% echo 'sh' ${execdir}/vcfI162basecount.sh % '>' %.basecount >cmds.sh +srun -p 
devel -t 1:00:00 -A b2012046 sh cmds.sh & + + +[alvaj@kalkyl4 basecount]$ nano basecount.sh +[alvaj@kalkyl4 basecount]$ cat basecount.sh +#Basecount (allelic count of each allele) +#DEP: dphetfilter.R +#DEP: mpileup.allelecount.pl +#DEP: vcfI162basecount.sh + + +######## +#DP and HET filter +######## + +vcfdir='/proj/b2012046/rani/data/varcalls' + +## +#VCF to tab format +## +PATH=${PATH}:/bubo/home/h26/edsgard/opt/vcftools_0.1.7/bin; export PATH +PERL5LIB=/bubo/home/h26/edsgard/opt/vcftools_0.1.7/lib; export PERL5LIB +vcftools --vcf allbams.vcf --extract-FORMAT-info DP --out genome +vcftools --vcf allbams.vcf --extract-FORMAT-info GT --out genome + +## +#Filter +## +#OUT: *.het.pos +mindp=10 +execdir='/bubo/home/h26/edsgard/glob/code/ase/sim' +Rscript ${execdir}/dphetfilter.R genome.DP.FORMAT genome.GT.FORMAT $mindp & +ls -1 *.het.pos | xargs -I% echo cat % '|' sed "'s/\./ /' >"%.hetvars >cmds.sh +rename genome.DP.FORMAT.het.pos. . *.hetvars + +#Dump "chr,pos,ref,alt" for these vars. (Used as input in snv.ase.R since the alt allele is seldom biallelic when running mpileup.allelecount.pl below.) +find $vcdir -maxdepth 1 -name '*.vcf' | grep -v 'allbams' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh +sh cmds.sh +ls -1 *.het.pos | xargs -I% echo sort % '>'%.sorted >cmds.sh +sh cmds.sh +ls -1 *.het.pos.sorted | sed 's/.genome.DP.FORMAT.het.pos.sorted//' >samples.list +cat samples.list | xargs -I% echo join -j1 %.genome.DP.FORMAT.het.pos.sorted %.genome.DP.FORMAT.het.pos '>' %.hetvars.alleles >cmds.sh +srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & +ls -1 *.hetvars.alleles | xargs -I% echo cut -d"' '" -f2-5 % "| tr ' ' '\t' >"%.tmp >cmds.sh +rename .hetvars.alleles.tmp .hetvars.alleles *.hetvars.alleles.tmp + + +################################################# +#BASECOUNT by MPILEUP (no varcall). 
+#First four of I16 == DP4 +################################################# +bamdir='/proj/b2012046/rani/analysis/gsnap/mergebam' +vcfdir='/proj/b2012046/rani/data/varcalls' +bcdir='/proj/b2012046/rani/data/gsnapbasecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +scriptdir='/proj/b2012046/rani/scripts/basecount' +projid='b2012046' +email='alva.rani@scilifelab.se' +time='10:00:00' +ref='/bubo/home/h24/alvaj/glob/annotation/gsnap/reference/reference.genome' + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list +cat samples.list | xargs -I% echo 'perl' ${execdir}'/mpileup.allelecount.pl' ${bamdir}/%.bam ${vcfdir}/%.hetvars $ref $scriptdir $projid $email $time $bcdir >cmds.sh +sh cmds.sh +find $scriptdir -name '*mpileup.basecount*' | xargs -I% sbatch % + +bcdir='/proj/b2012046/rani/data/basecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +find $bcdir -name '*nocall.vcf' >nocall.vcf.list +cat nocall.vcf.list | xargs -I% echo 'sh' ${execdir}/vcfI162basecount.sh % '>' %.basecount >cmds.sh +srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & + diff --git a/GSNAP/basecount/dphetfilter.R b/GSNAP/basecount/dphetfilter.R new file mode 100644 index 0000000..a782a5f --- /dev/null +++ b/GSNAP/basecount/dphetfilter.R @@ -0,0 +1,79 @@ + +#USAGE: Rscript dp.file gt.file 10 + +#get args +argv = commandArgs(trailingOnly=TRUE) +dp.file = argv[1] +gt.file = argv[2] +dp.min = as.integer(argv[3]) + +#Test-suite +if(0){ + datadir = '~/Dropbox/postdoc/projects/ase/data/codetest' + dp.file = file.path(datadir, 'head.dp') + gt.file = file.path(datadir, 'head.gt') + dp.min = 10 +} + +main <- function(dp.file, gt.file, dp.min){ + + #read + dp.mat = read.dp(dp.file) + gt.mat = read.gt(gt.file) + + #sort + dp.mat = dp.mat[rownames(gt.mat), ] + + #filter + rseq.pos.pass = dp.het.filter(dp.mat, 
gt.mat, dp.min) + + #dump + samples = names(rseq.pos.pass) + for(jsample in samples){ + hetvars.file = paste(jsample, dp.file, 'het', 'pos', sep='.') + write.table(rseq.pos.pass[[jsample]], quote = F, sep = '\t', col.names=F, row.names = F, file = hetvars.file) + } +} + +dp.het.filter <- function(dp.mat, gt.mat, dp.min){ + + dp.pass = lapply(dp.mat, function(ind.vals, minval){which(ind.vals >= minval)}, minval = dp.min) + het.pass = lapply(gt.mat, function(ind.vals, filt.val){which(ind.vals == filt.val)}, filt.val = 1) + samples = names(dp.pass) + sample2pass = lapply(samples, function(sample, dp.pass, het.pass){intersect(dp.pass[[sample]], het.pass[[sample]]);}, dp.pass = dp.pass, het.pass = het.pass) + rseq.pos.pass = lapply(sample2pass, function(ind.pass, pos){pos[ind.pass]}, pos = rownames(dp.mat)) + names(rseq.pos.pass) = samples + + return(rseq.pos.pass) +} + +read.gt <- function(gt.file){ + + gt.mat = read.table(gt.file, sep='\t', stringsAsFactors=F, header=T) + colnames(gt.mat) = gsub('.*\\.run1\\.(.*)_12h.*', '\\1', colnames(gt.mat)) + id.num = paste(gt.mat[, 'CHROM'], gt.mat[, 'POS'], sep='.') + rownames(gt.mat) = id.num + gt.mat = gt.mat[, setdiff(colnames(gt.mat), c('CHROM', 'POS'))] + + #sub 00, 01, 11 -> 0,1,2 + gt.num.mat = as.data.frame(lapply(gt.mat, function(ind.gt){ind.gt = gsub('0/0', '0', ind.gt); ind.gt = gsub('0/1', '1', ind.gt); ind.gt = gsub('1/1', '2', ind.gt); ind.gt = as.integer(ind.gt); return(ind.gt);})) + rownames(gt.num.mat) = rownames(gt.mat) + colnames(gt.num.mat) = gsub('^X', '', colnames(gt.num.mat)) + gt.mat = gt.num.mat + + return(gt.mat) +} + +read.dp <- function(dp.file){ + dp.mat = read.table(dp.file, sep='\t', stringsAsFactors=F, header=T) + colnames(dp.mat) = gsub('.*\\.run1\\.(.*)_12h.*', '\\1', colnames(dp.mat)) + colnames(dp.mat) = gsub('^X', '', colnames(dp.mat)) + id.num = paste(dp.mat[, 'CHROM'], dp.mat[, 'POS'], sep='.') + rownames(dp.mat) = id.num + dp.mat = dp.mat[, setdiff(colnames(dp.mat), c('CHROM', 'POS'))] + 
+ return(dp.mat) +} + +#Execute +main(dp.file, gt.file, dp.min) diff --git a/GSNAP/basecount/mpileup.allelecount.pl b/GSNAP/basecount/mpileup.allelecount.pl new file mode 100644 index 0000000..7ddcd31 --- /dev/null +++ b/GSNAP/basecount/mpileup.allelecount.pl @@ -0,0 +1,75 @@ +#!/usr/bin/perl + + +#-q1 mapQ filter to get "unique" alignments. + +use strict; +use File::Basename; +use File::Spec; + +my ($bamfile, $varsfile, $ref, $ods, $aid, $em, $time, $outdir) = @ARGV; + +if(!(defined($time))){ + $time = '2:00:00'; +} +if(!(defined($outdir))){ + $outdir = dirname($bamfile); +} + + +mpileup(($bamfile, $varsfile, $ref, $ods, $aid, $em, $time, $outdir)); + +sub mpileup { + + use File::Spec::Functions; + use File::Basename; + + my $bamfile = shift @_; + my $varsfile = shift @_; + my $ref = shift @_; + my $ods = shift @_; + my $aid = shift @_; + my $em = shift @_; + my $time = shift @_; + my $outdir = shift @_; + + my $bamfile_base = basename($bamfile); + my $sbatch_base = $bamfile; + $sbatch_base =~ s/\///; + $sbatch_base =~ s/\//\./g; + my $sbatch_dir = $ods; + my $sbatch_file = catfile($sbatch_dir, $sbatch_base . '.mpileup.basecount.sbatch'); + my $outdata_infodir = catfile($outdir, 'info'); + +# print $sbatch_file . "\n"; + + #make dirs + `mkdir -p $outdata_infodir;`; #Creates the data and info dir + `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir + + open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); + + #print sbatch varsfile + print SBATCH "#!/bin/bash -l", "\n"; + print SBATCH "#SBATCH -A ", $aid, "\n"; + print SBATCH "#SBATCH -t ", $time, "\n"; + print SBATCH "#SBATCH -J multimpileup", "\n"; + print SBATCH "#SBATCH -p core -n 1", "\n"; + print SBATCH "#SBATCH -e $outdata_infodir/$sbatch_base.mpileup.basecount" . '.jid_%j.stderr', "\n"; + print SBATCH "#SBATCH -o $outdata_infodir/$sbatch_base.mpileup.basecount" . 
'.jid_%j.stdout', "\n"; + unless ($em eq 0) { + print SBATCH "#SBATCH --mail-type=All", "\n"; + print SBATCH "#SBATCH --mail-user=$em", "\n\n"; + } + + #print varsfile + print SBATCH 'module load bioinfo-tools' . "\n"; + print SBATCH 'module load samtools/0.1.18' . "\n"; + + #print cmd + my $basecountfile = catfile($outdir, $bamfile_base . '.mpileup.nocall.vcf'); + + print SBATCH 'samtools mpileup -q1 -d10000 -L10000 -DSugf ' . $ref . ' -l ' . $varsfile . ' ' . $bamfile . q( | bcftools view - >) . $basecountfile . "\n"; + + close(SBATCH); +} diff --git a/GSNAP/basecount/vcfI162basecount.sh b/GSNAP/basecount/vcfI162basecount.sh new file mode 100644 index 0000000..0d6076b --- /dev/null +++ b/GSNAP/basecount/vcfI162basecount.sh @@ -0,0 +1,3 @@ + + +cat $1 | awk '{print $1, $2, $4, $5, $8;}' | perl -e 'while(){chomp; my ($chr, $pos, $ref, $alt, $info) = split; if($info =~ m/I16=(\d+,\d+,\d+,\d+)/){ $dp4=$1; @dp4arr=split(",", $dp4); $nref = $dp4arr[0] + $dp4arr[1]; $nalt = $dp4arr[2] + $dp4arr[3]; $tot = $nref + $nalt; if($nalt != 0){$frac = $nalt / $tot}else{$frac = 0;}; print join("\t", ($chr, $pos, $ref, $alt, $tot, $nref, $nalt, $frac)) . "\n";}}' diff --git a/GSNAP/map/run.gsnap.sh b/GSNAP/map/run.gsnap.sh new file mode 100644 index 0000000..7a61d44 --- /dev/null +++ b/GSNAP/map/run.gsnap.sh @@ -0,0 +1,104 @@ +#! 
/bin/bash -l +# download the software and DB +wget 'http://research-pub.gene.com/gmap/src/gmap-gsnap-2012-06-06.tar.gz' +wget 'http://research-pub.gene.com/gmap/genomes/hg19.tar' + +gsnapexecdir='/bubo/home/h24/alvaj/opt0/gmap-2012-06-06' + +#SNPs +### +# downloaded the needed file snp134,snp135common, +cd $snpdir +wget 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp135Common.txt.gz' +## done again saved in annotation/gsnap for latest version of gsanp +wget 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp135.txt.gz' + +#reformat---from here I have to run for the new version +gunzip -c snp135Common.txt.gz | ${gsnapexecdir}/dbsnp_iit -w 3 > ${snpfile}.txt & +gunzip -c snp135Common.txt.gz | ${gsnapexecdir}/dbsnp_iit -w 3 > dbsnp135.txt & +## done and save in annotation/gsnap |I used it here for new version) + +(cat ${snpfile}.txt |${gsnapexecdir}/iit_store -o $snpfile >${snpfile}.iitstore.out) >& ${snpfile}.iitstore.err & +##done for new version + +(snpindex -D $refdir -d $ref -V $snpdir -v $snpfile ${snpfile}.iit >snpindex.out) >& snpindex.err & +/bubo/home/h24/alvaj/opt0/gmap-2012-06-06/src/snpindex + +### +# Splice sites , to generate a splice site index---***splicesite*** +### +cat Homo_sapiens.GRCh37.59.gtf |${gsnapexecdir}/gtf_splicesites > snp.splicesiteschr +#(---done) + +# renamed file is splicesitechr +cat snp.splicesiteschr | awk '{print $1, "chr"$2, $3, $4;}' >splicesiteschr + + +# renamed file is splicesitechr + +## processing it to map file (changing to .iit file)---done for snp.splicesitechr +cat splicesiteschr | ${gsnapexecdir}/iit_store -o splicesiteschro +## done + +### +## Run gsnap +### + +#set path for for each of these variables +gsnapexecdir='/bubo/home/h24/alvaj/opt0/gmap-2012-06-12' +annotdir='/bubo/home/h24/alvaj/glob/annotation/gsnap' +scriptdir='/proj/b2012046/rani/scripts/gsnap/test' +fastqdir='/proj/b2012046/rani/data/synt/fastqfilt/oneGchunks' +outdir='/proj/b2012046/rani/analysis/gsnap' 
+splicefile=${annotdir}/'splicesiteschro' +refdir=${annotdir}/gmapdb +ref=hg19 +snpdir=${annotdir}/dbsnp +snpfile=dbsnp135 +projid='b2012046' +email='alva.rani@scilifelab.se' + +#Rename fastq files +cd $fastqdir +ln -s /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/oneGchunks/*.fastq . +#append readp_1 as suffix +ls -1 *.readp_1.*.fastq | xargs -I% mv % %.readp_1 +#rm first occurence of readp_1 +rename .readp_1.part .part *.readp_1 +#append readp_2 as suffix +ls -1 *.readp_2.*.fastq | xargs -I% mv % %.readp_2 +#rm first occurence of readp_2 +rename .readp_2.part .part *.readp_2 + +cd $scriptdir +find $fastqdir -maxdepth 1 -name '*fastq.readp_1' | sed 's/.readp_1//' >fastq.files.prefix + +samples=(`cat fastq.files.prefix`) + +for fastqfile in ${samples[@]} +do +sbatchfile=`basename $fastqfile` +echo $sbatchfile +(cat <${outdir}/${sbatchfile}.bam +EOF +)>${sbatchfile}.sbatch +done +find . -name '*.fastq.sbatch' | xargs -I% sbatch % +#status: submitted + +############ + diff --git a/GSNAP/varcall/varcall/mpileup_multisample.pl b/GSNAP/varcall/varcall/mpileup_multisample.pl new file mode 100644 index 0000000..821e8d0 --- /dev/null +++ b/GSNAP/varcall/varcall/mpileup_multisample.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl + +#find '/proj/b2011075/analysis/hs_pipe/outdata/run1/varcalls' -name 'allbams.chr_*' | xargs -I% echo 'perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/mpileup_multisample.pl' % '/proj/b2011075/analysis/hs_pipe/outscripts/run1/varcalls b2010035 daniel.edsgard@scilifelab.se' >cmds.sh + +#-q1 mapQ filter to get "unique" alignments. +#No max dp varFilter since RNA-seq data. 
+ +use strict; + +my ($bamfilelist, $region, $ods, $aid, $em, $ref, $time, $partition) = @ARGV; + +if(!(defined($ref))){ + $ref = '/bubo/home/h26/edsgard/glob/annotation/human/Homo_sapiens.GRCh37.57.dna.concat.fa'; +} +if(!(defined($time))){ + $time = '23:00:00'; +} +if(!(defined($partition))){ + $partition = 'node'; +} + +mpileup(($bamfilelist, $region, $ods, $aid, $em, $ref, $time, $partition)); + +sub mpileup { + + use File::Spec::Functions; + use File::Basename; + + my $bamfilelist = shift @_; + my $region = shift @_; + my $ods = shift @_; + my $aid = shift @_; + my $em = shift @_; + my $ref = shift @_; + my $time = shift @_; + my $partition = shift @_; + + my $bamfiles_base = basename($bamfilelist); + my $sbatch_dir = $ods; + my $sbatch_file = catfile($sbatch_dir, $bamfiles_base . '.' . $region . '.mpileup.sbatch'); + my $outdata_dir = dirname($bamfilelist); + my $outdata_infodir = catfile($outdata_dir, 'info'); + + #make dirs + `mkdir -p $outdata_infodir;`; #Creates the data and info dir + `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir + + open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); + + #print sbatch vars + print SBATCH "#!/bin/bash -l", "\n"; + print SBATCH "#SBATCH -A ", $aid, "\n"; + print SBATCH "#SBATCH -t ", $time, "\n"; + print SBATCH "#SBATCH -J multimpileup", "\n"; + print SBATCH '#SBATCH -p ', $partition, ' -n 1', "\n"; + print SBATCH "#SBATCH -e $outdata_infodir/$bamfiles_base.$region.multimpileup" . '.jid_%j.stderr', "\n"; + print SBATCH "#SBATCH -o $outdata_infodir/$bamfiles_base.$region.multimpileup" . '.jid_%j.stdout', "\n"; + unless ($em eq 0) { + print SBATCH "#SBATCH --mail-type=All", "\n"; + print SBATCH "#SBATCH --mail-user=$em", "\n\n"; + } + + #print vars + print SBATCH 'module load bioinfo-tools' . "\n"; + print SBATCH 'module load samtools/0.1.18' . "\n"; + print SBATCH 'echo "Running on: $(hostname)"',"\n\n"; + + #print cmd + my $vcffile = $bamfilelist . '.' . $region . 
'.vcf'; + print SBATCH 'samtools mpileup -q1 -d10000 -L10000 -DSugf ' . $ref . ' -r ' . $region . ' -b ' . $bamfilelist . ' | bcftools view -vcg - | vcfutils.pl varFilter >' . $vcffile . "\n"; + + close(SBATCH); +} diff --git a/GSNAP/varcall/varcall/varcall.sh b/GSNAP/varcall/varcall/varcall.sh new file mode 100644 index 0000000..3ea6f2d --- /dev/null +++ b/GSNAP/varcall/varcall/varcall.sh @@ -0,0 +1,57 @@ +#Varcalling with SAMtools +#DEP: mpileup_multisample.pl + + +### +#VarCall (SAMtools) +### + +projid='b2012046' +email='alva.rani@scilifelab.se' +sbatchdir='/proj/b2012046/rani/scripts/gsnap/varscripts' +bamfile='/proj/b2012046/rani/analysis/gsnap/mergebam' +vcfdir='/proj/b2012046/rani/data/varcalls' +ref='/bubo/home/h24/alvaj/glob/annotation/gsnap/reference/reference.genome' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/varcall' + +cd $sbatchdir +# need to redo it again with merged bam files +#create file listing the bam files +allbamsfile=${vcfdir}/allbams.list +find $bamfile -maxdepth 1 -name '*.bam' >$allbamsfile + +#create region files +module load bioinfo-tools +module load samtools/0.1.18 +srun -p devel -A b2012046 -t 1:00:00 samtools faidx $ref & +vcfutils.pl splitchr -l 6600000 ${ref}.fai >${vcfdir}/genome.480.regs + + +#Run mpileup +cat ${vcfdir}/genome.480.regs | xargs -I% echo 'perl' ${execdir}/mpileup_multisample.pl $allbamsfile % $sbatchdir $projid $email $ref >cmds.sh +sh cmds.sh +find $sbatchdir -name '*.mpileup.sbatch' | xargs -I% sbatch % + + +#check status: +cd $vcfdir +lt *.stderr >allerrfiles.tmp +squeue -u alvaj | grep -v JOBID | awk '{print $1;}' | xargs -I% grep % ${vcfdir}/info/allerrfiles.tmp + +#Catenate all region-specific vcfs +find $vcfdir -name 'allbams.*.vcf' | xargs cat | grep -v '^#' >${vcfdir}/allbams.vcf.tmp +cat allbams.list.chr15:1-6600000.vcf | grep '^#' >header.tmp +cat header.tmp allbams.vcf.tmp >allbams.vcf +rm header.tmp allbams.vcf.tmp +wc -l allbams.vcf +#26 + +#Rm slashes ilon vcf sample header 
+vcfdir='/proj/b2012046/rani/data/varcalls' +cd ${vcfdir} +cat allbams.vcf | grep '^#' >header.tmp +cat header.tmp | sed 's/\/proj\/b2012046\/rani\/analysis\/gsnap\///g' | sed 's/\.bam//g' >file.tmp +mv file.tmp header.tmp +grep -v '^#' allbams.vcf >allbams.vcf.noheader +cat header.tmp allbams.vcf.noheader >allbams.vcf +rm allbams.vcf.noheader header.tmp From fac6ed8cf596366cd4bf33511563ff5ecdabc6e8 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 14:05:54 +0200 Subject: [PATCH 05/10] Adding new Degner --- Degner/Bam/bamproc.degner.sh | 90 ++++++++++++++ Degner/Bam/mergebams.pl | 64 ++++++++++ Degner/Basecount/basecount.degner.sh | 145 +++++++++++++++++++++++ Degner/VArcall/degner.varcall.sh | 57 +++++++++ Degner/VArcall/mpileup_multisample.pl | 73 ++++++++++++ Degner/map/run.degner.sh | 47 ++++++++ Degner/map/tophat.gensbatch.pl | 65 ++++++++++ Degner/map/tophat.singleend.gensbatch.pl | 66 +++++++++++ 8 files changed, 607 insertions(+) create mode 100644 Degner/Bam/bamproc.degner.sh create mode 100644 Degner/Bam/mergebams.pl create mode 100644 Degner/Basecount/basecount.degner.sh create mode 100644 Degner/VArcall/degner.varcall.sh create mode 100644 Degner/VArcall/mpileup_multisample.pl create mode 100644 Degner/map/run.degner.sh create mode 100644 Degner/map/tophat.gensbatch.pl create mode 100644 Degner/map/tophat.singleend.gensbatch.pl diff --git a/Degner/Bam/bamproc.degner.sh b/Degner/Bam/bamproc.degner.sh new file mode 100644 index 0000000..5974dfe --- /dev/null +++ b/Degner/Bam/bamproc.degner.sh @@ -0,0 +1,90 @@ +#Processing bam files +#DEP: rmdups.pl +#DEP: mergebams.pl + + +### +#rmDups +### +email='alva.rani@scilifelab.se' +projid='b2012046' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/bam' +sbatchdir='/proj/b2012046/rani/scripts/degner/bamscripts' +bamfile='/proj/b2012046/rani/analysis/degner' +cd $sbatchdir +find $bamfile -name 'accepted_hits.bam' | awk -F'/' -v execdir=$execdir -v sbatchdir=$sbatchdir -v projid=$projid -v email=$email 
'{print "perl", execdir"/rmdups.pl", $0, $7, sbatchdir, projid, email;}' >rmdups.sh +sh rmdups.sh +find . -name '*.sbatch' | xargs -I% sbatch % +#### +find . -name '*.sorted' | wc -l + +#Check: +find $bamfile -name '*.bam.sorted.nodup' | xargs -I% ls -lrth % +find $bamfile -name '*.rmdups.stderr' | xargs -I% grep -i 'err' +find $bamfile -name '*.rmdups.stderr' | xargs -I% grep -i 'warn' + +#rm unsorted bams +find $bamfile -name '*.bam' | xargs -I% rm % + +#rename bamfiles +find -name '*.bam.sorted' | xargs -I% echo rename '.bam.sorted' '.sorted.bam' % >cmds.sh +sh cmds.sh + +#rm unsorted bams +find $bamdir -name 'accepted_hits.bam' | grep -v 'mmseq' | xargs -I% rm % + +#rename bamfiles +find $bamdir -name '*.bam.sorted.nodup' | xargs -I% echo rename '.bam.sorted.nodup' '.sorted.nodup.bam' % >cmds.sh +sh cmds.sh +#### + +### +#Merge and sort bam files +### +projid='b2012046' +email='alva.rani@scilifelab.se' +sbatchdir='/proj/b2012046/rani/scripts/degner/mergebamScripts' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/bam' +bamsort='/proj/b2012046/rani/analysis/degner/' +outdir='/proj/b2012046/rani/analysis/degner/mergebam' +time='11:00:00' +cd $sbatchdir +find $bamsort -name '*.sorted.nodup.bam' | sed 's/\..*//' | sort -u >samples.list +cat samples.list | xargs -I% basename % | xargs -I% echo perl ${execdir}'/mergebams.pl' % $bamsort $outdir $sbatchdir $projid $email $time >cmds.sh +sh cmds.sh +find $sbatchdir -name '*.mergesams.sbatch' | xargs -I% sbatch % + + + +## Submitted +#Check: +find ${bamfile}/info/*.stderr | xargs -I% grep -i 'err' % +find ${bamfile}/info/*.mergesams.*.stderr | xargs -I% grep -i 'warn' % +#### + +### +#Create indexes +### +bamsort='/proj/b2012046/rani/analysis/degner/mergebam' +find $bamsort -maxdepth 1 -name '*.bam' | xargs -I% echo samtools index % > cmd.sh +#Manually add sbatch header to the cmds.sh script +(cat <bamindex.sbatchheader +cat bamindex.sbatchheader cmd.sh >bamindex.sbatch +sbatch bamindex.sbatch + +############################ 
+## Done all steps here without errors + diff --git a/Degner/Bam/mergebams.pl b/Degner/Bam/mergebams.pl new file mode 100644 index 0000000..d42669f --- /dev/null +++ b/Degner/Bam/mergebams.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl + +#find '/proj/b2011075/analysis/hs_pipe/outdata/run1' -name 'accepted_hits.bam' | awk -F'/' '{print "perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/rmdups.pl", $0, $10, "/proj/b2011075/analysis/hs_pipe/outscripts/run1 b2010035 daniel.edsgard@scilifelab.se";}' >rmdups.sh + +use strict; + +#my ($sampleid, $bamdir, $outdir, $sbatch_dir, $aid, $em) = @ARGV; + +mergesams(@ARGV); + +sub mergesams { + + use File::Spec::Functions; + use File::Basename; + + my $sampleid = shift @_; + my $bamdir = shift @_; + my $outdata_dir = shift @_; + my $sbatch_dir = shift @_; + my $aid = shift @_; + my $em = shift @_; + my $time = shift @_; + + if(!defined($time)){ + $time = '23:59:00'; + } + + my $sbatch_file = catfile($sbatch_dir, "$sampleid.mergesams.sbatch"); + my $outdata_infodir = catfile($outdata_dir, 'info'); + + #get bamfiles to be merged + my @bamfiles = `find $bamdir -name '*.bam' | grep $sampleid`; + chomp(@bamfiles); + + #set outfile + my $bamfile_merged = catfile($outdata_dir, $sampleid . '.merged.bam'); + + #make dirs + `mkdir -p $outdata_infodir;`; #Creates the data and info dir + `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir + + open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); + + #print sbatch vars + print SBATCH "#!/bin/bash -l", "\n"; + print SBATCH "#SBATCH -A ", $aid, "\n"; + print SBATCH "#SBATCH -t ", $time, "\n"; + print SBATCH "#SBATCH -J mergesams_", $sampleid, "\n"; + print SBATCH "#SBATCH -p core -n 1", "\n"; + print SBATCH "#SBATCH -e $outdata_infodir/$sampleid" . '.mergesams.jid_%.stderr' . "\n"; + print SBATCH "#SBATCH -o $outdata_infodir/$sampleid" . '.mergesams.jid_%.stdout' . 
"\n"; + unless ($em eq 0) { + print SBATCH "#SBATCH --mail-type=All", "\n"; + print SBATCH "#SBATCH --mail-user=$em", "\n\n"; + } + + #print vars + print SBATCH 'module load bioinfo-tools' . "\n"; + print SBATCH 'module load samtools/0.1.18' . "\n"; + + #print cmd + print SBATCH 'samtools merge'. ' ' OUTPUT=' . $bamfile_merged . ' INPUT=' . join(' INPUT=', @bamfiles) ' . "\n"; + close(SBATCH); +} diff --git a/Degner/Basecount/basecount.degner.sh b/Degner/Basecount/basecount.degner.sh new file mode 100644 index 0000000..4163659 --- /dev/null +++ b/Degner/Basecount/basecount.degner.sh @@ -0,0 +1,145 @@ +#Basecount (allelic count of each allele) +#DEP: dphetfilter.R +#DEP: mpileup.allelecount.pl +#DEP: vcfI162basecount.sh + + +######## +#DP and HET filter +######## + +vcfdir='/proj/b2012046/rani/data/degvarcall' + +## +#VCF to tab format +## +PATH=${PATH}:/bubo/home/h26/edsgard/opt/vcftools_0.1.7/bin; export PATH +PERL5LIB=/bubo/home/h26/edsgard/opt/vcftools_0.1.7/lib; export PERL5LIB +vcftools --vcf allbams.vcf --extract-FORMAT-info DP --out genome +vcftools --vcf allbams.vcf --extract-FORMAT-info GT --out genome + +## +#Filter +## +#OUT: *.het.pos +mindp=10 +execdir='/bubo/home/h26/edsgard/glob/code/ase/sim' +Rscript ${execdir}/dphetfilter.R genome.DP.FORMAT genome.GT.FORMAT $mindp & +ls -1 *.het.pos | xargs -I% echo cat % '|' sed "'s/\./ /' >"%.hetvars >cmds.sh +rename genome.DP.FORMAT.het.pos. . *.hetvars + +#Dump "chr,pos,ref,alt" for these vars. (Used as input in snv.ase.R since the alt allele is seldom biallelic when running mpileup.allelecount.pl below.) 
+find $vcdir -maxdepth 1 -name '*.vcf' | grep -v 'allbams' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh +sh cmds.sh +ls -1 *.het.pos | xargs -I% echo sort % '>'%.sorted >cmds.sh +sh cmds.sh +ls -1 *.het.pos.sorted | sed 's/.genome.DP.FORMAT.het.pos.sorted//' >samples.list +cat samples.list | xargs -I% echo join -j1 %.genome.DP.FORMAT.het.pos.sorted %.genome.DP.FORMAT.het.pos '>' %.hetvars.alleles >cmds.sh +srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & +ls -1 *.hetvars.alleles | xargs -I% echo cut -d"' '" -f2-5 % "| tr ' ' '\t' >"%.tmp >cmds.sh +rename .hetvars.alleles.tmp .hetvars.alleles *.hetvars.alleles.tmp + + +################################################# +#BASECOUNT by MPILEUP (no varcall). +#First four of I16 == DP4 +################################################# +bamdir='/proj/b2012046/rani/analysis/degner/mergebam' +vcfdir='/proj/b2012046/rani/data/degvarcall' +bcdir='/proj/b2012046/rani/data/degnerbasecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +scriptdir='/proj/b2012046/rani/scripts/degner/basecount' +projid='b2012046' +email='alva.rani@scilifelab.se' +time='10:00:00' +ref='/bubo/home/h24/alvaj/glob/annotation/degner/reference/Homo_sapiens.maskedRefe.GRCh37.57.fa' + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list +cat samples.list | xargs -I% echo 'perl' ${execdir}'/mpileup.allelecount.pl' ${bamdir}/%.bam ${vcfdir}/%.hetvars $ref $scriptdir $projid $email $time $bcdir >cmds.sh +sh cmds.sh +find $scriptdir -name '*mpileup.basecount*' | xargs -I% sbatch % + +bcdir='/proj/b2012046/rani/data/basecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +find $bcdir -name '*nocall.vcf' >nocall.vcf.list +cat nocall.vcf.list | xargs -I% echo 'sh' ${execdir}/vcfI162basecount.sh % '>' 
%.basecount >cmds.sh +srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & + + +[alvaj@kalkyl4 basecount]$ nano basecount.sh +[alvaj@kalkyl4 basecount]$ cat basecount.sh +#Basecount (allelic count of each allele) +#DEP: dphetfilter.R +#DEP: mpileup.allelecount.pl +#DEP: vcfI162basecount.sh + + +######## +#DP and HET filter +######## + +vcfdir='/proj/b2012046/rani/data/varcalls' + +## +#VCF to tab format +## +PATH=${PATH}:/bubo/home/h26/edsgard/opt/vcftools_0.1.7/bin; export PATH +PERL5LIB=/bubo/home/h26/edsgard/opt/vcftools_0.1.7/lib; export PERL5LIB +vcftools --vcf allbams.vcf --extract-FORMAT-info DP --out genome +vcftools --vcf allbams.vcf --extract-FORMAT-info GT --out genome + +## +#Filter +## +#OUT: *.het.pos +mindp=10 +execdir='/bubo/home/h26/edsgard/glob/code/ase/sim' +Rscript ${execdir}/dphetfilter.R genome.DP.FORMAT genome.GT.FORMAT $mindp & +ls -1 *.het.pos | xargs -I% echo cat % '|' sed "'s/\./ /' >"%.hetvars >cmds.sh +rename genome.DP.FORMAT.het.pos. . *.hetvars + +#Dump "chr,pos,ref,alt" for these vars. (Used as input in snv.ase.R since the alt allele is seldom biallelic when running mpileup.allelecount.pl below.) +find $vcdir -maxdepth 1 -name '*.vcf' | grep -v 'allbams' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh +sh cmds.sh +ls -1 *.het.pos | xargs -I% echo sort % '>'%.sorted >cmds.sh +sh cmds.sh +ls -1 *.het.pos.sorted | sed 's/.genome.DP.FORMAT.het.pos.sorted//' >samples.list +cat samples.list | xargs -I% echo join -j1 %.genome.DP.FORMAT.het.pos.sorted %.genome.DP.FORMAT.het.pos '>' %.hetvars.alleles >cmds.sh +srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & +ls -1 *.hetvars.alleles | xargs -I% echo cut -d"' '" -f2-5 % "| tr ' ' '\t' >"%.tmp >cmds.sh +rename .hetvars.alleles.tmp .hetvars.alleles *.hetvars.alleles.tmp + + +################################################# +#BASECOUNT by MPILEUP (no varcall). 
+#First four of I16 == DP4 +################################################# +bamdir='/proj/b2012046/rani/analysis/gsnap/mergebam' +vcfdir='/proj/b2012046/rani/data/varcalls' +bcdir='/proj/b2012046/rani/data/gsnapbasecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +scriptdir='/proj/b2012046/rani/scripts/basecount' +projid='b2012046' +email='alva.rani@scilifelab.se' +time='10:00:00' +ref='/bubo/home/h24/alvaj/glob/annotation/gsnap/reference/reference.genome' + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list + +cd $scriptdir +find $bamdir -maxdepth 1 -name '*.bam' | xargs -I% basename % | sed 's/\.bam//' >samples.list +cat samples.list | xargs -I% echo 'perl' ${execdir}'/mpileup.allelecount.pl' ${bamdir}/%.bam ${vcfdir}/%.hetvars $ref $scriptdir $projid $email $time $bcdir >cmds.sh +sh cmds.sh +find $scriptdir -name '*mpileup.basecount*' | xargs -I% sbatch % + +bcdir='/proj/b2012046/rani/data/basecount' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/basecount' +find $bcdir -name '*nocall.vcf' >nocall.vcf.list +cat nocall.vcf.list | xargs -I% echo 'sh' ${execdir}/vcfI162basecount.sh % '>' %.basecount >cmds.sh +srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & + diff --git a/Degner/VArcall/degner.varcall.sh b/Degner/VArcall/degner.varcall.sh new file mode 100644 index 0000000..e57d11f --- /dev/null +++ b/Degner/VArcall/degner.varcall.sh @@ -0,0 +1,57 @@ +#Varcalling with SAMtools +#DEP: mpileup_multisample.pl + + +### +#VarCall (SAMtools) +### + +projid='b2012046' +email='alva.rani@scilifelab.se' +sbatchdir='/proj/b2012046/rani/scripts/degner/varscripts' +bamfile='/proj/b2012046/rani/analysis/degner/mergebam' +vcfdir='/proj/b2012046/rani/data/degvarcall' +ref='/bubo/home/h24/alvaj/glob/annotation/degner/reference/Homo_sapiens.maskedRefe.GRCh37.57.fa' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/varcall' + +cd $sbatchdir +# need to redo it again with merged bam files +#create file listing the 
bam files +allbamsfile=${vcfdir}/allbams.list +find $bamfile -maxdepth 1 -name '*.bam' >$allbamsfile + +#create region files +module load bioinfo-tools +module load samtools/0.1.18 +srun -p devel -A b2012046 -t 1:00:00 samtools faidx $ref & +vcfutils.pl splitchr -l 6600000 ${ref}.fai >${vcfdir}/genome.480.regs + + +#Run mpileup +cat ${vcfdir}/genome.480.regs | xargs -I% echo 'perl' ${execdir}/mpileup_multisample.pl $allbamsfile % $sbatchdir $projid $email $ref >cmds.sh +sh cmds.sh +find $sbatchdir -name '*.mpileup.sbatch' | xargs -I% sbatch % + + +#check status: +cd $vcfdir +lt *.stderr >allerrfiles.tmp +squeue -u alvaj | grep -v JOBID | awk '{print $1;}' | xargs -I% grep % ${vcfdir}/info/allerrfiles.tmp + +#Catenate all region-specific vcfs +find $vcfdir -name 'allbams.*.vcf' | xargs cat | grep -v '^#' >${vcfdir}/allbams.vcf.tmp +cat allbams.list.MT:1-16569.vcf | grep '^#' >header.tmp +cat header.tmp allbams.vcf.tmp >allbams.vcf +rm header.tmp allbams.vcf.tmp +wc -l allbams.vcf +#26 + +#Rm slashes ilon vcf sample header +vcfdir='/proj/b2012046/rani/data/varcalls' +cd ${vcfdir} +cat allbams.vcf | grep '^#' >header.tmp +cat header.tmp | sed 's/\/proj\/b2012046\/rani\/analysis\/degner\/mergebam\///g' | sed 's/\.bam//g' >file.tmp +mv file.tmp header.tmp +grep -v '^#' allbams.*.vcf >allbams.vcf.noheader +cat header.tmp allbams.vcf.noheader >allbams.vcf +rm allbams.vcf.noheader header.tmp diff --git a/Degner/VArcall/mpileup_multisample.pl b/Degner/VArcall/mpileup_multisample.pl new file mode 100644 index 0000000..4f24dd6 --- /dev/null +++ b/Degner/VArcall/mpileup_multisample.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl + +#find '/proj/b2011075/analysis/hs_pipe/outdata/run1/varcalls' -name 'allbams.chr_*' | xargs -I% echo 'perl /bubo/home/h26/edsgard/glob/code/ngs_pipes/hs_pipe/mpileup_multisample.pl' % '/proj/b2011075/analysis/hs_pipe/outscripts/run1/varcalls b2010035 daniel.edsgard@scilifelab.se' >cmds.sh + +#-q1 mapQ filter to get "unique" alignments. 
+#No max dp varFilter since RNA-seq data. + +use strict; + +my ($bamfilelist, $region, $ods, $aid, $em, $ref, $time, $partition) = @ARGV; + +if(!(defined($ref))){ + $ref = '/bubo/home/h26/edsgard/glob/annotation/human/Homo_sapiens.GRCh37.57.dna.concat.fa'; +} +if(!(defined($time))){ + $time = '23:00:00'; +} +if(!(defined($partition))){ + $partition = 'node'; +} + +mpileup(($bamfilelist, $region, $ods, $aid, $em, $ref, $time, $partition)); + +sub mpileup { + + use File::Spec::Functions; + use File::Basename; + + my $bamfilelist = shift @_; + my $region = shift @_; + my $ods = shift @_; + my $aid = shift @_; + my $em = shift @_; + my $ref = shift @_; + my $time = shift @_; + my $partition = shift @_; + + my $bamfiles_base = basename($bamfilelist); + my $sbatch_dir = $ods; + my $sbatch_file = catfile($sbatch_dir, $bamfiles_base . '.' . $region . '.mpileup.sbatch'); + my $outdata_dir = dirname($bamfilelist); + my $outdata_infodir = catfile($outdata_dir, 'info'); + + #make dirs + `mkdir -p $outdata_infodir;`; #Creates the data and info dir + `mkdir -p $sbatch_dir;`; #Creates the sbatch-script dir + + open (SBATCH, '>', $sbatch_file) or die("Can't write to $sbatch_file: $!\n"); + + #print sbatch vars + print SBATCH "#!/bin/bash -l", "\n"; + print SBATCH "#SBATCH -A ", $aid, "\n"; + print SBATCH "#SBATCH -t ", $time, "\n"; + print SBATCH "#SBATCH -J multimpileup", "\n"; + print SBATCH '#SBATCH -p ', $partition, ' -n 1', "\n"; + print SBATCH "#SBATCH -e $outdata_infodir/$bamfiles_base.$region.multimpileup" . '.jid_%j.stderr', "\n"; + print SBATCH "#SBATCH -o $outdata_infodir/$bamfiles_base.$region.multimpileup" . '.jid_%j.stdout', "\n"; + unless ($em eq 0) { + print SBATCH "#SBATCH --mail-type=All", "\n"; + print SBATCH "#SBATCH --mail-user=$em", "\n\n"; + } + + #print vars + print SBATCH 'module load bioinfo-tools' . "\n"; + print SBATCH 'module load samtools/0.1.18' . 
"\n"; + print SBATCH 'echo "Running on: $(hostname)"',"\n\n"; + + #print cmd + my $vcffile = $bamfilelist . '.' . $region . '.vcf'; + print SBATCH 'samtools mpileup -q1 -d10000 -L10000 -DSugf ' . $ref . ' -r ' . $region . ' -b ' . $bamfilelist. ' | bcftools view -vcg - | vcfutils.pl varFilter >' . $vcffile . "\n"; + + close(SBATCH); +} diff --git a/Degner/map/run.degner.sh b/Degner/map/run.degner.sh new file mode 100644 index 0000000..da210e7 --- /dev/null +++ b/Degner/map/run.degner.sh @@ -0,0 +1,47 @@ +#Map with tophat +#DEP: tophat.gensbatch.pl + + +### +#Tophat +### +#Longest run: 5h38min, 6G file for each readpair: 8_LPS.readp_1.2.ill.fastq +email='alva.rani@scilifelab.se' +projid='b2012046' +execdir='/bubo/home/h24/alvaj/glob/code/ASE/map' +fastqdir='/proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt' +fastqsuffix='.filter.fastq' +outdir='/proj/b2012046/rani/analysis/degner' +sbatchdir='/proj/b2012046/rani/scripts/degner' + +threads='8' +ref='/bubo/home/h24/alvaj/glob/code/ASE/degner/bowtie-0.12.8/indexes/ref/Ref.fa ' + +time='23:00:00' + +threads='8' +annotdir='/bubo/home/h26/edsgard/glob/annotation/human/' +gtf=${annotdir}'Homo_sapiens.GRCh37.59.gtf' +time='23:00:00' #Longest run: 7h10min: Input: ~7.3G per readpair fastq file +readlen='100' +isizefile='/proj/b2011075/data/rnaseq/bioanalyzer/isizes.only.tab' +isizedev='75' #very rough approx from the bioanalyzer plots. Also, we use trimming which further adds to the deviation. 
+ +cd $fastqdir +## creating a sample +ls *.fastq | awk '{split($0,b,"."); print b[1]}' > ${sbatchdir}/samples.list +cd $sbatchdir +rm cmds.sh + +samples=(`cat samples.list`) +for sample in ${samples[@]} +do +find $fastqdir -maxdepth 1 -name ${sample}'*' | sed 's/[12].filter.fastq//' | grep -v '.S.filter.fastq' | sort -u >fastqfiles.prefix.list + cat fastqfiles.prefix.list | xargs -I% echo perl ${execdir}/'tophat.gensbatch.pl' $sample % $fastqsuffix $outdir $threads $gtf $ref $isizefile $sbatchdir $projid $email $time $readlen $isizedev >>cmds.sh +done +sh cmds.sh +find . -name '*.tophat.sbatch' | xargs -I% sbatch % + +## Submitted again + + diff --git a/Degner/map/tophat.gensbatch.pl b/Degner/map/tophat.gensbatch.pl new file mode 100644 index 0000000..ba15215 --- /dev/null +++ b/Degner/map/tophat.gensbatch.pl @@ -0,0 +1,65 @@ +#!/usr/bin/perl + +use strict; +use File::Basename; +use File::Spec::Functions; + + +my ($sample, $fastqprefix, $fastqsuffix, $outdir, $threads, $gtf, $ref, $isizefile, $ods, $projid, $email, $time, $readlen, $isizedev) = @ARGV; + +#set vars +my $fastqfile_1 = $fastqprefix . '1' . $fastqsuffix; +my $fastqfile_2 = $fastqprefix . '2' . $fastqsuffix; +my $fastqprefix_base = basename($fastqprefix); +my $outdir = catfile($outdir, $fastqprefix_base); +$outdir =~ s/\.$//; + +#set sbatch file +my $sbatch_file = catfile($ods, $fastqprefix_base . $fastqsuffix . '.tophat.sbatch'); +$sbatch_file =~ s/\.\./\./; + +#set insert size (NB: actually inner distance) +my $isize; +open(ISIZE, '<', $isizefile) or die("Couldnt open $isizefile, $!\n"); +while(defined(my $line = )){ + chomp($line); +# print 'DEBUG: ' . $line . "\n"; + my ($jsample, $isize_jsample) = split(/\t/, $line); +# print 'DEBUG: jsample: ' . $jsample . "\n"; +# print 'DEBUG: sample: ' . $sample . 
"\n"; +# my ($isize_jsample, $isize_jsample, $isizedev_jsample) = split(/ /, $line); + if($jsample =~ $sample){ + $isize = $isize_jsample; + $isize = $isize - 2 * $readlen; #Tophat uses the inner distance: isize - 2*readlen +# $isizedev = $isizedev_jsample; + } +} +close(ISIZE); + +#create outdirs +my $cmd = 'mkdir -p ' . catfile($outdir, 'log'); +system($cmd); +my $cmd = 'mkdir -p ' . $ods; +system($cmd); + +#print sbatch file +open(SBATCH, '>', $sbatch_file) or die("Couldn't open $sbatch_file"); +print SBATCH '#!/bin/bash -l' . "\n"; +print SBATCH '#SBATCH -A ' . $projid . "\n"; +print SBATCH '#SBATCH -t ' . $time . "\n"; +print SBATCH '#SBATCH -J tophat' . "\n"; +print SBATCH '#SBATCH -p node -n ' . $threads . "\n"; +print SBATCH '#SBATCH -e ' . $outdir . '/log/tophat.jid_%j.stderr' . "\n"; +print SBATCH '#SBATCH -o ' . $outdir . '/log/tophat.jid_%j.stdout' . "\n"; +print SBATCH '#SBATCH --mail-type=All' . "\n"; +print SBATCH '#SBATCH --mail-user=' . $email . "\n"; + +#set vars +print SBATCH 'module load bioinfo-tools' . "\n"; +print SBATCH 'module load samtools/0.1.18' . "\n"; +print SBATCH 'module load tophat/1.4.0' . "\n"; + +#print cmd +print SBATCH 'tophat --solexa1.3-quals -p ' . $threads . ' -o ' . $outdir . ' --GTF ' . $gtf . " -r '" . $isize . "' --mate-std-dev " . $isizedev . ' ' . $ref . ' ' . $fastqfile_1 . ' ' . $fastqfile_2 . "\n"; +close(SBATCH); + diff --git a/Degner/map/tophat.singleend.gensbatch.pl b/Degner/map/tophat.singleend.gensbatch.pl new file mode 100644 index 0000000..ff4976c --- /dev/null +++ b/Degner/map/tophat.singleend.gensbatch.pl @@ -0,0 +1,66 @@ +#!/usr/bin/perl + +use strict; +use File::Basename; +use File::Spec::Functions; + + +my ($fastqprefix, $outdir, $threads, $gtf, $ref,$ods, $projid, $email, $time, $readlen) = @ARGV; + +#set vars +my $fastqfile_1 = $fastqprefix . '1'; +#my $fastqfile_2 = $fastqprefix . '2' . 
$fastqsuffix; +my $fastqprefix_base = basename($fastqprefix); +my $outdir = catfile($outdir, $fastqprefix_base); +$outdir =~ s/\.$//; + +#set sbatch file +my $sbatch_file = catfile($ods, $fastqprefix_base . '.tophat.sbatch'); +$sbatch_file =~ s/\.\./\./; + +#set insert size (NB: actually inner distance) +my ($isize, $isizedev); +#open(ISIZE, '<', $isizefile) or die("Couldnt open $isizefile, $!\n"); #disabled: $isizefile is not passed to this single-end script +#while(defined(my $line = <ISIZE>)){ +# chomp($line); +# print 'DEBUG: ' . $line . "\n"; + # my ($jsample, $isize_jsample) = split(/\t/, $line); +# print 'DEBUG: jsample: ' . $jsample . "\n"; +# print 'DEBUG: sample: ' . $sample . "\n"; +# my ($isize_jsample, $isize_jsample, $isizedev_jsample) = split(/ /, $line); + #if($jsample =~ $sample){ + # $isize = $isize_jsample; + # $isize = $isize - 2 * $readlen; #Tophat uses the inner distance: isize - 2*readlen +# $isizedev = $isizedev_jsample; + #} +#} +#close(ISIZE); + +#create outdirs +my $cmd = 'mkdir -p ' . catfile($outdir, 'log'); +system($cmd); +my $cmd = 'mkdir -p ' . $ods; +system($cmd); + +#print sbatch file +open(SBATCH, '>', $sbatch_file) or die("Couldn't open $sbatch_file"); +print SBATCH '#!/bin/bash -l' . "\n"; +print SBATCH '#SBATCH -A ' . $projid . "\n"; +print SBATCH '#SBATCH -t ' . $time . "\n"; +print SBATCH '#SBATCH -J tophat' . "\n"; +print SBATCH '#SBATCH -p node -n ' . $threads . "\n"; +print SBATCH '#SBATCH -e ' . $outdir . '/log/tophat.jid_%j.stderr' . "\n"; +print SBATCH '#SBATCH -o ' . $outdir . '/log/tophat.jid_%j.stdout' . "\n"; +print SBATCH '#SBATCH --mail-type=All' . "\n"; +print SBATCH '#SBATCH --mail-user=' . $email . "\n"; + +#set vars +print SBATCH 'module load bioinfo-tools' . "\n"; +print SBATCH 'module load samtools/0.1.18' . "\n"; +print SBATCH 'module load tophat/1.4.0' . "\n"; + +#print cmd +print SBATCH 'tophat --solexa1.3-quals -p ' . $threads . ' -o ' . $outdir . ' --GTF ' . $gtf . " -r '" . $isize . "' --mate-std-dev " . $isizedev . ' ' . $fastqfile_1 . ' ' . 
"\n"; +close(SBATCH); + + From 83a23e7e7198df8797ac057b617a763fd4556332 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 14:07:00 +0200 Subject: [PATCH 06/10] Added new test folder --- test/run.gunzip.sh | 29 +++++ test/test.sh | 193 +++++++++++++++++++++++++++++++++ test/testgsnap for smallseq.sh | 22 ++++ 3 files changed, 244 insertions(+) create mode 100644 test/run.gunzip.sh create mode 100644 test/test.sh create mode 100644 test/testgsnap for smallseq.sh diff --git a/test/run.gunzip.sh b/test/run.gunzip.sh new file mode 100644 index 0000000..b7897f1 --- /dev/null +++ b/test/run.gunzip.sh @@ -0,0 +1,29 @@ +#GSNAP +# I have downloaded the gmap from the homepage and then unziped and extracted +wget http://research-pub.gene.com/gmap/src/gmap-gsnap-2012-05-15.tar.gz +gunzip gmap-gsnap-2012-05-15.tar.gz +tar xvf./* gmap-gsnap-2012-05-15.tar +#Made some changes in the config file +#Specified the paths +./configure +make +make install +# I have downloaded the database and extracted it +wget http://research-pub.gene.com/gmap/genomes/hg19.tar +tar xvf hg19.tar +##optional if need to run with older versions + ln -s hg19.ref153positions hg19.ref12153.positions +#the fasta file used here +wget 'http://bgx.org.uk/software/Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz' +gunzip Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz + +#How to process a genome for GMAP...Started with an example file 'g' +# Ran an example ,downloaded a fasta file and saved as g in utils +#Ran coords.txt +./fa_coords g +#then Ran gmap_setup +./gmap_setup -d hg19 g +# tried the file from gmapdb +./md_coords /bubo/home/h24/alvaj/gmap-2012-05-15/gmapdb/hg19/hg19.contig + + diff --git a/test/test.sh b/test/test.sh new file mode 100644 index 0000000..f3107c3 --- /dev/null +++ b/test/test.sh @@ -0,0 +1,193 @@ +#GSNAP +# I have downloaded the gmap from the homepage and then unziped and extracted +wget 'http://research-pub.gene.com/gmap/src/gmap-gsnap-2012-05-15.tar.gz' +gunzip 
gmap-gsnap-2012-06-12.tar.gz +tar xvf./* gmap-gsnap-2012-06-12.tar +tar -xvf gmap-gsnap-2012-06-12.tar +#Made some changes in the config file +#Specified the paths +./configure +make +make install +# I have downloaded the database and extracted it +wget 'http://research-pub.gene.com/gmap/genomes/hg19.tar' +tar xvf hg19.tar +##optional if need to run with older versions + ln -s hg19.ref153positions hg19.ref12153.positions +#the fasta file used here +wget 'http://bgx.org.uk/software/Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz' +gunzip Homo_sapiens.GRCh37.64.dna.toplevel.ref.fa.gz + +wget 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp135.txt.gz' + +#How to process a genome for GMAP...Started with an example file 'g' +# Ran an example ,downloaded a fasta file and saved as g in utils +#Ran coords.txt +./fa_coords g +#then Ran gmap_setup +./gmap_setup -d hg19 g +# tried the file from gmapdb +./md_coords /bubo/home/h24/alvaj/gmap-2012-05-15/gmapdb/hg19/hg19.contig + + +#Building map files +# here I used an example file 'Ailuropoda_melanoleuca.ailMel1.67.gtf',first I concatinated this file with a utility program gtf_splicesites by his command and stored in a folder at.splicesites +cat Ailuropoda_melanoleuca.ailMel1.67.gtf | /bubo/home/h24/alvaj/gsnap-2012-06-12/util/gtf_splicesites > at.splicesites +## then rename at.splicesites to at1, the folowwing comand +cat at.splicesites |./iit_store -o at1 + + +## Runing GSANP, the following command helps to run the fastaq file towards the genome hg19 +./gsnap -d hg19 x + +# GSNAP gives the difference between site level and intron level splicing based on the format of input file, to perform site level splicing the file should be in a special format which can be created by the folowwing commnad + + + cat Ailuropoda_melanoleuca.ailMel1.67.gtf |/bubo/home/h24/alvaj/gmap-2012-05-15/util/gtf_splicesites >foo.splicesites +# for intron level the format can be created by, +cat Ailuropoda_melanoleuca.ailMel1.67.gtf 
|/bubo/home/h24/alvaj/gmap-2012-05-15/util/gtf_introns >foo.introns +## once it is created you can process for map files by the following command, +cat foo.splicesites | ./iit_store -o splicesitefile + + +## Example for SNP tolerant alignment in GSNAP +# wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz +# unzipped by, +gunzip -c snp130.txt.gz | ./dbsnp_iit -w 3 -e snp130Exceptions.txt.gz > snpfile.txt +## changing the snpfile to .iit format file by, +cat snpfile.txt | /bubo/home/h24/alvaj/gmap-2012-05-15/src/iit_store -o snpfile +## after cpying the itt file into hg19.maps folder , and the programm to find the indictaed file from the .maps folder by the folowwing command(used at1.iit file) +/bubo/home/h24/alvaj/gmap-2012-05-15/src/snpindex -d hg19 -v at1 + +# for intron sites +cat Homo_sapiens.GRCh37.59.gtf |/bubo/home/h24/alvaj/glob/annotation/gsnap/gmap-2012-05-15/util/gtf_introns > snp.introns + +(cat < sample.fa +EOF +) >sbatch.template +cat sample.list | xargs -I% echo cat sbatch.template "| sed 's/sample/"%"/g' >" %.haploref.sbatch >cmds.sh +sh cmds.sh +find . -name '*.haploref.sbatch' | xargs -I% sbatch % + +##****Kalkyl**** +# assigning the job for some other interactive level +interactive -p devel -t 1:00:00 -A b2012046 +# to see om whic node I am runing +srun hostname -s |sort -u +# to create same directory for all the nodes which I am working with + srun -N 4 -n 4 mkdir /scratch/$SLURM_JOB_ID/indata +# Copy indata for my_program to the local directories + srun -N 4 -n 4 cp -r ~/glob/indata/* /scratch/$SLURM_JOB_ID/indata +# to see the queue line of the job +squeue -u alvaj + + +# to rename the file +cat snp.splicesite | awk '{print $1, "chr"$2, $3, $4;}' >snp.splicesite.uscschr +# tried for a short seqeunce from the fastq +/bubo/home/h24/alvaj/gmap-2012-05-15/src/gsnap -d hg19 ./fastq >shortfastaseq +# to copy a .sh file from the local computer to the cloud... 
+scp g.sh alvaj@kalkyl.uppmax.uu.se:/bubo/home/h24/alvaj/glob/annotation/gsnap/g.sh + +/bubo/home/h24/alvaj/gmap-2012-05-15/src/gsnap -d hg19 -s./splicesitesfile /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/1_unstim.readp_1.2.ill.fastq.1.filter.fastq >output1 + +# for paired end rna seq reads +/bubo/home/h24/alvaj/gmap-2012-05-15/src/gsnap -d hg19 -s./splicesitesfile /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/1_unstim.readp_1.2.ill.fastq.1.filter.fastq /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/ 9_unstim.readp_1.2.ill.fastq.1.filter.fastq >output2 + + +(nohup prg args >my.out) >& my.err & +sbatch .script +srun -A b21010 -t 1:00:00 -p devel prg args >my.out + +## +#RAN for small seq and is working fine!!! +annotdir='/bubo/home/h24/alvaj/glob/annotation/gsnap' +outdir='/proj/b2012046/rani/analysis/gsnap' +datadir='/proj/b2012046/rani/data/fastq' +splicefile=${annotdir}/'splicesiteschro' +fastqfile1=${datadir}/test.n100.rp1.fastq +fastqfile2=${datadir}/test.n100.rp2.fastq +refdir=${annotdir}/gmapdb +ref=hg19 +snpfile=dbsnp135 +snpdir=${annotdir}/dbsnp +samdir='/bubo/home/h24/alvaj/glob/annotation/gsnap/samtools/samtools-0.1.6' + +cd $outdir +export PATH=$PATH:/bubo/home/h24/alvaj/opt0/gmap-2012-06-12/bin + +(gsnap -D $refdir -d $ref -A sam -s $splicefile -V $snpdir -v $snpfile --quality-protocol=illumina $fastqfile1 $fastqfile2 | ${samdir}/samtools view -Sb - > ${outdir}/test.n100.bam12) >& test.err12 & + +| ${samdir}/samtools view -Sb test.sam > test.sam +| ${samdir}/samtools view -S sam -b bam +samtools view -bT hg.fa -o xxx.bam xxx.sam +samdir='/bubo/home/h24/alvaj/glob/annotation/gsnap/samtools/samtools-0.1.6' + +## github commit// to push the changes from my computer to github +#when a fatal error occurs such as index lock already running + rm -f ./.git/index.lock + +# github to add file from computer towards repo +git add run.tophat.sh + git commit -m 'The edited tophat.sh file' run.tophat.sh +git remote add origin 
https:https://github.com/alvarani/ASE.git +git push origin master + +#kill all slurm jobs--if you want to specify the number of jobs eg: 'NR >32 {print $1;}' +squeue -u alvaj | grep -v JOB | awk '{print $1;}' | xargs -I% scancel % + + +# stream editor SED an example:to print the lines containg s/1.filter.fastq// and 2.filter.fastq'.... +cat fastq.files | sed 's/1.filter.fastq//' | grep -v '2.filter.fastq' >fastqfiles.prefix + + +# to delete a file with only a particular format within a directory,here to delete the files with .sam format +find . -type f -name "*.sam" -exec rm -f {} \; + + + +# it helps to change the suffix from fastqfiles10.prefix to sample1 and sample2 +cat fastqfiles10.prefix | xargs -I% basename % | xargs -I% echo cat sbatch10.template "| sed 's/sample/"%"/g' >" %gsnap.sbatch >cmds10.sh + + + +# to count the number of files in a directory, where targetdir is the name of directory +ls -1 targetdir | wc -l +# which is more advanced +find Downloads -maxdepth 1 -type f | wc -l + + + +# listing the jobs in human readable form +ls -ltrh +#to convet from sam to bam +samdir= '/bubo/home/h24/alvaj/glob/annotation/gsnap/samtools/samtools-0.1.6/samtools' +outdir='/proj/b2012046/rani/analysis/gsnap' +find . 
-type f -name "*.sam" | ${samdir}samtools view -bS > ${outdir}/samplebam + + +#example sed +sed 's/day/night/' new + +s Substitute command +/../../ Delimiter +day Regular Expression Pattern Search Pattern +night Replacement string , ie replace day with night +# anothe rexample with echno +echo Sunday | sed 's/day/night/' + +# the codes for bam file is in this one ASE +/bubo/home/h24/alvaj/glob/code/ASE + diff --git a/test/testgsnap for smallseq.sh b/test/testgsnap for smallseq.sh new file mode 100644 index 0000000..6316b04 --- /dev/null +++ b/test/testgsnap for smallseq.sh @@ -0,0 +1,22 @@ + + + +#RUN!for small test seq + +annotdir='/bubo/home/h24/alvaj/glob/annotation/gsnap' +splicefile=${annotdir}/'splicesiteschro' +snpfile=dbsnp135 +snpdir=${annotdir}/dbsnp +outdir='/proj/b2012046/rani/analysis/gsnap' +datadir='/proj/b2012046/rani/data/fastq' +fastqfile1=${datadir}/test.n100.rp1.fastq +fastqfile2=${datadir}/test.n100.rp2.fastq +refdir=${annotdir}/gmapdb +ref=hg19 + +cd $outdir +export PATH=$PATH:/bubo/home/h24/alvaj/opt/gmap-2012-06-02/bin +(gsnap -D $refdir -d $ref -A sam -s $splicefile -V $snpdir -v $snpfile --quality-protocol=illumina $fastqfile1 $fastqfile2 >${outdir}/test.n100.sam) >& test.err & + +### worked and given output + From 338afb6536b1c50456acf299180327cd69798e49 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 14:08:11 +0200 Subject: [PATCH 07/10] removed unwanted files --- run.gsnap.sh | 104 --------------------------------------------------- 1 file changed, 104 deletions(-) delete mode 100644 run.gsnap.sh diff --git a/run.gsnap.sh b/run.gsnap.sh deleted file mode 100644 index 7a61d44..0000000 --- a/run.gsnap.sh +++ /dev/null @@ -1,104 +0,0 @@ -#! 
/bin/bash -l -# download the software and DB -wget 'http://research-pub.gene.com/gmap/src/gmap-gsnap-2012-06-06.tar.gz' -wget 'http://research-pub.gene.com/gmap/genomes/hg19.tar' - -gsnapexecdir='/bubo/home/h24/alvaj/opt0/gmap-2012-06-06' - -#SNPs -### -# downloaded the needed file snp134,snp135common, -cd $snpdir -wget 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp135Common.txt.gz' -## done again saved in annotation/gsnap for latest version of gsanp -wget 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp135.txt.gz' - -#reformat---from here I have to run for the new version -gunzip -c snp135Common.txt.gz | ${gsnapexecdir}/dbsnp_iit -w 3 > ${snpfile}.txt & -gunzip -c snp135Common.txt.gz | ${gsnapexecdir}/dbsnp_iit -w 3 > dbsnp135.txt & -## done and save in annotation/gsnap |I used it here for new version) - -(cat ${snpfile}.txt |${gsnapexecdir}/iit_store -o $snpfile >${snpfile}.iitstore.out) >& ${snpfile}.iitstore.err & -##done for new version - -(snpindex -D $refdir -d $ref -V $snpdir -v $snpfile ${snpfile}.iit >snpindex.out) >& snpindex.err & -/bubo/home/h24/alvaj/opt0/gmap-2012-06-06/src/snpindex - -### -# Splice sites , to generate a splice site index---***splicesite*** -### -cat Homo_sapiens.GRCh37.59.gtf |${gsnapexecdir}/gtf_splicesites > snp.splicesiteschr -#(---done) - -# renamed file is splicesitechr -cat snp.splicesiteschr | awk '{print $1, "chr"$2, $3, $4;}' >splicesiteschr - - -# renamed file is splicesitechr - -## processing it to map file (changing to .iit file)---done for snp.splicesitechr -cat splicesiteschr | ${gsnapexecdir}/iit_store -o splicesiteschro -## done - -### -## Run gsnap -### - -#set path for for each of these variables -gsnapexecdir='/bubo/home/h24/alvaj/opt0/gmap-2012-06-12' -annotdir='/bubo/home/h24/alvaj/glob/annotation/gsnap' -scriptdir='/proj/b2012046/rani/scripts/gsnap/test' -fastqdir='/proj/b2012046/rani/data/synt/fastqfilt/oneGchunks' -outdir='/proj/b2012046/rani/analysis/gsnap' 
-splicefile=${annotdir}/'splicesiteschro' -refdir=${annotdir}/gmapdb -ref=hg19 -snpdir=${annotdir}/dbsnp -snpfile=dbsnp135 -projid='b2012046' -email='alva.rani@scilifelab.se' - -#Rename fastq files -cd $fastqdir -ln -s /proj/b2012046/edsgard/ase/sim/data/synt/fastqfilt/oneGchunks/*.fastq . -#append readp_1 as suffix -ls -1 *.readp_1.*.fastq | xargs -I% mv % %.readp_1 -#rm first occurence of readp_1 -rename .readp_1.part .part *.readp_1 -#append readp_2 as suffix -ls -1 *.readp_2.*.fastq | xargs -I% mv % %.readp_2 -#rm first occurence of readp_2 -rename .readp_2.part .part *.readp_2 - -cd $scriptdir -find $fastqdir -maxdepth 1 -name '*fastq.readp_1' | sed 's/.readp_1//' >fastq.files.prefix - -samples=(`cat fastq.files.prefix`) - -for fastqfile in ${samples[@]} -do -sbatchfile=`basename $fastqfile` -echo $sbatchfile -(cat <${outdir}/${sbatchfile}.bam -EOF -)>${sbatchfile}.sbatch -done -find . -name '*.fastq.sbatch' | xargs -I% sbatch % -#status: submitted - -############ - From 2a614ec77f6ef1247deea90add7ebe755e9149e6 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 14:08:57 +0200 Subject: [PATCH 08/10] Added new folder --- Monte/run.Monte.sh | 66 +++++++++++++++++++++++++++++++++++ Monte/run.monteWorkflow.sh | 71 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 Monte/run.Monte.sh create mode 100644 Monte/run.monteWorkflow.sh diff --git a/Monte/run.Monte.sh b/Monte/run.Monte.sh new file mode 100644 index 0000000..dcfda5e --- /dev/null +++ b/Monte/run.Monte.sh @@ -0,0 +1,66 @@ +#! 
/bin/bash -l +#Map with tophat +#DEP: tophat.gensbatch.pl + + +### +#Tophat +### +#Longest run: 5h38min, 6G file for each readpair: 8_LPS.readp_1.2.ill.fastq +email='alva.rani@scilifelab.se' +projid='b2012046' +#execdir='/bubo/home/h24/alvaj/glob/code/ASE/map' +fastqdir='/proj/b2012046/edsgard/ase/sim/data/degner' +#fastqdir='/proj/b2012046/rani/data/fastq' +#fastqfile='testsingle.bqB.fastq' +outdir='/proj/b2012046/rani/analysis/monte' +sbatchdir='/proj/b2012046/rani/scripts/monte' + +#Vars. +#threads='8' +annotdir='/bubo/home/h26/edsgard/glob/annotation/human/' +gtf=${annotdir}/'Homo_sapiens.GRCh37.59.gtf' +ref='/bubo/home/h26/edsgard/glob/annotation/bowtie/concat.fa' +#time='01:00:00' + +#Longest run: 7h10min: Input: ~7.3G per readpair fastq file + +#readlen='100' +#isize='175' +#isizedev='75' + +#very rough approx from the bioanalyzer plots. Also, we use trimming which further adds to the deviation. + +cd $sbatchdir +#rm cmds.sh +#samples=(`cat samples.list`) + +#for sample in ${samples[@]} +#do +#find $fastqdir -maxdepth 1 -name ${samples}'*' | sort -u >fastqfiles.list + +(cat <${outdir}/.tophat.sbatch +EOF +) > sbatch.tophat + +cat sbatch.tophat >tophat.sbatch +sbatch tophat.sbatch + +#find . -name 'sbatch.tophat' | xargs -I% sbatch % + + + +#submitted diff --git a/Monte/run.monteWorkflow.sh b/Monte/run.monteWorkflow.sh new file mode 100644 index 0000000..e42cb86 --- /dev/null +++ b/Monte/run.monteWorkflow.sh @@ -0,0 +1,71 @@ + +#Simulate data a la Degner +#DEP: get.overlaps.v3.perl (from Degner) + + +### +#Format input variants +### +#id chr pos strand ref alt +#EX: 10.100000020 10 100000020 + A G + +vcfdir='/proj/b2012046/edsgard/ase/sim/data/varcalls' +vars=${vcfdir}/vars2degner.dp10.txt +cat ${vcfdir}/*.hetvars.alleles | sort -u | awk -F'\t' '{print $1"."$2, $1, $2, "+", $3, $4;}' >$vars + +#Check: +wc -l vars2degner.dp10.txt #112901 + + +### +#Create simulated read set +### +#TBD: Add noise. +#TBD: NB: Disregards if >1 SNP covering a read! 
+#TBD: PE reads not simulated, only SEs! +degneroutdir='/proj/b2012046/edsgard/ase/sim/data/degner' +fastqoutfile=${degneroutdir}/mpileup.dp10.len100.bqB.fastq +vcfdir='/proj/b2012046/edsgard/ase/sim/data/varcalls' +vars=${vcfdir}/vars2degner.dp10.txt +projid='b2010035' +time='4:57:00' +logdir=${degneroutdir}/log +exedir='/bubo/home/h26/edsgard/glob/code/ase/degner' +refdir='/bubo/home/h26/edsgard/glob/annotation/bowtie' +readlen='100' +readqual='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB' + +mkdir -p $logdir +srun -A $projid -p node -t $time -e ${logdir}/sim.err -o $fastqoutfile perl ${exedir}/get.overlaps.v3.perl $vars $refdir $readlen $readqual & +#Status: submitted + + +### +#Map with Tophat +### +#See map/run.tophat.sh +#You'll need to amend it such that it takes single end reads instead of paired end reads! + + +### +#rmDups +### +#Skip rmdups + + +### +#Merge and sort bam files +### +#TBD: Not sure that you need this. But in that case see bam/bamproc.sh + + +### +#MPILEUP (no varcall) +### +#See basecount/basecount.sh + + +### +#Mapping bias modification a la Montgomery +### +#I'll get back to you with code for that once you've reached this point:) From 00876d7f66562f26e663acc245e5fae16cdc2388 Mon Sep 17 00:00:00 2001 From: alvarani Date: Fri, 28 Sep 2012 14:17:44 +0200 Subject: [PATCH 09/10] The perl script for masking reference --- Degner/map/masked.degner.pl | 88 +++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 Degner/map/masked.degner.pl diff --git a/Degner/map/masked.degner.pl b/Degner/map/masked.degner.pl new file mode 100644 index 0000000..cff287a --- /dev/null +++ b/Degner/map/masked.degner.pl @@ -0,0 +1,88 @@ +#!/bin/perl +use strict; +use warnings; +no warnings ('uninitialized', 'substr'); + + +my $snpfile = $ARGV[0]; +my $sequencefile = $ARGV[1]; +my $chromosomename=$ARGV[2]; +#my $chr; + +open(SNP, "$snpfile"); +open(SEQUENCE,"genome.fa"); 
+open(NRSEQ, ">$chromosomename"); + +print_chromosome(); + +sub print_chromosome { +my @line; +my $line; +my $plusbase; +my $otherbase; +my $test_string; +my $NRsequence; +my $chr; my $ky; +my $seq; my $chr_snp; +my %seq; +#; +while (){ +chomp; +$NRsequence = $_; + $chr = $1 if ($NRsequence =~ />(\d+)\s(.*)/) || ($NRsequence =~ />(\w+)\s(.*)/ ); +$seq{$chr}.=$NRsequence unless ($NRsequence =~ />/); +} + +#close(SEQUENCE); +my $a = keys %seq; +print "keys:",$a; +#print $_,"\n\n" for keys %seq; + +#; +while($line=){ +@line = split(/\t/, $line); +#print @line, "\n"; +#if($line[3] eq "+"){ +$chr_snp =$line[0]; +$plusbase=$line[4]; +$otherbase=$line[3]; + +print "Ref:$plusbase,Alt:$otherbase\n"; + +#} +#elsif($line[3] eq "-"){ +#$plusbase = $line[4]; +#$otherbase=$line[5]; + +#$plusbase =~ tr/ATGCYRKMBDHVatgcyrkmbdhv/TACGRYMKVHDBtacgrymkvhdb/; +#$otherbase =~ tr/ATGCYRKMBDHVatgcyrkmbdhv/TACGRYMKVHDBtacgrymkvhdb/; +#} + +#print "$line[2]\n"; # +print(substr($seq{$chr_snp}, $line[2]-5, 10), "\n"); +$test_string = substr($seq{$chr_snp}, $line[2]-1, 1); +if( uc($plusbase) eq uc($test_string) ) { +print "OK\n"; +if(uc($plusbase) ne "C" && uc($otherbase) ne "C"){ +substr $seq{$chr_snp}, $line[2]-1, 1, "C"; +} elsif(uc($plusbase) ne "G" && uc($otherbase) ne "G"){ +substr $seq{$chr_snp}, $line[2]-1, 1, "G"; +} elsif(uc($plusbase) ne "A" && uc($otherbase) ne "A"){ +substr $seq{$chr_snp}, $line[2]-1, 1, "A"; +} elsif(uc($plusbase) ne "T" && uc($otherbase) ne "T"){ +substr $seq{$chr_snp}, $line[2]-1, 1, "T"; +} +} elsif($test_string eq "N"){ +print "Base was N in reference\n"; +} +else { +die "THE BASE GIVEN WAS NOT THE REFERENCE BASE, ABORTING MISSION!"; +print "THE BASE GIVEN WAS NOT THE REFERENCE BASE, ABORTING MISSION!"; +} +print(substr($seq{$chr_snp}, $line[2]-5, 10),"\n\n"); +} +#print NRSEQ (">chr\n$seq{$chr_snp}\n"); +#print NRSEQ ">chr",$_,"\n",$seq{$_},"\n" for each %seq; +foreach $ky (sort keys %seq) {print NRSEQ ">chr$ky\n$seq{$ky}\n";} +} + From 
44f857a956839cf7daf88f09c72aa585a27a3624 Mon Sep 17 00:00:00 2001 From: edajeda Date: Mon, 8 Oct 2012 14:44:35 +0300 Subject: [PATCH 10/10] Fixed generation of filtered variant file --- GSNAP/basecount/basecount.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/GSNAP/basecount/basecount.sh b/GSNAP/basecount/basecount.sh index 2b162e4..93b655d 100644 --- a/GSNAP/basecount/basecount.sh +++ b/GSNAP/basecount/basecount.sh @@ -29,14 +29,16 @@ ls -1 *.het.pos | xargs -I% echo cat % '|' sed "'s/\./ /' >"%.hetvars >cmds.sh rename genome.DP.FORMAT.het.pos. . *.hetvars #Dump "chr,pos,ref,alt" for these vars. (Used as input in snv.ase.R since the alt allele is seldom biallelic when running mpileup.allelecount.pl below.) -find $vcdir -maxdepth 1 -name '*.vcf' | grep -v 'allbams' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh +#find $vcdir -maxdepth 1 -name '*.vcf' | grep -v 'allbams' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh +find $vcdir -maxdepth 1 -name 'allbams.vcf' | xargs -I% echo "awk -F'\t' '{print \$1\".\"\$2, \$1, \$2, \$4, \$5;}' % | grep -v '^#' | sort -k 1,1 >"%.pos >cmds.sh sh cmds.sh ls -1 *.het.pos | xargs -I% echo sort % '>'%.sorted >cmds.sh sh cmds.sh ls -1 *.het.pos.sorted | sed 's/.genome.DP.FORMAT.het.pos.sorted//' >samples.list -cat samples.list | xargs -I% echo join -j1 %.genome.DP.FORMAT.het.pos.sorted %.genome.DP.FORMAT.het.pos '>' %.hetvars.alleles >cmds.sh +cat samples.list | xargs -I% echo join -j1 %.genome.DP.FORMAT.het.pos.sorted allbams.vcf.pos '>' %.hetvars.alleles >cmds.sh srun -p devel -t 1:00:00 -A b2012046 sh cmds.sh & ls -1 *.hetvars.alleles | xargs -I% echo cut -d"' '" -f2-5 % "| tr ' ' '\t' >"%.tmp >cmds.sh +sh cmds.sh rename .hetvars.alleles.tmp .hetvars.alleles *.hetvars.alleles.tmp