-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathcontiguity.pl
More file actions
executable file
·93 lines (72 loc) · 2.25 KB
/
contiguity.pl
File metadata and controls
executable file
·93 lines (72 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/perl
my $usage="
Calculates transcriptome contiguity as per
Martin and Wang, Nat Rev Genetics 12, 671-682, 2012:
Contiguity is the percentage of transcriptome-matching reference
sequences aligned over [threshold] fraction of their length against
the longest matching contig in the assembly.
For example, contiguity of 0.5 at the threshold 0.75 implies that
50% of reference sequences that were matched by the transcriptome
contigs were aligned over 75% or more of their length against
the longest matching contig.
Arguments:
hits=[filename] : *_hits.tab output file from CDS_extractor_v2.pl
threshold=[from 0 to 1] : aligned length threshold for contiguity calculation;
default 0.75
Output:
prints the calculated contiguity to STDOUT
generates a pdf historgam of aligned proportions (calls Rscript)
Mikhail Matz, matz\@utexas.edu
October 2014
";
my $hits;
if ("@ARGV"=~/hits=(\S+)/) { $hits=$1; } else { die $usage;}
my $thresh=0.75;
if ("@ARGV"=~/threshold=(\S+)/) { $thresh=$1; }
open HITS, $hits or die "cannot open hits file $hits\n";
my %aligned={};
my @hline;
my $fl=1;
while (<HITS>) {
if ($fl==1) { $fl-- and next;}
chomp;
@hline=split(/\t/,$_);
if ($hline[4]==0) { warn "zero value at pos 4: $_\n" and next;}
my $cov=sprintf("%.3f",$hline[2]/$hline[4]);
my $hit=$hline[3];
push @{$aligned{$hit}}, $cov;
}
my @hnames=keys(%aligned);
my %bestalig={};
my $contig;
my $rfname=$hits."_contiguity.R";
open RF, ">$rfname";
print {RF} "aligns=c(";
foreach $hit (@hnames) {
next if ($hit=~/HASH/);
my $maxcov=0;
foreach $cov (@{$aligned{$hit}}) {
if ($cov>$maxcov) {
if ($cov>1.3){$cov=1.3;}
$maxcov=$cov;
$bestalig{$hit}=$cov;
}
}
if ($bestalig{$hit}>=$thresh) { $contig++;}
if ($hit eq $hnames[0]) { print {RF} "$bestalig{$hit}";}
else { print {RF} ",$bestalig{$hit}";}
}
$contig=sprintf("%.2f",$contig/($#hnames+1));
print "\ncontiguity at $thresh threshold: $contig\n\n";
print {RF} ")
pdf(\"contiguity_$hits.pdf\", width=4,height=4.5)
hist(aligns,xlim=c(-0.2,1.3),breaks=30,
bty=\"n\",
main=paste(\"Contiguity at \",$thresh,\" threshold: \",$contig,sep=\"\"),
xlab=\"reference CDS coverage\",
mgp=c(2.3,1,0)
)
dev.off()
";
close RF;
system("Rscript $rfname");