forked from raylim/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfixTargetedSeqSam.pl
More file actions
124 lines (108 loc) · 2.54 KB
/
fixTargetedSeqSam.pl
File metadata and controls
124 lines (108 loc) · 2.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env perl
# zero base quality scores for G->T and C->A mutation artefacts from sorted sam file
use strict;
use warnings;
use Getopt::Std;
my %opt;
getopts('h', \%opt);
my $usage = <<ENDL;
perl fixTargetedSeqSam.pl < [namesorted sam]
ENDL
sub HELP_MESSAGE {
print STDERR $usage;
exit(1);
}
HELP_MESSAGE if $opt{h};
my %gtFilter = (
"xT" => 1,
"xC" => 1);
my %caFilter = (
"xT" => 1,
"xC" => 1);
#my %gtFilter = (
#"TxG" => 1,
#"TxA" => 1,
#"GxA" => 1,
#"CxG" => 1,
#"CxA" => 1,
#"GxA" => 1,
#"AxA" => 1
#);
#my %gtFilter = (
#"TxGT" => 1,
#"TxAT" => 1,
#"TxAT" => 1,
#"TxA" => 1,
#"CxG" => 1,
#"CxA" => 1,
#"AxA" => 1,
#"AxG" => 1
#);
#my %caFilter = (
#"TxT" => 1,
#"TxG" => 1,
#"TxC" => 1,
#"TxA" => 1,
#"CxG" => 1,
#"CxA" => 1
#);
# extract ref -> alt mismatches and contexts using MD
sub parseMD {
my ($seq, $md) = @_;
my $origSeq = $seq;
#my $origMD = $md;
#print "md: $md\n";
my @refs;
my @alts;
my @contexts;
my @posns;
my $i = 0;
do {
$md =~ s/^([0-9]+)//;
my $nmatch = $1;
$i += $nmatch;
$seq = substr $seq, $nmatch;
if ($md =~ s/^([ATCGN])//) {
my $ref = $1;
my $alt = substr $seq, 0, 1;
$i++;
push @posns, $i;
# pad sequence
my $ss = " " . $origSeq . " " ;
substr($ss, $i, 1) = "x";
my $context = substr $ss, $i, 2;
$seq = substr $seq, 1;
#print "$ref $alt\n";
push @refs, $ref;
push @alts, $alt;
push @contexts, $context;
}
$md =~ s/^\^[NATCG]+//;
} while ($md ne "");
(\@refs, \@alts, \@posns, \@contexts);
}
while (defined(my $line = <>)) {
print $line and next if $line =~ /^\@/;
chomp $line;
my @F = split /\t/, $line;
my $seq = $F[9];
my $bq = $F[10];
my @optF = @F[11..$#F];
my @mdFs = grep /^MD:Z:/, @optF;
next if (@mdFs < 1);
my $mdF = $mdFs[0];
$mdF =~ s/^MD:Z://;
my ($refs, $alts, $posns, $contexts) = parseMD($seq, $mdF);
for my $i (0..$#$refs) {
if ($refs->[$i] eq "G" && $alts->[$i] eq "T" && exists $gtFilter{$contexts->[$i]}) {
#print substr($seq, $posns->[$i] - 1, 1) . "\n";
substr($bq, $posns->[$i] - 1, 1) = "!";
# print $contexts->[$i] . "\n";
}
if ($refs->[$i] eq "C" && $alts->[$i] eq "A" && exists $caFilter{$contexts->[$i]}) {
substr($bq, $posns->[$i] - 1, 1) = "!";
}
}
$F[10] = $bq;
print join("\t", @F) . "\n";
}