From 1a89366b93a26a97e9bd889d557faac8d892ce40 Mon Sep 17 00:00:00 2001 From: Bradford Powell Date: Fri, 3 Apr 2015 07:19:21 -0400 Subject: [PATCH] avoid grepping for each line... --- ExCID_v2.0/check_HGNC_individual_VEGADB.pl | 39 ++++++++++++---------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/ExCID_v2.0/check_HGNC_individual_VEGADB.pl b/ExCID_v2.0/check_HGNC_individual_VEGADB.pl index 4e93a61..d516def 100755 --- a/ExCID_v2.0/check_HGNC_individual_VEGADB.pl +++ b/ExCID_v2.0/check_HGNC_individual_VEGADB.pl @@ -6,8 +6,26 @@ my $HGNC = $ARGV[1]; my $VEGA_HGNC_names = $ARGV[2]; -open(my $fh,"<$annotated_index") or die $!; +my %words_to_genes = (); # not a perfect index, but will do the same as "grep -w" +open(my $hgnc_fh, "<$HGNC") or die $!; +while (my $line = <$hgnc_fh>) { + chomp $line; + my @row = split(/[\s,]/, $line); + my $gene = shift @row; + map { $words_to_genes{$_} = $gene } @row; +} +close $hgnc_fh or die $!; + +my %vega_index = (); +open(my $vega_fh, "<$VEGA_HGNC_names") or die $!; +while (my $line = <$vega_fh>) { + chomp $line; + my ($a, $b) = split "\t", $line; + $vega_index{$b} = $a; +} +close $vega_fh or die $!; +open(my $fh,"<$annotated_index") or die $!; while (my $line = <$fh>) { @@ -17,23 +35,10 @@ my @transcript_ID_tmp_split = split("_exon_",$ID); my $transcript_ID = $transcript_ID_tmp_split[0]; - my @grep2 = `grep -w "$transcript_ID" $VEGA_HGNC_names `; - if (scalar(@grep2) > 0) { - my @grep_tmp = split("\t",$grep2[0]); - chomp($grep_tmp[0]); - $transcript_ID= $grep_tmp[0]; - } - - my @grep = `grep -w "$transcript_ID" $HGNC `; + $transcript_ID = $vega_index{$transcript_ID} || $transcript_ID; + my $gene_name = $words_to_genes{$transcript_ID} || '.'; - if (scalar(@grep) == 1) { - my @tmp = split("\t",$grep[0]); - my $gene_name = $tmp[0]; - print "$chr\t$start\t$Stop\t$gene\t$ID\t$gene_name\n"; - }else{ - print "$chr\t$start\t$Stop\t$gene\t$ID\t.\n"; - # print STDERR "$chr\t$start\t$Stop\t$gene\t$ID\t.\n"; - } + print "$chr\t$start\t$Stop\t$gene\t$ID\t$gene_name\n"; }