Skip to content

Commit 7da0359

Browse files
committed
Added check for missing hits in uniprot lists
1 parent 96fc738 commit 7da0359

File tree

1 file changed

+32
-5
lines changed

1 file changed

+32
-5
lines changed

Function_prediction/parse_annotators.pl

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/perl
22
## Pombert Lab, IIT, 2020
33
my $name = 'parse_annotators.pl';
4-
my $version = '1.7';
4+
my $version = '1.7a';
55
my $updated = '2021-06-29';
66

77
use strict; use warnings; use Getopt::Long qw(GetOptions);
@@ -16,6 +16,7 @@
1616
- BLASTP/DIAMOND searches against SwissProt/trEMBL databases
1717
- BLASTP/DIAMOND searches against reference organism(s)
1818
- KofamKOALA/GhostKOALA/BlastKOALA searches against KEGG
19+
- dbCAN2 searches against CAZy
1920
2021
USAGE ${name} \\
2122
-q BEOM2.proteins.queries \\
@@ -90,7 +91,7 @@
9091
## parsing step
9192
my %references;
9293
unless (scalar(@rblist) == scalar(@rbblast)){
93-
die "[E] the number of reference feature lists do not equal the number of reference blast files\n";
94+
die "ERROR: the number of reference feature lists does not equal the number of reference blast files\n";
9495
}
9596
else {
9697
for (my $i = 0; $i < scalar(@rblist); $i++){
@@ -106,7 +107,6 @@
106107
## Using a double pass to optimize memory usage and reduce the size of the hash
107108
my %sprot;
108109
open SB, "<", "$spblast" or die "Can't open $spblast: $!\n";
109-
my %sphits;
110110
while (my $line = <SB>){
111111
chomp $line;
112112
my @cols = split("\t", $line);
@@ -126,6 +126,7 @@
126126
close SP;
127127

128128
open SB, "<", "$spblast" or die "Can't open $spblast: $!\n";
129+
my %sphits;
129130
while (my $line = <SB>){
130131
chomp $line;
131132
my @cols = split("\t", $line);
@@ -136,17 +137,29 @@
136137
elsif ( $sprot{$hit} =~ /uncharacterized/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
137138
elsif ( $sprot{$hit} =~ /hypothetical/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
138139
elsif ( $sprot{$hit} =~ /predicted protein/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
139-
else{
140+
elsif ( $sprot{$hit} eq '1'){ next; } ## Checking if entry is missing from $splist (value still the '1' placeholder), if so move to next hit
141+
else {
140142
$sphits{$query}[0] = $sprot{$hit};
141143
$sphits{$query}[1] = $evalue;
142144
}
143145
}
144146
close SB;
145147

148+
## Checking for discrepancies
149+
my $num = scalar (keys %sprot);
150+
my $match = 0;
151+
foreach (keys %sprot){
152+
if ($sprot{$_} eq '1'){
153+
if ($verbose) { print "$_ is missing from $splist\n"; }
154+
$match++;
155+
}
156+
}
157+
print "\nSwissProt hits = $num\n";
158+
print "SwissProt hits missing from $splist = $match\n";
159+
146160
## Parsing TREMBL blast.6
147161
## Using a double pass to optimize memory usage and reduce the size of the hash
148162
my %trembl;
149-
my %tbhits;
150163
open TBB, "<", "$tbblast" or die "Can't open $tbblast: $!\n";
151164
while(my $line = <TBB>){
152165
chomp $line;
@@ -167,6 +180,7 @@
167180
close TB;
168181

169182
open TBB, "<", "$tbblast" or die "Can't open $tbblast: $!\n";
183+
my %tbhits;
170184
while (my $line = <TBB>){
171185
chomp $line;
172186
my @cols = split("\t", $line);
@@ -177,13 +191,26 @@
177191
elsif ( $trembl{$hit} =~ /uncharacterized/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
178192
elsif ( $trembl{$hit} =~ /hypothetical/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
179193
elsif ( $trembl{$hit} =~ /predicted protein/i ){ next; } ## Discarding uninformative BLAST/DIAMOND hits
194+
elsif ( $trembl{$hit} eq '1'){ next; } ## Checking if entry is missing from $tblist, if so move to next hit
180195
else {
181196
$tbhits{$query}[0] = $trembl{$hit};
182197
$tbhits{$query}[1] = $evalue;
183198
}
184199
}
185200
close TBB;
186201

202+
## Checking for discrepancies
203+
my $num2 = scalar (keys %trembl);
204+
my $match2 = 0;
205+
foreach (keys %trembl){
206+
if ($trembl{$_} eq '1'){
207+
if ($verbose) { print "$_ is missing from $tblist\n"; }
208+
$match2++;
209+
}
210+
}
211+
print "\nTrEMBL hits = $num2\n";
212+
print "TrEMBL hits missing from $tblist = $match2\n\n";
213+
187214
my $time_taken = time - $tstart;
188215
print "$time: Finished obtaining annotations for $splist and $tblist in $time_taken seconds.\n";
189216

0 commit comments

Comments
 (0)