|
1 | 1 | #!/usr/bin/perl |
2 | 2 | ## Pombert Lab, IIT, 2020 |
3 | 3 | my $name = 'parse_annotators.pl'; |
4 | | -my $version = '1.7'; |
| 4 | +my $version = '1.7a'; |
5 | 5 | my $updated = '2021-06-29'; |
6 | 6 |
|
7 | 7 | use strict; use warnings; use Getopt::Long qw(GetOptions); |
|
16 | 16 | - BLASTP/DIAMOND searches against SwissProt/trEMBL databases |
17 | 17 | - BLASTP/DIAMOND searches against reference organism(s) |
18 | 18 | - KofamKOALA/GhostKOALA/BlastKOALA searches against KEGG |
| 19 | + - dbCAN2 searches against CAZy |
19 | 20 |
|
20 | 21 | USAGE ${name} \\ |
21 | 22 | -q BEOM2.proteins.queries \\ |
|
90 | 91 | ## parsing step |
91 | 92 | my %references; |
92 | 93 | unless (scalar(@rblist) == scalar(@rbblast)){ |
93 | | - die "[E] the number of reference feature lists do not equal the number of reference blast files\n"; |
| 94 | + die "ERROR: the number of reference feature lists does not equal the number of reference blast files\n"; |
94 | 95 | } |
95 | 96 | else { |
96 | 97 | for (my $i = 0; $i < scalar(@rblist); $i++){ |
|
106 | 107 | ## Using a double pass for memory optimization and reduce the size of the hash |
107 | 108 | my %sprot; |
108 | 109 | open SB, "<", "$spblast" or die "Can't open $spblast: $!\n"; |
109 | | -my %sphits; |
110 | 110 | while (my $line = <SB>){ |
111 | 111 | chomp $line; |
112 | 112 | my @cols = split("\t", $line); |
|
126 | 126 | close SP; |
127 | 127 |
|
128 | 128 | open SB, "<", "$spblast" or die "Can't open $spblast: $!\n"; |
| 129 | +my %sphits; |
129 | 130 | while (my $line = <SB>){ |
130 | 131 | chomp $line; |
131 | 132 | my @cols = split("\t", $line); |
|
136 | 137 | elsif ( $sprot{$hit} =~ /uncharacterized/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits |
137 | 138 | elsif ( $sprot{$hit} =~ /hypothetical/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits |
138 | 139 | elsif ( $sprot{$hit} =~ /predicted protein/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits |
139 | | - else{ |
| 140 | + elsif ( $sprot{$hit} eq '1'){ next; } ## Checking if entry is missing from $spblast, if so move to next hit |
| 141 | + else { |
140 | 142 | $sphits{$query}[0] = $sprot{$hit}; |
141 | 143 | $sphits{$query}[1] = $evalue; |
142 | 144 | } |
143 | 145 | } |
144 | 146 | close SB; |
145 | 147 |
|
| 148 | +## Checking for discrepancies |
| 149 | +my $num = scalar (keys %sprot); |
| 150 | +my $match = 0; |
| 151 | +foreach (keys %sprot){ |
| 152 | + if ($sprot{$_} eq '1'){ |
| 153 | + if ($verbose) { print "$_ is missing from $splist\n"; } |
| 154 | + $match++; |
| 155 | + } |
| 156 | +} |
| 157 | +print "\nSwissProt hits = $num\n"; |
| 158 | +print "SwissProt hits missing from $splist = $match\n"; |
| 159 | + |
146 | 160 | ## Parsing TREMBL blast.6 |
147 | 161 | ## Using a double pass for memory optimization and reduce the size of the hash |
148 | 162 | my %trembl; |
149 | | -my %tbhits; |
150 | 163 | open TBB, "<", "$tbblast" or die "Can't open $tbblast: $!\n"; |
151 | 164 | while(my $line = <TBB>){ |
152 | 165 | chomp $line; |
|
167 | 180 | close TB; |
168 | 181 |
|
169 | 182 | open TBB, "<", "$tbblast" or die "Can't open $tbblast: $!\n"; |
| 183 | +my %tbhits; |
170 | 184 | while (my $line = <TBB>){ |
171 | 185 | chomp $line; |
172 | 186 | my @cols = split("\t", $line); |
|
177 | 191 | elsif ( $trembl{$hit} =~ /uncharacterized/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits |
178 | 192 | elsif ( $trembl{$hit} =~ /hypothetical/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits |
179 | 193 | elsif ( $trembl{$hit} =~ /predicted protein/i ){ next; } ## Discarding uninformative BLAST/DIAMOND hits |
| 194 | + elsif ( $trembl{$hit} eq '1'){ next; } ## Checking if entry is missing from $tblist, if so move to next hit |
180 | 195 | else { |
181 | 196 | $tbhits{$query}[0] = $trembl{$hit}; |
182 | 197 | $tbhits{$query}[1] = $evalue; |
183 | 198 | } |
184 | 199 | } |
185 | 200 | close TBB; |
186 | 201 |
|
| 202 | +## Checking for discrepancies |
| 203 | +my $num2 = scalar (keys %trembl); |
| 204 | +my $match2 = 0; |
| 205 | +foreach (keys %trembl){ |
| 206 | + if ($trembl{$_} eq '1'){ |
| 207 | + if ($verbose) { print "$_ is missing from $tblist\n"; } |
| 208 | + $match2++; |
| 209 | + } |
| 210 | +} |
| 211 | +print "\nTrEMBL hits = $num2\n"; |
| 212 | +print "TrEMBL hits missing from $tblist = $match2\n\n"; |
| 213 | + |
187 | 214 | my $time_taken = time - $tstart; |
188 | 215 | print "$time: Finished obtaining annotations for $splist and $tblist in $time_taken seconds.\n"; |
189 | 216 |
|
|
0 commit comments