-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_problems.pl
More file actions
executable file
·141 lines (124 loc) · 3.23 KB
/
check_problems.pl
File metadata and controls
executable file
·141 lines (124 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/perl
## Pombert Lab, IIT, 2021
use strict; use warnings; use Getopt::Long qw(GetOptions); use File::Basename;
## Script identity; these three values are interpolated into the usage text below
my $name = "check_problems.pl";
my $version = "0.4.1";
my $updated = '2021-10-14';
## Usage/help text. The heredoc is double-quoted, so \${name}, \${version} and \${updated}
## are interpolated and each doubled backslash renders as a single backslash, which
## makes the COMMAND example display as a multi-line shell command
my $usage = <<"EXIT";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS This script is used to notify the user of protein abnormalities, such as a missing
start methionines and internal stop codons. This script can also run EMBLtoFeatures.pl to
update the .prot files before checking for these abnormalities.
COMMAND ${name} \\
-p *.prot \\
-o ProteinCheck.log \\
-u \\
-v
OPTIONS
-p (--prot) FASTA files (.prot) to be checked for abnormalities
-o (--out) Print the output to a log file
-u (--update) Update .prot files using EMBLtoFeatures.pl
-v (--verb) Add verbosity
EXIT
## Show the usage message and stop when the script is called without arguments
die "\n$usage\n\n" unless @ARGV;

## Command-line options
my @prot_files;    ## -p: one or more .prot FASTA files to check
my $out;           ## -o: optional log file that receives a copy of the report
my $update;        ## -u: run EMBLtoFeatures.pl before checking
my $verb;          ## -v: verbose output

## GetOptions() prints its own diagnostics and returns false on invalid options;
## stop with the usage message instead of carrying on with half-parsed settings
GetOptions(
    'p|prot=s@{1,}' => \@prot_files,
    'o|out=s' => \$out,
    'u|update' => \$update,
    'v|verb' => \$verb
) or die "\n$usage\n\n";

## Nothing can be checked without at least one .prot file
die "[E] No .prot file provided with -p (--prot)\n\n$usage\n\n" unless @prot_files;

## The log file is created only once the options are known to be usable,
## so a bad command line no longer leaves an empty log behind
if ($out){
    open OUT, ">", $out or die "Can't create $out: $!\n";
}
## EMBLtoFeatures.pl is expected to sit in the same directory as this script
my ($script_name,$script_dir) = fileparse($0);
my $script = $script_dir."/EMBLtoFeatures.pl";

## Runs EMBLtoFeatures.pl if flag update is on
## For every .prot file, the matching <basename>.embl file is looked up in the
## current working directory and handed to EMBLtoFeatures.pl (-e), with -v
## forwarded when verbosity was requested.
## NOTE(review): basename() drops the directory of the .prot file, so an .embl file
## sitting next to a .prot file in another directory is not found -- confirm that
## this lookup in the working directory is intended
if ($update){
    foreach my $file (@prot_files){
        my $prefix = basename($file,".prot");
        my $embl_file = "$prefix.embl";
        if (-f $embl_file){
            ## List form of system() bypasses the shell, so file names containing
            ## spaces or shell metacharacters are passed through untouched
            my @command = ($script, '-e', $embl_file);
            push (@command, '-v') if $verb;
            if (system(@command) != 0){
                print "[W] $script could not be run successfully on $embl_file\n";
            }
        }
        else {
            print "[W] $file has no correlating .embl file\n";
        }
    }
}
## Checking for problems in the .prot files
## Every sequence is tested for two abnormalities:
##   - it does not start with M (the first residue is reported instead)
##   - it contains a non-word character (reported as X in the stop codon column)
## Sequences without abnormalities are not listed; a file without any abnormal
## sequence gets a single "OK" line.

## Writes a message to the log file (when -o was given) and to STDOUT
my $report = sub {
    my ($message) = @_;
    print OUT $message if $out;
    print $message;
};

for my $prot_file (@prot_files){
    open IN, "<", $prot_file or die "Can't open $prot_file: $!\n";
    my ($filename,$dir) = fileparse($prot_file);

    ## Loading the sequences, keyed by locus tag (first word of the FASTA header)
    my %sequences;
    my $locus;
    while (my $line = <IN>){
        chomp $line;
        ## chomp leaves carriage returns and trailing blanks behind; both would
        ## match \W below and be reported as internal stop codons
        $line =~ s/\s+$//;
        if ($line =~ /^>(\S+)/){
            $locus = $1;
            next;
        }
        ## Lines found before the first header belong to no sequence
        next unless defined $locus;
        $sequences{$locus} .= $line;
    }
    close IN;

    if ($verb) { print "\nChecking for errors in $prot_file located in $dir\n"; }

    ## Number of abnormal sequences found in this file
    my $count = 0;

    ## Iterating through locus tags in database %sequences
    foreach my $locus_tag (sort(keys %sequences)){
        my $sequence = $sequences{$locus_tag};
        my $bad_start = ($sequence !~ /^M/);
        my $bad_stop = ($sequence =~ /\W/);
        next unless ($bad_start || $bad_stop);

        ## The table header is printed once per file, ahead of the first problem
        unless ($count){
            $report->("\n\t\tInvalid Start Codon\tInternal Stop Codon\n\n");
        }
        $count++;

        ## Start column: '.' for a proper methionine, otherwise the offending
        ## first character ('-' when the sequence is empty)
        my $start_flag = '.';
        if ($bad_start){
            $start_flag = length($sequence) ? substr($sequence,0,1) : '-';
        }

        ## Stop column: X when a non-word character is present, '.' otherwise
        my $stop_flag = $bad_stop ? 'X' : '.';

        $report->("$locus_tag\t\t$start_flag\t\t\t$stop_flag\n");
    }

    ## No error found
    unless ($count){
        $report->("\nOK: No error found in $prot_file\n");
    }
}
print "\n";

## Buffered write errors only surface when the handle is closed
if ($out){
    close OUT or die "Can't close $out: $!\n";
}