@@ -148,7 +148,7 @@ def check_sequence(
148148 max_subseq : float = 0.95 ,
149149 min_align : float = 0.1 ,
150150 min_len : int = 10 ,
151- ) -> bool :
151+ ) -> tuple [ bool , np . ndarray | None , np . ndarray | None ] :
152152 """Applies sequence filters to template hits following AF3 SI Section 2.4.
153153
154154 Args:
@@ -168,31 +168,37 @@ def check_sequence(
168168 Whether the hit passes the sequence filters and the aligned query and hit
169169 sequences as numpy arrays.
170170 """
171- query_seq = query .hit_sequence .replace ("-" , "" )
172- hit_seq = hit .hit_sequence .replace ("-" , "" )
173- if len (hit_seq ) < min_len :
174- return True , None , None
175- query_aln = np .frombuffer (
176- query .hit_sequence .replace ("." , "-" ).encode ("ascii" ), dtype = "S1"
177- )
178- hit_aln = np .frombuffer (
179- hit .hit_sequence .replace ("." , "-" ).encode ("ascii" ), dtype = "S1"
180- )
171+ q_aln_str = query .hit_sequence .replace ("." , "-" )
172+ h_aln_str = hit .hit_sequence .replace ("." , "-" )
181173
182- query_not_gap = query_aln != b "-"
183- hit_not_gap = hit_aln != b "-"
174+ query_sequence = q_aln_str . replace ( "-" , "" ). upper ()
175+ matching_sequence = h_aln_str . replace ( "-" , "" ). upper ()
184176
185- columns_to_keep = query_not_gap & hit_not_gap
186- covered = columns_to_keep .sum ()
177+ # Fail if no query sequence for whatever reason
178+ q_len = len (query_sequence )
179+ if q_len == 0 :
180+ return True , None , None
181+
182+ # Hits that are too short
183+ if len (matching_sequence ) < min_len :
184+ return True , None , None
187185
188- coverage = covered / (len (query_seq ) or 1 )
186+ query_aln = np .frombuffer (q_aln_str .encode ("ascii" ), dtype = "S1" )
187+ hit_aln = np .frombuffer (h_aln_str .encode ("ascii" ), dtype = "S1" )
189188
190- if coverage < min_align :
189+ # Hits with too few aligned columns
190+ is_upper = (hit_aln >= b"A" ) & (hit_aln <= b"Z" )
191+ n_aligned = int (is_upper .sum ())
192+ align_ratio = n_aligned / q_len
193+ if align_ratio <= min_align :
191194 return True , None , None
192195
193- identical = (columns_to_keep & (query_not_gap == hit_not_gap )).sum ()
196+ # Hits with too long exact contiguous subsequence
197+ length_ratio = len (matching_sequence ) / q_len
198+ if (length_ratio > max_subseq ) and (matching_sequence in query_sequence ):
199+ return True , None , None
194200
195- return coverage >= max_subseq and identical == covered , query_aln , hit_aln
201+ return False , query_aln , hit_aln
196202
197203
198204def parse_release_date (cif_file : CIFFile ) -> datetime :
0 commit comments