Skip to content

Commit 4b75eda

Browse files
authored
Merge pull request #35 from aqlaboratory/bugfix/template-sequence-check
Sequence filters fix
2 parents e1e0164 + cf00624 commit 4b75eda

File tree

1 file changed

+25
-19
lines changed

1 file changed

+25
-19
lines changed

openfold3/core/data/primitives/sequence/template.py

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def check_sequence(
148148
max_subseq: float = 0.95,
149149
min_align: float = 0.1,
150150
min_len: int = 10,
151-
) -> bool:
151+
) -> tuple[bool, np.ndarray | None, np.ndarray | None]:
152152
"""Applies sequence filters to template hits following AF3 SI Section 2.4.
153153
154154
Args:
@@ -168,31 +168,37 @@ def check_sequence(
168168
Whether the hit is filtered out (True when any sequence filter triggers, False when
169169
the hit is kept) and the aligned query and hit sequences as numpy arrays (both None when filtered out).
170170
"""
171-
query_seq = query.hit_sequence.replace("-", "")
172-
hit_seq = hit.hit_sequence.replace("-", "")
173-
if len(hit_seq) < min_len:
174-
return True, None, None
175-
query_aln = np.frombuffer(
176-
query.hit_sequence.replace(".", "-").encode("ascii"), dtype="S1"
177-
)
178-
hit_aln = np.frombuffer(
179-
hit.hit_sequence.replace(".", "-").encode("ascii"), dtype="S1"
180-
)
171+
q_aln_str = query.hit_sequence.replace(".", "-")
172+
h_aln_str = hit.hit_sequence.replace(".", "-")
181173

182-
query_not_gap = query_aln != b"-"
183-
hit_not_gap = hit_aln != b"-"
174+
query_sequence = q_aln_str.replace("-", "").upper()
175+
matching_sequence = h_aln_str.replace("-", "").upper()
184176

185-
columns_to_keep = query_not_gap & hit_not_gap
186-
covered = columns_to_keep.sum()
177+
# Filter out the hit if the ungapped query sequence is empty (q_len is used as a divisor below)
178+
q_len = len(query_sequence)
179+
if q_len == 0:
180+
return True, None, None
181+
182+
# Hits that are too short
183+
if len(matching_sequence) < min_len:
184+
return True, None, None
187185

188-
coverage = covered / (len(query_seq) or 1)
186+
query_aln = np.frombuffer(q_aln_str.encode("ascii"), dtype="S1")
187+
hit_aln = np.frombuffer(h_aln_str.encode("ascii"), dtype="S1")
189188

190-
if coverage < min_align:
189+
# Hits with too few aligned columns
190+
is_upper = (hit_aln >= b"A") & (hit_aln <= b"Z")
191+
n_aligned = int(is_upper.sum())
192+
align_ratio = n_aligned / q_len
193+
if align_ratio <= min_align:
191194
return True, None, None
192195

193-
identical = (columns_to_keep & (query_not_gap == hit_not_gap)).sum()
196+
# Hits that are an exact contiguous substring of the query and longer than max_subseq of its length
197+
length_ratio = len(matching_sequence) / q_len
198+
if (length_ratio > max_subseq) and (matching_sequence in query_sequence):
199+
return True, None, None
194200

195-
return coverage >= max_subseq and identical == covered, query_aln, hit_aln
201+
return False, query_aln, hit_aln
196202

197203

198204
def parse_release_date(cif_file: CIFFile) -> datetime:

0 commit comments

Comments
 (0)