Merge pull request #35 from aqlaboratory/bugfix/template-sequence-check

christinaflo · web-flow · commit 4b75edab0e1d · 2026-01-20T17:23:02.000+01:00
Sequence filters fix
diff --git a/openfold3/core/data/primitives/sequence/template.py b/openfold3/core/data/primitives/sequence/template.py
@@ -148,7 +148,7 @@ def check_sequence(
     max_subseq: float = 0.95,
     min_align: float = 0.1,
     min_len: int = 10,
-) -> bool:
+) -> tuple[bool, np.ndarray | None, np.ndarray | None]:
     """Applies sequence filters to template hits following AF3 SI Section 2.4.
 
     Args:
@@ -168,31 +168,37 @@ def check_sequence(
             Whether the hit passes the sequence filters and the aligned query and hit
             sequences as numpy arrays.
     """
-    query_seq = query.hit_sequence.replace("-", "")
-    hit_seq = hit.hit_sequence.replace("-", "")
-    if len(hit_seq) < min_len:
-        return True, None, None
-    query_aln = np.frombuffer(
-        query.hit_sequence.replace(".", "-").encode("ascii"), dtype="S1"
-    )
-    hit_aln = np.frombuffer(
-        hit.hit_sequence.replace(".", "-").encode("ascii"), dtype="S1"
-    )
+    q_aln_str = query.hit_sequence.replace(".", "-")
+    h_aln_str = hit.hit_sequence.replace(".", "-")
 
-    query_not_gap = query_aln != b"-"
-    hit_not_gap = hit_aln != b"-"
+    query_sequence = q_aln_str.replace("-", "").upper()
+    matching_sequence = h_aln_str.replace("-", "").upper()
 
-    columns_to_keep = query_not_gap & hit_not_gap
-    covered = columns_to_keep.sum()
+    # Fail if no query sequence for whatever reason
+    q_len = len(query_sequence)
+    if q_len == 0:
+        return True, None, None
+
+    # Hits that are too short
+    if len(matching_sequence) < min_len:
+        return True, None, None
 
-    coverage = covered / (len(query_seq) or 1)
+    query_aln = np.frombuffer(q_aln_str.encode("ascii"), dtype="S1")
+    hit_aln = np.frombuffer(h_aln_str.encode("ascii"), dtype="S1")
 
-    if coverage < min_align:
+    # Hits with too few aligned columns
+    is_upper = (hit_aln >= b"A") & (hit_aln <= b"Z")
+    n_aligned = int(is_upper.sum())
+    align_ratio = n_aligned / q_len
+    if align_ratio <= min_align:
         return True, None, None
 
-    identical = (columns_to_keep & (query_not_gap == hit_not_gap)).sum()
+    # Hits with too long exact contiguous subsequence
+    length_ratio = len(matching_sequence) / q_len
+    if (length_ratio > max_subseq) and (matching_sequence in query_sequence):
+        return True, None, None
 
-    return coverage >= max_subseq and identical == covered, query_aln, hit_aln
+    return False, query_aln, hit_aln
 
 
 def parse_release_date(cif_file: CIFFile) -> datetime: