Add example

padix-key · padix-key · commit 6de13651ed45 · 2025-02-02T10:22:13.000+01:00
diff --git a/doc/examples/scripts/structure/modeling/model_lddt.py b/doc/examples/scripts/structure/modeling/model_lddt.py
@@ -0,0 +1,129 @@
+r"""
+LDDT for predicted structure evaluation
+=======================================
+
+This example evaluates the quality of a predicted structure from *AlphaFold DB* compared
+to the experimental structure of a protein of interest by means of the lDDT score.
+Furthermore, the measured lDDT score is compared to the pLDDT score predicted by the
+model.
+"""
+
+# Code source: Patrick Kunzmann
+# License: BSD 3 clause
+
+import matplotlib.pyplot as plt
+import numpy as np
+import biotite
+import biotite.database.afdb as afdb
+import biotite.database.rcsb as rcsb
+import biotite.sequence as seq
+import biotite.sequence.align as align
+import biotite.structure as struc
+import biotite.structure.io.pdbx as pdbx
+
+# UniProt ID of the protein of interest (in this case human beta-galactosidase)
+UNIPROT_ID = "P16278"
+
+
+## Get the reference experimental structure from the PDB
+# Search the chains (polymer instances) that are annotated with the UniProt ID
+uniprot_query = rcsb.FieldQuery(
+    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
+    exact_match=UNIPROT_ID,
+)
+instance_ids = rcsb.search(uniprot_query, return_type="polymer_instance")
+# Simply use the first matching chain as reference
+pdb_id, chain_id = instance_ids[0].split(".")
+reference_bcif = rcsb.fetch(pdb_id, "bcif")
+pdbx_file = pdbx.BinaryCIFFile.read(reference_bcif)
+all_chains = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
+reference = all_chains[all_chains.chain_id == chain_id]
+# Keep amino acids only: small molecules (e.g. water, ions) are not in the prediction
+reference = reference[struc.filter_amino_acids(reference)]
+
+
+## Get the predicted structure from AlphaFold DB
+predicted_bcif = afdb.fetch(UNIPROT_ID, "bcif")
+pdbx_file = pdbx.BinaryCIFFile.read(predicted_bcif)
+# 'label_<x>' fields: residue IDs then match the `ma_qa_metric_local` (pLDDT) category
+model = pdbx.get_structure(pdbx_file, use_author_fields=False, model=1)
+
+
+## Filter the structures to the residues that are present in both structures
+# The residue numbering is not assumed to be consistent between the structures,
+# so corresponding residues are identified via a sequence alignment instead
+ref_sequence = struc.to_sequence(reference)[0][0]
+pred_sequence = struc.to_sequence(model)[0][0]
+protein_alphabet = seq.ProteinSequence.alphabet
+# Residues might be missing due to experimental reasons but not due to homology
+# -> use a simple identity matrix
+identity_matrix = align.SubstitutionMatrix(
+    protein_alphabet,
+    protein_alphabet,
+    np.eye(len(protein_alphabet), dtype=int),
+)
+# Terminal gaps are free (`terminal_penalty=False`)
+# and at most one optimal alignment is requested (`max_number=1`)
+alignments = align.align_optimal(
+    ref_sequence,
+    pred_sequence,
+    identity_matrix,
+    gap_penalty=-1,
+    terminal_penalty=False,
+    max_number=1,
+)
+alignment = alignments[0]
+# Remove alignment columns that have no correspondence
+# in the respective other structure -> remove gaps (-1 entries in trace)
+has_no_gap = (alignment.trace != -1).all(axis=1)
+alignment = alignment[has_no_gap]
+# Map the remaining alignment columns to atom indices:
+# each residue mask is True for all atoms in one residue
+ref_masks = struc.get_residue_masks(reference, struc.get_residue_starts(reference))
+model_masks = struc.get_residue_masks(model, struc.get_residue_starts(model))
+# Only keep masks for residues that correspond to remaining alignment columns
+ref_masks = ref_masks[alignment.trace[:, 0]]
+model_masks = model_masks[alignment.trace[:, 1]]
+# And aggregate them to get a single atom mask per structure
+reference = reference[ref_masks.any(axis=0)]
+model = model[model_masks.any(axis=0)]
+
+
+## Get predicted lDDT from the model file
+qa_metrics = pdbx_file.block["ma_qa_metric_local"]
+all_res_ids = qa_metrics["label_seq_id"].as_array(int)
+# Divide by 100 to bring the pLDDT (0 to 100) to the same scale as the lDDT (0 to 1)
+all_plddt = qa_metrics["metric_value"].as_array(float) / 100
+# Remove values for residues that were removed in the alignment process
+in_model = np.isin(all_res_ids, model.res_id)
+plddt_res_ids, plddt = all_res_ids[in_model], all_plddt[in_model]
+
+
+## Compute actual lDDT by comparing the model to the reference
+lddt_res_ids = np.unique(model.res_id)
+# Like the pLDDT, use CA atoms only - masked per structure, as atom counts may differ
+ca_reference, ca_model = (s[s.atom_name == "CA"] for s in (reference, model))
+lddt = struc.lddt(ca_reference, ca_model, aggregation="residue")
+
+
+## Compare predicted to measured lDDT
+fig, ax = plt.subplots(figsize=(8.0, 4.0))
+# Draw both per-residue profiles into the same axes
+curves = [
+    (plddt_res_ids, plddt, "dimgreen", "predicted"),
+    (lddt_res_ids, lddt, "lightorange", "measured"),
+]
+for res_ids, scores, color_name, label in curves:
+    ax.plot(
+        res_ids,
+        scores,
+        color=biotite.colors[color_name],
+        linestyle="-",
+        label=label,
+    )
+ax.legend()
+ax.set_xlabel("Residue ID")
+ax.set_ylabel("lDDT")
+# Fit the x-axis limits tightly to the plotted data
+ax.autoscale(axis="x", tight=True)
+plt.show()