Skip to content

Commit 6de1365

Browse files
committed
Add example
1 parent 5b0259f commit 6de1365

File tree

1 file changed

+129
-0
lines changed

1 file changed

+129
-0
lines changed
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
r"""
2+
LDDT for predicted structure evaluation
3+
=======================================
4+
5+
This example evaluates the quality of a predicted structure from *AlphaFold DB* compared
6+
to the experimental structure of a protein of interest by the means of the lDDT score.
7+
Furthermore, the measured lDDT score is compared to the pLDDT score predicted by the
8+
model.
9+
"""
10+
11+
# Code source: Patrick Kunzmann
12+
# License: BSD 3 clause
13+
14+
import matplotlib.pyplot as plt
15+
import numpy as np
16+
import biotite
17+
import biotite.database.afdb as afdb
18+
import biotite.database.rcsb as rcsb
19+
import biotite.sequence as seq
20+
import biotite.sequence.align as align
21+
import biotite.structure as struc
22+
import biotite.structure.io.pdbx as pdbx
23+
24+
# Uniprot ID of the protein of interest (in this case human beta-galactosidase)
25+
UNIPROT_ID = "P16278"
26+
27+
28+
## Get the reference experimental structure from the PDB
29+
query = rcsb.FieldQuery(
30+
"rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
31+
exact_match=UNIPROT_ID,
32+
)
33+
# The UniProt ID is defined for a single chain
34+
ids = rcsb.search(query, return_type="polymer_instance")
35+
# Simply use the first matching chain as reference
36+
pdb_id, chain_id = ids[0].split(".")
37+
pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif"))
38+
reference = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
39+
reference = reference[reference.chain_id == chain_id]
40+
# The experimental structure may contain additional small molecules
41+
# (e.g. water, ions etc.) that are not part of the predicted structure
42+
reference = reference[struc.filter_amino_acids(reference)]
43+
44+
45+
## Get the predicted structure from AlphaFold DB
46+
pdbx_file = pdbx.BinaryCIFFile.read(afdb.fetch(UNIPROT_ID, "bcif"))
47+
# Use 'label_<x>' fields to make sure the residue ID is the the same as given in the
48+
# `ma_qa_metric_local` category, where the pLDDT is obtained from
49+
model = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
50+
51+
52+
## Filter the structures to common atoms that are present in both structures
53+
reference_sequence = struc.to_sequence(reference)[0][0]
54+
model_sequence = struc.to_sequence(model)[0][0]
55+
# This script does not rely on consistent residue numbering,
56+
# so a sequence alignment is done instead
57+
identity_matrix = align.SubstitutionMatrix(
58+
seq.ProteinSequence.alphabet,
59+
seq.ProteinSequence.alphabet,
60+
np.eye(len(seq.ProteinSequence.alphabet), dtype=int),
61+
)
62+
alignment = align.align_optimal(
63+
reference_sequence,
64+
model_sequence,
65+
# Residues might be missing due to experimental reasons but not due to homology
66+
# -> use a simple identity matrix
67+
identity_matrix,
68+
gap_penalty=-1,
69+
terminal_penalty=False,
70+
max_number=1,
71+
)[0]
72+
# Remove residues from alignment
73+
# that have no correspondence in the respective other structure
74+
# -> Remove gaps (-1 entries in trace)
75+
alignment = alignment[(alignment.trace != -1).all(axis=1)]
76+
# Map the remaining alignment columns to atom indices
77+
reference = reference[
78+
# Each mask is True for all atoms in one residue
79+
struc.get_residue_masks(reference, struc.get_residue_starts(reference)) \
80+
# Only keep masks for residues that correspond to remaining alignment columns
81+
[alignment.trace[:,0]] \
82+
# And aggregate them to get a single mask
83+
.any(axis=0)
84+
] # fmt: skip
85+
model = model[
86+
struc.get_residue_masks(model, struc.get_residue_starts(model))[
87+
alignment.trace[:, 1]
88+
].any(axis=0)
89+
]
90+
91+
92+
## Get predicted lDDT from the model file
93+
plddt_category = pdbx_file.block["ma_qa_metric_local"]
94+
plddt_res_ids = plddt_category["label_seq_id"].as_array(int)
95+
plddt = plddt_category["metric_value"].as_array(float) / 100
96+
# Remove values for residues that were removed in the alignment process
97+
mask = np.isin(plddt_res_ids, model.res_id)
98+
plddt_res_ids = plddt_res_ids[mask]
99+
plddt = plddt[mask]
100+
101+
102+
## Compute actual lDDT by comparing the model to the reference
103+
lddt_res_ids = np.unique(model.res_id)
104+
# The pLDDT predicts the lDDT of CA atoms, so for consistency we do the same
105+
ca_mask = model.atom_name == "CA"
106+
lddt = struc.lddt(reference[ca_mask], model[ca_mask], aggregation="residue")
107+
108+
109+
## Compare predicted to measured lDDT
110+
fig, ax = plt.subplots(figsize=(8.0, 4.0))
111+
ax.plot(
112+
plddt_res_ids,
113+
plddt,
114+
color=biotite.colors["dimgreen"],
115+
linestyle="-",
116+
label="predicted",
117+
)
118+
ax.plot(
119+
lddt_res_ids,
120+
lddt,
121+
color=biotite.colors["lightorange"],
122+
linestyle="-",
123+
label="measured",
124+
)
125+
ax.legend()
126+
ax.set_xlabel("Residue ID")
127+
ax.set_ylabel("lDDT")
128+
ax.autoscale(axis="x", tight=True)
129+
plt.show()

0 commit comments

Comments
 (0)