|
| 1 | +r""" |
| 2 | +LDDT for predicted structure evaluation |
| 3 | +======================================= |
| 4 | +
|
| 5 | +This example evaluates the quality of a predicted structure from *AlphaFold DB* compared |
| 6 | +to the experimental structure of a protein of interest by the means of the lDDT score. |
| 7 | +Furthermore, the measured lDDT score is compared to the pLDDT score predicted by the |
| 8 | +model. |
| 9 | +""" |
| 10 | + |
| 11 | +# Code source: Patrick Kunzmann |
| 12 | +# License: BSD 3 clause |
| 13 | + |
| 14 | +import matplotlib.pyplot as plt |
| 15 | +import numpy as np |
| 16 | +import biotite |
| 17 | +import biotite.database.afdb as afdb |
| 18 | +import biotite.database.rcsb as rcsb |
| 19 | +import biotite.sequence as seq |
| 20 | +import biotite.sequence.align as align |
| 21 | +import biotite.structure as struc |
| 22 | +import biotite.structure.io.pdbx as pdbx |
| 23 | + |
| 24 | +# Uniprot ID of the protein of interest (in this case human beta-galactosidase) |
| 25 | +UNIPROT_ID = "P16278" |
| 26 | + |
| 27 | + |
| 28 | +## Get the reference experimental structure from the PDB |
| 29 | +query = rcsb.FieldQuery( |
| 30 | + "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession", |
| 31 | + exact_match=UNIPROT_ID, |
| 32 | +) |
| 33 | +# The UniProt ID is defined for a single chain |
| 34 | +ids = rcsb.search(query, return_type="polymer_instance") |
| 35 | +# Simply use the first matching chain as reference |
| 36 | +pdb_id, chain_id = ids[0].split(".") |
| 37 | +pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif")) |
| 38 | +reference = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) |
| 39 | +reference = reference[reference.chain_id == chain_id] |
| 40 | +# The experimental structure may contain additional small molecules |
| 41 | +# (e.g. water, ions etc.) that are not part of the predicted structure |
| 42 | +reference = reference[struc.filter_amino_acids(reference)] |
| 43 | + |
| 44 | + |
| 45 | +## Get the predicted structure from AlphaFold DB |
| 46 | +pdbx_file = pdbx.BinaryCIFFile.read(afdb.fetch(UNIPROT_ID, "bcif")) |
| 47 | +# Use 'label_<x>' fields to make sure the residue ID is the the same as given in the |
| 48 | +# `ma_qa_metric_local` category, where the pLDDT is obtained from |
| 49 | +model = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) |
| 50 | + |
| 51 | + |
| 52 | +## Filter the structures to common atoms that are present in both structures |
| 53 | +reference_sequence = struc.to_sequence(reference)[0][0] |
| 54 | +model_sequence = struc.to_sequence(model)[0][0] |
| 55 | +# This script does not rely on consistent residue numbering, |
| 56 | +# so a sequence alignment is done instead |
| 57 | +identity_matrix = align.SubstitutionMatrix( |
| 58 | + seq.ProteinSequence.alphabet, |
| 59 | + seq.ProteinSequence.alphabet, |
| 60 | + np.eye(len(seq.ProteinSequence.alphabet), dtype=int), |
| 61 | +) |
| 62 | +alignment = align.align_optimal( |
| 63 | + reference_sequence, |
| 64 | + model_sequence, |
| 65 | + # Residues might be missing due to experimental reasons but not due to homology |
| 66 | + # -> use a simple identity matrix |
| 67 | + identity_matrix, |
| 68 | + gap_penalty=-1, |
| 69 | + terminal_penalty=False, |
| 70 | + max_number=1, |
| 71 | +)[0] |
| 72 | +# Remove residues from alignment |
| 73 | +# that have no correspondence in the respective other structure |
| 74 | +# -> Remove gaps (-1 entries in trace) |
| 75 | +alignment = alignment[(alignment.trace != -1).all(axis=1)] |
| 76 | +# Map the remaining alignment columns to atom indices |
| 77 | +reference = reference[ |
| 78 | + # Each mask is True for all atoms in one residue |
| 79 | + struc.get_residue_masks(reference, struc.get_residue_starts(reference)) \ |
| 80 | + # Only keep masks for residues that correspond to remaining alignment columns |
| 81 | + [alignment.trace[:,0]] \ |
| 82 | + # And aggregate them to get a single mask |
| 83 | + .any(axis=0) |
| 84 | +] # fmt: skip |
| 85 | +model = model[ |
| 86 | + struc.get_residue_masks(model, struc.get_residue_starts(model))[ |
| 87 | + alignment.trace[:, 1] |
| 88 | + ].any(axis=0) |
| 89 | +] |
| 90 | + |
| 91 | + |
| 92 | +## Get predicted lDDT from the model file |
| 93 | +plddt_category = pdbx_file.block["ma_qa_metric_local"] |
| 94 | +plddt_res_ids = plddt_category["label_seq_id"].as_array(int) |
| 95 | +plddt = plddt_category["metric_value"].as_array(float) / 100 |
| 96 | +# Remove values for residues that were removed in the alignment process |
| 97 | +mask = np.isin(plddt_res_ids, model.res_id) |
| 98 | +plddt_res_ids = plddt_res_ids[mask] |
| 99 | +plddt = plddt[mask] |
| 100 | + |
| 101 | + |
| 102 | +## Compute actual lDDT by comparing the model to the reference |
| 103 | +lddt_res_ids = np.unique(model.res_id) |
| 104 | +# The pLDDT predicts the lDDT of CA atoms, so for consistency we do the same |
| 105 | +ca_mask = model.atom_name == "CA" |
| 106 | +lddt = struc.lddt(reference[ca_mask], model[ca_mask], aggregation="residue") |
| 107 | + |
| 108 | + |
| 109 | +## Compare predicted to measured lDDT |
| 110 | +fig, ax = plt.subplots(figsize=(8.0, 4.0)) |
| 111 | +ax.plot( |
| 112 | + plddt_res_ids, |
| 113 | + plddt, |
| 114 | + color=biotite.colors["dimgreen"], |
| 115 | + linestyle="-", |
| 116 | + label="predicted", |
| 117 | +) |
| 118 | +ax.plot( |
| 119 | + lddt_res_ids, |
| 120 | + lddt, |
| 121 | + color=biotite.colors["lightorange"], |
| 122 | + linestyle="-", |
| 123 | + label="measured", |
| 124 | +) |
| 125 | +ax.legend() |
| 126 | +ax.set_xlabel("Residue ID") |
| 127 | +ax.set_ylabel("lDDT") |
| 128 | +ax.autoscale(axis="x", tight=True) |
| 129 | +plt.show() |
0 commit comments