Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion docs/training.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,22 @@ Unless you wish to do it again yourself, you can skip to the next step! If you d

```bash
wget https://files.wwpdb.org/pub/pdb/data/monomers/components.cif
python ccd.py --components components.cif --outdir ./ccd
python ccd.py --components components.cif --outdir ./ccd --with_symmetries
```

> Note: runs in parallel by default with as many threads as cpu cores on your machine, can be changed with `--num_processes`

Later steps will use redis for memory-efficient parallel access to the pickled CCD dictionary.
To create a redis database from the pickled CCD file, run:
```bash
# in a separate terminal
./redis-server --save "" --appendonly no --port 7777 --dbfilename ccd.rdb
```
And then to update the `ccd.rdb` file with the pickled data:
```bash
python3 ccd_to_redis.py --ccd_in ccd/symmetry.pkl
```

#### Step 4: Create sequence clusters

First, you must create a fasta file containing all the polymer sequences present in your data. You can use any header format you want for the sequences, it will not be used.
Expand Down
81 changes: 54 additions & 27 deletions scripts/process/ccd.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Compute conformers and symmetries for all the CCD molecules."""

import argparse
import copy
import multiprocessing
import pickle
import sys
Expand All @@ -13,7 +14,7 @@
from pdbeccdutils.core import ccd_reader
from pdbeccdutils.core.component import ConformerType
from rdkit import rdBase
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem.rdchem import Conformer, Mol
from tqdm import tqdm

Expand Down Expand Up @@ -124,45 +125,54 @@ def get_conformer(mol: Mol, c_type: ConformerType) -> Conformer:
raise ValueError(msg)


def compute_symmetries(mol: Mol) -> list[list[int]]:
def compute_symmetries(mol: Mol, return_mol: bool = False) -> list[list[int]] | tuple[list[list[int]], Mol | None]:
"""Compute the symmetries of a molecule.

Parameters
----------
mol : Mol
The molecule to process
return_mol : bool, optional
Whether to return the molecule with symmetries set as a prop, by default False

Returns
-------
list[list[int]]
The symmetries as a list of index permutations

Mol | None
The modified molecule, if return_mol is True
"""
mol = AllChem.RemoveHs(mol)
idx_map = {}
atom_idx = 0
for i, atom in enumerate(mol.GetAtoms()):
# Skip if leaving atoms
if int(atom.GetProp("leaving_atom")):
continue
idx_map[i] = atom_idx
atom_idx += 1

# Calculate self permutations
permutations = []
raw_permutations = mol.GetSubstructMatches(mol, uniquify=False)
for raw_permutation in raw_permutations:
# Filter out permutations with leaving atoms
try:
if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()):
permutation = [
idx_map[idx] for idx in raw_permutation if idx in idx_map
]
permutations.append(permutation)
except Exception: # noqa: S110, PERF203, BLE001
pass
serialized_permutations = pickle.dumps(permutations)
mol.SetProp("symmetries", serialized_permutations.hex())
try:
mol = Chem.RemoveHs(mol)
idx_map = {}
atom_idx = 0
for i, atom in enumerate(mol.GetAtoms()):
# Skip if leaving atoms
if int(atom.GetProp("leaving_atom")):
continue
idx_map[i] = atom_idx
atom_idx += 1

# Calculate self permutations
raw_permutations = mol.GetSubstructMatches(mol, uniquify=False)
for raw_permutation in raw_permutations:
# Filter out permutations with leaving atoms
try:
if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()):
permutation = [
idx_map[idx] for idx in raw_permutation if idx in idx_map
]
permutations.append(permutation)
except Exception: # noqa: S110, PERF203, BLE001
pass
serialized_permutations = pickle.dumps(permutations)
mol.SetProp("symmetries", serialized_permutations.hex())
except Exception:
mol = None

if return_mol:
return permutations, mol
return permutations


Expand Down Expand Up @@ -281,6 +291,18 @@ def main(args: argparse.Namespace) -> None:
with path.open("wb") as f:
pickle.dump(molecules, f)

# Optionally compute and dump symmetries
if not args.with_symmetries:
return
print("Computing symmetries") # noqa: T201
symmetries = copy.deepcopy(molecules)
for name in tqdm(symmetries):
symmetries[name] = compute_symmetries(symmetries[name], return_mol=True)[1]
# Dump the components
path = outdir / "symmetry.pkl"
with path.open("wb") as f:
pickle.dump(symmetries, f)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -291,5 +313,10 @@ def main(args: argparse.Namespace) -> None:
type=int,
default=multiprocessing.cpu_count(),
)
parser.add_argument(
"--with_symmetries",
action="store_true",
help="Whether to compute symmetries to additionally save the symmetry.pkl file.",
)
args = parser.parse_args()
main(args)
85 changes: 85 additions & 0 deletions scripts/process/ccd_to_redis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import argparse
import pickle

from rdkit import Chem
from redis import Redis


def load_pickled_dict_into_redis(
path_in: str,
host: str = "localhost",
port: int = 7777,
db: int = 0,
batch: int = 10_000,
):
"""
Load a pickled dict of {str -> any_serializable} into Redis.

Parameters
----------
path_in : str
Path to the pickled dict file.
host : str, optional
Redis host, by default "localhost".
port : int, optional
Redis port, by default 7777.
db : int, optional
Redis database number, by default 0.
batch : int, optional
Number of keys to set per pipeline execution, by default 10_000.
"""
# 1) read your dict
with open(path_in, "rb") as f:
data = pickle.load(f) # {str -> any_serializable}

# 2) connect; keep decode_responses=False so .get() returns bytes
r = Redis(host=host, port=port, db=db, decode_responses=False)

# 3) write as pickled bytes
# we want all RDKit properties pickled
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
pipe = r.pipeline(transaction=False)
i = 0
for k, v in data.items():
if not isinstance(k, str):
raise TypeError(f"Key is not str: {k!r}")
b = pickle.dumps(v, protocol=pickle.HIGHEST_PROTOCOL)
pipe.set(k, b)
i += 1
if i % batch == 0:
pipe.execute()
pipe.execute()

# 4) Snapshot to path_out
r.bgsave()


if __name__ == "__main__":
"""
Assumes Redis is running locally.

./redis-server \
--save "" --appendonly no --port 7777 \
--dir /abs/path/to/outdir \
--dbfilename mydump.rdb
"""
parser = argparse.ArgumentParser()
parser.add_argument("--ccd_in", type=str, help="Input file to convert.")
parser.add_argument(
"--redis-host",
type=str,
default="localhost",
help="The Redis host.",
)
parser.add_argument(
"--redis-port",
type=int,
default=7777,
help="The Redis port.",
)
args = parser.parse_args()
load_pickled_dict_into_redis(
path_in=args.ccd_in,
host=args.redis_host,
port=args.redis_port,
)