diff --git a/docs/training.md b/docs/training.md index 2f586c282..94f25a9fd 100644 --- a/docs/training.md +++ b/docs/training.md @@ -146,11 +146,22 @@ Unless you wish to do it again yourself, you can skip to the next step! If you d ```bash wget https://files.wwpdb.org/pub/pdb/data/monomers/components.cif -python ccd.py --components components.cif --outdir ./ccd +python ccd.py --components components.cif --outdir ./ccd --with_symmetries ``` > Note: runs in parallel by default with as many threads as cpu cores on your machine, can be changed with `--num_processes` +Later steps will use redis for memory-efficient parallel access to the pickled CCD dictionary. +To create a redis database from the pickled CCD file, run: +```bash +# in a separate terminal +./redis-server --save "" --appendonly no --port 7777 --dbfilename ccd.rdb +``` +And then to update the `ccd.rdb` file with the pickled data: +```bash +python3 ccd_to_redis.py --ccd_in ccd/symmetry.pkl +``` + #### Step 4: Create sequence clusters First, you must create a fasta file containing all the polymer sequences present in your data. You can use any header format you want for the sequences, it will not be used. diff --git a/scripts/process/ccd.py b/scripts/process/ccd.py index 080b22859..74346dc69 100644 --- a/scripts/process/ccd.py +++ b/scripts/process/ccd.py @@ -1,6 +1,7 @@ """Compute conformers and symmetries for all the CCD molecules.""" import argparse +import copy import multiprocessing import pickle import sys @@ -13,7 +14,7 @@ from pdbeccdutils.core import ccd_reader from pdbeccdutils.core.component import ConformerType from rdkit import rdBase -from rdkit.Chem import AllChem +from rdkit import Chem from rdkit.Chem.rdchem import Conformer, Mol from tqdm import tqdm @@ -124,45 +125,54 @@ def get_conformer(mol: Mol, c_type: ConformerType) -> Conformer: raise ValueError(msg) -def compute_symmetries(mol: Mol) -> list[list[int]]: +def compute_symmetries(mol: Mol, return_mol: bool = False) -> list[list[int]] | tuple[list[list[int]], Mol | None]: """Compute the symmetries of a molecule. Parameters ---------- mol : Mol The molecule to process + return_mol : bool, optional + Whether to return the molecule with symmetries set as a prop, by default False Returns ------- list[list[int]] The symmetries as a list of index permutations - + Mol | None + The modified molecule, if return_mol is True """ - mol = AllChem.RemoveHs(mol) - idx_map = {} - atom_idx = 0 - for i, atom in enumerate(mol.GetAtoms()): - # Skip if leaving atoms - if int(atom.GetProp("leaving_atom")): - continue - idx_map[i] = atom_idx - atom_idx += 1 - - # Calculate self permutations permutations = [] - raw_permutations = mol.GetSubstructMatches(mol, uniquify=False) - for raw_permutation in raw_permutations: - # Filter out permutations with leaving atoms - try: - if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()): - permutation = [ - idx_map[idx] for idx in raw_permutation if idx in idx_map - ] - permutations.append(permutation) - except Exception: # noqa: S110, PERF203, BLE001 - pass - serialized_permutations = pickle.dumps(permutations) - mol.SetProp("symmetries", serialized_permutations.hex()) + try: + mol = Chem.RemoveHs(mol) + idx_map = {} + atom_idx = 0 + for i, atom in enumerate(mol.GetAtoms()): + # Skip if leaving atoms + if int(atom.GetProp("leaving_atom")): + continue + idx_map[i] = atom_idx + atom_idx += 1 + + # Calculate self permutations + raw_permutations = mol.GetSubstructMatches(mol, uniquify=False) + for raw_permutation in raw_permutations: + # Filter out permutations with leaving atoms + try: + if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()): + permutation = [ + idx_map[idx] for idx in raw_permutation if idx in idx_map + ] + permutations.append(permutation) + except Exception: # noqa: S110, PERF203, BLE001 + pass + serialized_permutations = pickle.dumps(permutations) + mol.SetProp("symmetries", serialized_permutations.hex()) + except Exception: + mol = None + + if return_mol: + return permutations, mol return permutations @@ -281,6 +291,18 @@ def main(args: argparse.Namespace) -> None: with path.open("wb") as f: pickle.dump(molecules, f) + # Optionally compute and dump symmetries + if not args.with_symmetries: + return + print("Computing symmetries") # noqa: T201 + symmetries = copy.deepcopy(molecules) + for name in tqdm(symmetries): + symmetries[name] = compute_symmetries(symmetries[name], return_mol=True)[1] + # Dump the components + path = outdir / "symmetry.pkl" + with path.open("wb") as f: + pickle.dump(symmetries, f) + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -291,5 +313,10 @@ def main(args: argparse.Namespace) -> None: type=int, default=multiprocessing.cpu_count(), ) + parser.add_argument( + "--with_symmetries", + action="store_true", + help="Whether to compute symmetries to additionally save the symmetry.pkl file.", + ) args = parser.parse_args() main(args) diff --git a/scripts/process/ccd_to_redis.py b/scripts/process/ccd_to_redis.py new file mode 100644 index 000000000..829b9773c --- /dev/null +++ b/scripts/process/ccd_to_redis.py @@ -0,0 +1,85 @@ +import argparse +import pickle + +from rdkit import Chem +from redis import Redis + + +def load_pickled_dict_into_redis( + path_in: str, + host: str = "localhost", + port: int = 7777, + db: int = 0, + batch: int = 10_000, +): + """ + Load a pickled dict of {str -> any_serializable} into Redis. + + Parameters + ---------- + path_in : str + Path to the pickled dict file. + host : str, optional + Redis host, by default "localhost". + port : int, optional + Redis port, by default 7777. + db : int, optional + Redis database number, by default 0. + batch : int, optional + Number of keys to set per pipeline execution, by default 10_000. + """ + # 1) read your dict + with open(path_in, "rb") as f: + data = pickle.load(f) # {str -> any_serializable} + + # 2) connect; keep decode_responses=False so .get() returns bytes + r = Redis(host=host, port=port, db=db, decode_responses=False) + + # 3) write as pickled bytes + # we want all RDKit properties pickled + Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps) + pipe = r.pipeline(transaction=False) + i = 0 + for k, v in data.items(): + if not isinstance(k, str): + raise TypeError(f"Key is not str: {k!r}") + b = pickle.dumps(v, protocol=pickle.HIGHEST_PROTOCOL) + pipe.set(k, b) + i += 1 + if i % batch == 0: + pipe.execute() + pipe.execute() + + # 4) Snapshot to path_out + r.bgsave() + + +if __name__ == "__main__": + """ + Assumes Redis is running locally. + + ./redis-server \ + --save "" --appendonly no --port 7777 \ + --dir /abs/path/to/outdir \ + --dbfilename mydump.rdb + """ + parser = argparse.ArgumentParser() + parser.add_argument("--ccd_in", type=str, help="Input file to convert.") + parser.add_argument( + "--redis-host", + type=str, + default="localhost", + help="The Redis host.", + ) + parser.add_argument( + "--redis-port", + type=int, + default=7777, + help="The Redis port.", + ) + args = parser.parse_args() + load_pickled_dict_into_redis( + path_in=args.ccd_in, + host=args.redis_host, + port=args.redis_port, + )