jwohlwend · chmnk · Nov 11, 2025 · Nov 11, 2025 · Nov 11, 2025 · Nov 11, 2025
diff --git a/docs/training.md b/docs/training.md
@@ -146,11 +146,22 @@ Unless you wish to do it again yourself, you can skip to the next step! If you d
 
 ```bash
 wget https://files.wwpdb.org/pub/pdb/data/monomers/components.cif
-python ccd.py --components components.cif --outdir ./ccd
+python ccd.py --components components.cif --outdir ./ccd --with_symmetries
 ```
 
 > Note: runs in parallel by default with as many threads as cpu cores on your machine, can be changed with `--num_processes`
 
+Later steps will use redis for memory-efficient parallel access to the pickled CCD dictionary. 
+To create a redis database from the pickled CCD file, run:
+```bash
+# in a separate terminal
+./redis-server --save "" --appendonly no --port 7777 --dbfilename ccd.rdb
+```
+And then to update the `ccd.rdb` file with the pickled data:
+```bash
+python3 ccd_to_redis.py --ccd_in ccd/symmetry.pkl
+```
+
 #### Step 4: Create sequence clusters
 
 First, you must create a fasta file containing all the polymer sequences present in your data. You can use any header format you want for the sequences, it will not be used.

diff --git a/scripts/process/ccd.py b/scripts/process/ccd.py
@@ -1,6 +1,7 @@
 """Compute conformers and symmetries for all the CCD molecules."""
 
 import argparse
+import copy
 import multiprocessing
 import pickle
 import sys
@@ -13,7 +14,7 @@
 from pdbeccdutils.core import ccd_reader
 from pdbeccdutils.core.component import ConformerType
 from rdkit import rdBase
-from rdkit.Chem import AllChem
+from rdkit import Chem
 from rdkit.Chem.rdchem import Conformer, Mol
 from tqdm import tqdm
 
@@ -124,45 +125,54 @@ def get_conformer(mol: Mol, c_type: ConformerType) -> Conformer:
     raise ValueError(msg)
 
 
-def compute_symmetries(mol: Mol) -> list[list[int]]:
+def compute_symmetries(mol: Mol, return_mol: bool = False) -> list[list[int]] | tuple[list[list[int]], Mol | None]:
     """Compute the symmetries of a molecule.
 
     Parameters
     ----------
     mol : Mol
         The molecule to process
+    return_mol : bool, optional
+        Whether to return the molecule with symmetries set as a prop, by default False
 
     Returns
     -------
     list[list[int]]
         The symmetries as a list of index permutations
-
+    Mol | None
+        The modified molecule, if return_mol is True
     """
-    mol = AllChem.RemoveHs(mol)
-    idx_map = {}
-    atom_idx = 0
-    for i, atom in enumerate(mol.GetAtoms()):
-        # Skip if leaving atoms
-        if int(atom.GetProp("leaving_atom")):
-            continue
-        idx_map[i] = atom_idx
-        atom_idx += 1
-
-    # Calculate self permutations
     permutations = []
-    raw_permutations = mol.GetSubstructMatches(mol, uniquify=False)
-    for raw_permutation in raw_permutations:
-        # Filter out permutations with leaving atoms
-        try:
-            if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()):
-                permutation = [
-                    idx_map[idx] for idx in raw_permutation if idx in idx_map
-                ]
-                permutations.append(permutation)
-        except Exception:  # noqa: S110, PERF203, BLE001
-            pass
-    serialized_permutations = pickle.dumps(permutations)
-    mol.SetProp("symmetries", serialized_permutations.hex())
+    try:
+        mol = Chem.RemoveHs(mol)
+        idx_map = {}
+        atom_idx = 0
+        for i, atom in enumerate(mol.GetAtoms()):
+            # Skip if leaving atoms
+            if int(atom.GetProp("leaving_atom")):
+                continue
+            idx_map[i] = atom_idx
+            atom_idx += 1
+
+        # Calculate self permutations
+        raw_permutations = mol.GetSubstructMatches(mol, uniquify=False)
+        for raw_permutation in raw_permutations:
+            # Filter out permutations with leaving atoms
+            try:
+                if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()):
+                    permutation = [
+                        idx_map[idx] for idx in raw_permutation if idx in idx_map
+                    ]
+                    permutations.append(permutation)
+            except Exception:  # noqa: S110, PERF203, BLE001
+                pass
+        serialized_permutations = pickle.dumps(permutations)
+        mol.SetProp("symmetries", serialized_permutations.hex())
+    except Exception:
+        mol = None
+
+    if return_mol:
+        return permutations, mol
     return permutations
 
 
@@ -281,6 +291,18 @@ def main(args: argparse.Namespace) -> None:
     with path.open("wb") as f:
         pickle.dump(molecules, f)
 
+    # Optionally compute and dump symmetries
+    if not args.with_symmetries:
+        return
+    print("Computing symmetries")  # noqa: T201
+    symmetries = copy.deepcopy(molecules)
+    for name in tqdm(symmetries):
+        symmetries[name] = compute_symmetries(symmetries[name], return_mol=True)[1]
+    # Dump the components
+    path = outdir / "symmetry.pkl"
+    with path.open("wb") as f:
+        pickle.dump(symmetries, f)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -291,5 +313,10 @@ def main(args: argparse.Namespace) -> None:
         type=int,
         default=multiprocessing.cpu_count(),
     )
+    parser.add_argument(
+        "--with_symmetries",
+        action="store_true",
+        help="Whether to compute symmetries to additionally save the symmetry.pkl file.",
+    )
     args = parser.parse_args()
     main(args)
diff --git a/scripts/process/ccd_to_redis.py b/scripts/process/ccd_to_redis.py
@@ -0,0 +1,85 @@
+import argparse
+import pickle
+
+from rdkit import Chem
+from redis import Redis
+
+
+def load_pickled_dict_into_redis(
+    path_in: str,
+    host: str = "localhost",
+    port: int = 7777,
+    db: int = 0,
+    batch: int = 10_000,
+):
+    """
+    Load a pickled dict of {str -> any_serializable} into Redis.
+
+    Parameters
+    ----------
+    path_in : str
+        Path to the pickled dict file.
+    host : str, optional
+        Redis host, by default "localhost".
+    port : int, optional
+        Redis port, by default 7777.
+    db : int, optional
+        Redis database number, by default 0.
+    batch : int, optional
+        Number of keys to set per pipeline execution, by default 10_000.
+    """
+    # 1) read your dict
+    with open(path_in, "rb") as f:
+        data = pickle.load(f)  # {str -> any_serializable}
+
+    # 2) connect; keep decode_responses=False so .get() returns bytes
+    r = Redis(host=host, port=port, db=db, decode_responses=False)
+
+    # 3) write as pickled bytes
+    # we want all RDKit properties pickled
+    Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
+    pipe = r.pipeline(transaction=False)
+    i = 0
+    for k, v in data.items():
+        if not isinstance(k, str):
+            raise TypeError(f"Key is not str: {k!r}")
+        b = pickle.dumps(v, protocol=pickle.HIGHEST_PROTOCOL)
+        pipe.set(k, b)
+        i += 1
+        if i % batch == 0:
+            pipe.execute()
+    pipe.execute()
+
+    # 4) Snapshot to path_out
+    r.bgsave()
+
+
+if __name__ == "__main__":
+    """
+    Assumes Redis is running locally.
+
+    ./redis-server \
+      --save "" --appendonly no --port 7777 \
+      --dir /abs/path/to/outdir \
+      --dbfilename mydump.rdb
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ccd_in", type=str, help="Input file to convert.")
+    parser.add_argument(
+        "--redis-host",
+        type=str,
+        default="localhost",
+        help="The Redis host.",
+    )
+    parser.add_argument(
+        "--redis-port",
+        type=int,
+        default=7777,
+        help="The Redis port.",
+    )
+    args = parser.parse_args()
+    load_pickled_dict_into_redis(
+        path_in=args.ccd_in,
+        host=args.redis_host,
+        port=args.redis_port,
+    )