Skip to content
1 change: 1 addition & 0 deletions docs/source/Installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ This script will:
- Set up a directory for OpenFold3 model parameters [default: `~/.openfold3`]
- Write the path to `$OPENFOLD_CACHE/ckpt_path`
- Download the model parameters, if the parameter file does not already exist
- Download and set up the [Chemical Component Dictionary (CCD)](https://www.wwpdb.org/data/ccd) with [Biotite](https://www.biotite-python.org/latest/apidoc/biotite.structure.info.get_ccd.html)
- Optionally run an inference integration test on two samples, without MSA alignments (~5 min on A100)
- N.B. To run the integration tests, `pytest` must be installed.

Expand Down
60 changes: 60 additions & 0 deletions openfold3/core/utils/s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright 2025 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""S3 utilities for checksum comparison without downloading."""

import base64
from pathlib import Path

import boto3
from awscrt import checksums


def get_s3_checksum(bucket: str, key: str) -> str | None:
    """Fetch the CRC64NVME checksum S3 reports for an object.

    Only a HEAD request (with checksum mode enabled) is issued, so the object
    body is never transferred.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object inside the bucket.

    Returns:
        The ``ChecksumCRC64NVME`` field of the HEAD response, or ``None`` when
        the response does not contain that field.
    """
    client = boto3.client("s3")
    head = client.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED")
    # ``.get`` yields None for a missing field, same as the explicit check.
    return head.get("ChecksumCRC64NVME")


def compute_local_crc64nvme_base64(filepath: Path) -> str:
    """Return the CRC64NVME of a local file, base64-encoded as S3 reports it.

    The file is streamed in 8 KiB pieces so it is never held in memory whole;
    each piece extends the running CRC of the pieces before it.

    Args:
        filepath: Path of the file to checksum.

    Returns:
        Base64 text of the 8-byte big-endian CRC value.
    """
    running_crc = 0
    with open(filepath, "rb") as stream:
        while block := stream.read(8192):
            running_crc = checksums.crc64nvme(block, running_crc)
    # Serialise the integer as 8 big-endian bytes before base64-encoding.
    digest = running_crc.to_bytes(8, byteorder="big")
    return base64.b64encode(digest).decode()


def s3_file_matches_local(local_path: Path, bucket: str, key: str) -> bool:
    """Report whether a local file matches an S3 object, by CRC64NVME checksum.

    The S3 side is read with a HEAD request only; the local side is checksummed
    by streaming the file.

    Args:
        local_path: Path of the local file to compare.
        bucket: Name of the S3 bucket.
        key: Key of the object inside the bucket.

    Returns:
        True only when both checksums are available and equal. False when the
        local file does not exist, when the S3 request fails (missing
        credentials, access denied, missing object, network error, ...), when
        the S3 object carries no CRC64NVME checksum, or when the checksums
        differ.

    Raises:
        OSError: If the local file exists but cannot be read.
    """
    if not local_path.exists():
        return False

    # Imported here so the missing-file fast path above needs no AWS machinery.
    import logging

    from botocore.exceptions import BotoCoreError, ClientError

    try:
        s3_checksum = get_s3_checksum(bucket, key)
    except (BotoCoreError, ClientError) as err:
        # Honour the documented contract: a failed comparison is "no match",
        # not a crash. Callers then fall back to refreshing the file.
        logging.getLogger(__name__).warning(
            "Could not read checksum of s3://%s/%s (%s); treating %s as not matching.",
            bucket,
            key,
            err,
            local_path,
        )
        return False

    if s3_checksum is None:
        # S3 object has no checksum, cannot compare
        return False

    return compute_local_crc64nvme_base64(local_path) == s3_checksum
93 changes: 93 additions & 0 deletions openfold3/scripts/update_ccd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
# Copyright 2025 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Script to update the Chemical Component Dictionary (CCD).

Downloads the latest CCD from WWPDB, processes it to BinaryCIF format,
and uploads to S3.
"""

import argparse
import logging
from pathlib import Path

import biotite.setup_ccd
import boto3

from openfold3.core.utils.s3 import compute_local_crc64nvme_base64
from openfold3.setup_openfold import S3_BUCKET, S3_KEY

# Configure root logging to emit bare messages at INFO level, then create this
# module's logger.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def upload_to_s3(local_path: Path, bucket: str, key: str) -> None:
    """Upload a local file to S3, attaching its precomputed CRC64NVME checksum.

    The checksum is computed locally first and passed along with the upload
    request.

    Args:
        local_path: File to upload.
        bucket: Destination S3 bucket name.
        key: Destination object key.
    """
    logger.info(f"Uploading {local_path} to s3://{bucket}/{key}...")

    checksum = compute_local_crc64nvme_base64(local_path)
    logger.info(f"Local file checksum (CRC64NVME): {checksum}")

    # Everything except the body is known up front; gather it in one place.
    request = {
        "Bucket": bucket,
        "Key": key,
        "ChecksumAlgorithm": "CRC64NVME",
        "ChecksumCRC64NVME": checksum,
    }
    client = boto3.client("s3")
    with open(local_path, "rb") as payload:
        client.put_object(Body=payload, **request)

    logger.info(f"Upload complete: s3://{bucket}/{key}")


def main() -> None:
    """Command-line entry point: rebuild the CCD and optionally publish it.

    Parses ``--bucket``, ``--key`` and ``--skip-upload``, runs Biotite's CCD
    setup, and uploads the resulting file to S3 unless ``--skip-upload`` was
    given.
    """
    cli = argparse.ArgumentParser(
        description="Update CCD: download from WWPDB, process, and upload to S3"
    )
    cli.add_argument(
        "--bucket",
        type=str,
        default=S3_BUCKET,
        help=f"S3 bucket name (default: {S3_BUCKET})",
    )
    cli.add_argument(
        "--key",
        type=str,
        default=S3_KEY,
        help=f"S3 object key (default: {S3_KEY})",
    )
    cli.add_argument(
        "--skip-upload",
        action="store_true",
        help="Skip S3 upload (useful for local testing)",
    )
    args = cli.parse_args()

    # Download and process CCD using biotite's setup_ccd
    logger.info("Downloading and processing CCD from WWPDB...")
    biotite.setup_ccd.main()
    output_path = biotite.setup_ccd.OUTPUT_CCD
    logger.info(f"CCD processed and saved to: {output_path}")

    if args.skip_upload:
        logger.info("Skipping S3 upload (--skip-upload flag set)")
        return

    upload_to_s3(output_path, args.bucket, args.key)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
33 changes: 32 additions & 1 deletion openfold3/setup_openfold.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@
import sys
from pathlib import Path

import biotite.setup_ccd

from openfold3.core.utils.s3 import s3_file_matches_local

# S3 location of the processed CCD file that the local Biotite CCD is compared
# against during setup. scripts/update_ccd.py imports these as the default
# upload target.
S3_BUCKET = "openfold3-data"
S3_KEY = "components.bcif"


# Configure root logging to emit bare messages at INFO level.
# NOTE(review): this runs at import time, and tests/conftest.py imports this
# module, so test sessions pick up this logging configuration too.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -172,6 +180,26 @@ def download_parameters(param_dir) -> None:
logger.info("Download completed successfully.")


def setup_biotite_ccd(*, force_download: bool) -> None:
    """Ensure Biotite's local CCD file is present and current.

    The file at ``biotite.setup_ccd.OUTPUT_CCD`` is considered stale when it is
    missing or when its checksum does not match the object at
    ``s3://S3_BUCKET/S3_KEY``. A stale file (or ``force_download=True``)
    triggers ``biotite.setup_ccd.main()``; otherwise nothing is done.

    Args:
        force_download: Rebuild the CCD file even if the local copy appears
            up to date.
    """

    def ccd_is_stale(*, ccd_path: Path) -> bool:
        # A missing file is stale by definition; answer without contacting S3.
        if not ccd_path.exists():
            return True
        return not s3_file_matches_local(ccd_path, S3_BUCKET, S3_KEY)

    ccd_path = biotite.setup_ccd.OUTPUT_CCD

    logger.info("Starting Biotite CCD setup...")
    if force_download or ccd_is_stale(ccd_path=ccd_path):
        # NOTE(review): the S3 object is only used for the staleness check.
        # biotite.setup_ccd.main() rebuilds the file itself (from WWPDB, per
        # scripts/update_ccd.py) and never fetches s3://S3_BUCKET/S3_KEY. The
        # rebuilt file may therefore still differ from the S3 copy and be
        # rebuilt again on the next run. Confirm whether a direct S3 download
        # was intended here.
        logger.info(
            f"Biotite CCD file at {ccd_path} is missing, differs from "
            f"s3://{S3_BUCKET}/{S3_KEY}, or a refresh was forced; "
            "rebuilding it with biotite.setup_ccd..."
        )
        biotite.setup_ccd.main()
    else:
        logger.info(
            f"Biotite CCD file at {ccd_path} is up-to-date with "
            f"s3://{S3_BUCKET}/{S3_KEY}, skipping."
        )


def run_integration_tests() -> None:
"""Run integration tests."""
confirm = input("Run integration tests? (yes/no)")
Expand Down Expand Up @@ -226,7 +254,10 @@ def main():
if should_download:
download_parameters(param_dir)

# Step 5: Run tests (always run regardless of download status)
# Step 5: Setup CCD with biotite
setup_biotite_ccd(force_download=False)

# Step 6: Run tests (always run regardless of download status)
run_integration_tests()


Expand Down
8 changes: 8 additions & 0 deletions openfold3/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pytest
from biotite.structure import AtomArray

from openfold3.setup_openfold import setup_biotite_ccd


@pytest.fixture
def dummy_atom_array():
Expand Down Expand Up @@ -49,3 +51,9 @@ def mse_ala_atom_array():
atom_array.hetero[8:] = False

return atom_array


@pytest.fixture(scope="session", autouse=True)
def ensure_biotite_ccd() -> None:
    """Ensure Biotite's CCD file is set up before any test runs.

    Session-scoped and autouse, so it executes once per test session without
    being requested by individual tests. ``force_download=False`` lets
    ``setup_biotite_ccd`` skip the work when its staleness check passes.
    """
    setup_biotite_ccd(force_download=False)
50 changes: 50 additions & 0 deletions openfold3/tests/core/data/primitives/structure/test_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from collections import Counter

import pytest

from openfold3.core.data.primitives.structure.query import (
structure_with_ref_mol_from_ccd_code,
structure_with_ref_mol_from_smiles,
)


@pytest.mark.parametrize(
    "smiles, ccd_code",
    [
        # Simple test cases
        ("CCO", "EOH"),  # Ethanol
        # Pat Walter's CYP substrates
        ("CC(C)(C)C(=O)Nc1cc(ccc1n2ccnc2)C(F)(F)F", "A1ASV"),  # cyp3a4_9bv5
        ("Cc1nc(cs1)c2ccc(cc2)n3c(cnn3)c4ccc(cc4)OC", "A1ASU"),  # cyp3a4_9bv6
        ("c1ccc(c(c1)CCC(=O)NC[C@@H]2Cc3cccc(c3O2)c4cccnc4)Cl", "A1AST"),  # cyp3a4_9bv7
        ("c1cc(c(cc1C(F)(F)F)NC(=O)C2CCC2)n3ccnc3", "A1ASS"),  # cyp3a4_9bv8
        ("c1ccc(c(c1)NC(=O)Nc2cc(ccc2n3ccnc3)C(F)(F)F)Cl", "A1ASR"),  # cyp3a4_9bv9
        ("c1ccc(c(c1)CCCl)NC(=O)Nc2cc(ccc2n3ccnc3)C(F)(F)F", "A1ASQ"),  # cyp3a4_9bva
        ("c1ccc(c(c1)CC(=O)Nc2cc(ccc2n3ccnc3)C(F)(F)F)Cl", "A1ASP"),  # cyp3a4_9bvb
        (
            "c1ccc(c(c1)N(Cc2cccc(c2)O)C(=O)Nc3cc(ccc3n4ccnc4)C(F)(F)F)Cl",
            "A1ASO",
        ),  # cyp3a4_9bvc
        (
            "c1ccc(c(c1)NC(=O)N(Cc2cccc(c2)O)c3cc(ccc3n4ccnc4)C(F)(F)F)Cl",
            "A1BNX",
        ),  # cyp3a4_9ms1
        (
            "c1ccc(cc1)C(c2ccccc2)([C@@H]3CCN(C3)CCc4ccc5c(c4)CCO5)C(=O)N",
            "A1CIW",
        ),  # cyp3a4_9plk
    ],
)
def test_consistent_structure_from_smiles_and_ccd_code(smiles, ccd_code):
    """Check that a ligand built from SMILES agrees with the one built from its CCD code.

    Each parametrized pair gives a SMILES string and the CCD code of the same
    molecule. Only aggregate properties of the two atom arrays are compared.
    """
    struct_from_smiles = structure_with_ref_mol_from_smiles(smiles, chain_id="X")
    struct_from_ccd = structure_with_ref_mol_from_ccd_code(ccd_code, chain_id="X")

    # A full atom-array equality check does not work yet: atom names currently
    # differ between the SMILES- and CCD-derived structures (e.g. C01 vs C1).
    # Ideally, one day, we'll be able to do just this
    # from openfold3.tests import custom_assert_utils
    # custom_assert_utils.assert_atomarray_equal(struct_from_smiles.atom_array, struct_from_ccd.atom_array)

    assert len(struct_from_smiles.atom_array) == len(struct_from_ccd.atom_array)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some quick suggestions to make this test more comprehensive:

  • Would it make sense to use the custom_assert_utils.assert_atom_array_equal utility here? Or would we expect some differences in atom ordering / bond ordering so that this assert would fail.

  • If assert_atom_array_equal is not a good fit here, perhaps we could add a test to check the number of carbons or some other aggregate property.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, it'd be really nice indeed if we could use that – but it currently doesn't work, just tried empirically (atm atom names different between smiles and CCD, eg C01 vs C1) – maybe we can just match on element count?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah figures. I think the atom counter is good enough here for now, thanks for adding it.


assert Counter(struct_from_smiles.atom_array.element) == Counter(
struct_from_ccd.atom_array.element
)
Loading