-
Notifications
You must be signed in to change notification settings - Fork 84
fix: CCD/SMILES inconsistency #95
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
e645c37
cafc71b
bbbfe6a
c9fb62e
5a6393c
3bc3f92
04b1fe5
85f6ec0
3a21a38
2be65b8
30bca1b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| # Copyright 2025 AlQuraishi Laboratory | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """S3 utilities for checksum comparison without downloading.""" | ||
|
|
||
| import base64 | ||
| from pathlib import Path | ||
|
|
||
| import boto3 | ||
| from awscrt import checksums | ||
|
|
||
|
|
||
| def get_s3_checksum(bucket: str, key: str) -> str | None: | ||
| """Get CRC64NVME checksum from S3 object metadata (HEAD request, no download).""" | ||
| s3 = boto3.client("s3") | ||
| response = s3.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED") | ||
|
|
||
| if "ChecksumCRC64NVME" in response: | ||
| return response["ChecksumCRC64NVME"] | ||
| return None | ||
|
|
||
|
|
||
| def compute_local_crc64nvme_base64(filepath: Path) -> str: | ||
| """Compute CRC64NVME of local file, return as base64 (S3 format).""" | ||
| crc = 0 | ||
| with open(filepath, "rb") as f: | ||
| for chunk in iter(lambda: f.read(8192), b""): | ||
| crc = checksums.crc64nvme(chunk, crc) | ||
| # Convert to bytes (big-endian) and base64 encode | ||
| crc_bytes = crc.to_bytes(8, byteorder="big") | ||
| return base64.b64encode(crc_bytes).decode() | ||
|
|
||
|
|
||
| def s3_file_matches_local(local_path: Path, bucket: str, key: str) -> bool: | ||
| """ | ||
| Compare local file with S3 object using CRC64NVME checksum. | ||
|
|
||
| Returns True if files match, False if they differ or comparison fails. | ||
| """ | ||
| if not local_path.exists(): | ||
| return False | ||
|
|
||
| s3_checksum = get_s3_checksum(bucket, key) | ||
| if s3_checksum is None: | ||
| # S3 object has no checksum, cannot compare | ||
| return False | ||
|
|
||
| local_checksum = compute_local_crc64nvme_base64(local_path) | ||
| return local_checksum == s3_checksum |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| import pytest | ||
|
|
||
| from openfold3.core.data.primitives.structure.query import ( | ||
| structure_with_ref_mol_from_ccd_code, | ||
| structure_with_ref_mol_from_smiles, | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "smiles, ccd_code", | ||
| [ | ||
| # Simple test cases | ||
| ("CCO", "EOH"), # Ethanol | ||
| # Pat Walter's CYP substrates | ||
| ("CC(C)(C)C(=O)Nc1cc(ccc1n2ccnc2)C(F)(F)F", "A1ASV"), # cyp3a4_9bv5 | ||
| ("Cc1nc(cs1)c2ccc(cc2)n3c(cnn3)c4ccc(cc4)OC", "A1ASU"), # cyp3a4_9bv6 | ||
| ("c1ccc(c(c1)CCC(=O)NC[C@@H]2Cc3cccc(c3O2)c4cccnc4)Cl", "A1AST"), # cyp3a4_9bv7 | ||
| ("c1cc(c(cc1C(F)(F)F)NC(=O)C2CCC2)n3ccnc3", "A1ASS"), # cyp3a4_9bv8 | ||
| ("c1ccc(c(c1)NC(=O)Nc2cc(ccc2n3ccnc3)C(F)(F)F)Cl", "A1ASR"), # cyp3a4_9bv9 | ||
| ("c1ccc(c(c1)CCCl)NC(=O)Nc2cc(ccc2n3ccnc3)C(F)(F)F", "A1ASQ"), # cyp3a4_9bva | ||
| ("c1ccc(c(c1)CC(=O)Nc2cc(ccc2n3ccnc3)C(F)(F)F)Cl", "A1ASP"), # cyp3a4_9bvb | ||
| ( | ||
| "c1ccc(c(c1)N(Cc2cccc(c2)O)C(=O)Nc3cc(ccc3n4ccnc4)C(F)(F)F)Cl", | ||
| "A1ASO", | ||
| ), # cyp3a4_9bvc | ||
| ( | ||
| "c1ccc(c(c1)NC(=O)N(Cc2cccc(c2)O)c3cc(ccc3n4ccnc4)C(F)(F)F)Cl", | ||
| "A1BNX", | ||
| ), # cyp3a4_9ms1 | ||
| ( | ||
| "c1ccc(cc1)C(c2ccccc2)([C@@H]3CCN(C3)CCc4ccc5c(c4)CCO5)C(=O)N", | ||
| "A1CIW", | ||
| ), # cyp3a4_9plk | ||
| ], | ||
| ) | ||
| def test_consistent_structure_from_smiles_and_ccd_code(smiles, ccd_code): | ||
| struct_from_smiles = structure_with_ref_mol_from_smiles(smiles, chain_id="X") | ||
| struct_from_ccd = structure_with_ref_mol_from_ccd_code(ccd_code, chain_id="X") | ||
|
|
||
| assert len(struct_from_smiles.atom_array) == len(struct_from_ccd.atom_array) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some quick suggestions to make this test more comprehensive:
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, it'd be really nice indeed if we could use that – but it currently doesn't work, just tried empirically (atm atom names different between smiles and CCD, eg C01 vs C1) – maybe we can just match on element count?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah figures. I think the atom counter is good enough here for now, thanks for adding it. |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if the pointer of this link is super helpful, this is just a function to work with Biotite's CCD representation in code. Why not just add
python -m biotite.setup_ccdif the intent is to show users how to download and set it up?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could surface this as work-around but that's really our abstraction leaking. One day we might remove biotite, and the users should be none the wiser about it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My intention was to provide users who might be unfamiliar with the CCD some context about what it is since it is now included in our top level Installation documentation and how biotite is used in relation to the CCD.
We can certainly replace this link with a more general pointer to the CCD (perhaps this one: https://www.wwpdb.org/data/ccd) if that would be more helpful. This might make more sense if we now plan to provide the CCD file ourselves