Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,31 @@ There are two main predictions in the affinity output: `affinity_pred_value` and
## Authentication to MSA Server

When using the `--use_msa_server` option with a server that requires authentication, you can provide credentials in one of two ways. More information is available in our [prediction instructions](docs/prediction.md).


## MSA-only generation

Compute MSAs without running any structure prediction:

```bash
boltz msa [OPTIONS] path/to/input.fasta
```

By default this will:

- Use the online MMseqs2 server (`https://api.colabfold.com`)
- Write one CSV per input under `./msa_results/msa`

Available options (e.g. `--out-dir`, `--msa-pairing-strategy`, authentication flags) mirror those in `boltz predict`.
For the full list of flags, run:

```bash
boltz msa --help
```

> **Warning:** Don’t send unpublished or confidential sequences to the public server.
> If you need private or offline operation, point `--msa-server-url` at your own MSA service.

## Evaluation

⚠️ **Coming soon: updated evaluation code for Boltz-2!**
Expand Down
153 changes: 153 additions & 0 deletions src/boltz/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1409,6 +1409,159 @@ def predict( # noqa: C901, PLR0915, PLR0912
return_predictions=False,
)

@cli.command(short_help="Generate MSA CSVs from input (uses online server).")
@click.argument(
    "input_path",
    type=click.Path(exists=True, path_type=Path),
    nargs=-1,
    # A variadic argument is optional by default; without this, running the
    # command with no inputs would download data and then process nothing.
    required=True,
)
@click.option(
    "--out-dir",
    type=click.Path(file_okay=False, dir_okay=True, path_type=Path),
    default=Path("./msa_results"),
    show_default=True,
    help="Output directory for MSA CSVs.",
)
@click.option(
    "--cache",
    type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path),
    default=lambda: Path(get_cache_path()),
    show_default=True,
    help="Path to Boltz cache (used for CCD/mols).",
)
@click.option(
    "--msa-server-url",
    type=str,
    default="https://api.colabfold.com",
    show_default=True,
    help="URL of the MMseqs2 MSA server.",
)
@click.option(
    "--msa-pairing-strategy",
    type=click.Choice(["greedy", "complete"]),
    default="greedy",
    show_default=True,
    help="MSA pairing strategy to use.",
)
@click.option(
    "--max-msa-seqs",
    type=int,
    default=8192,
    show_default=True,
    help="Maximum number of MSA sequences to retain.",
)
@click.option(
    "--preprocessing-threads",
    type=int,
    default=1,
    show_default=True,
    help="Number of threads to use for parallel MSA generation.",
)
@click.option(
    "--model",
    default="boltz2",
    type=click.Choice(["boltz1", "boltz2"]),
    show_default=True,
    help="Model whose input format and CCD data are used for preprocessing.",
)
@click.option(
    "--msa_server_username",
    type=str,
    # `envvar` makes the environment-variable fallback promised by the help
    # text actually work; an explicit flag still takes precedence.
    envvar="BOLTZ_MSA_USERNAME",
    help=(
        "MSA server username for basic auth. Can also be set via "
        "BOLTZ_MSA_USERNAME environment variable."
    ),
    default=None,
)
@click.option(
    "--msa_server_password",
    type=str,
    envvar="BOLTZ_MSA_PASSWORD",
    help=(
        "MSA server password for basic auth. Can also be set via "
        "BOLTZ_MSA_PASSWORD environment variable."
    ),
    default=None,
)
@click.option(
    "--api_key_header",
    type=str,
    help="Custom header key for API key authentication (default: X-API-Key).",
    default=None,
)
@click.option(
    "--api_key_value",
    type=str,
    help="Custom header value for API key authentication.",
    default=None,
)
def msa(
    input_path: tuple[Path, ...],
    out_dir: Path,
    cache: Path,
    msa_server_url: str,
    msa_pairing_strategy: str,
    max_msa_seqs: int,
    preprocessing_threads: int,
    model: Literal["boltz1", "boltz2"],
    msa_server_username: Optional[str],
    msa_server_password: Optional[str],
    api_key_header: Optional[str],
    api_key_value: Optional[str],
) -> None:
    """Generate MSAs from input FASTA or YAML files and save them as CSV.

    This command uses an online MSA server (MMseqs2 at ColabFold by default)
    to compute alignments. It should not be used with confidential or
    unpublished sequences unless --msa-server-url points at a private server.

    The output CSVs can be reused on systems without internet access.
    """
    # NOTE: Click renders the docstring above as the `--help` text, so the
    # parameters are documented through each option's `help=` instead.

    # Reject a cache path that is an existing non-directory *before* calling
    # mkdir, which would otherwise fail first with a less specific
    # FileExistsError and leave this check unreachable.
    if cache.exists() and not cache.is_dir():
        raise NotADirectoryError(
            f"The cache path {cache} exists but is not a directory."
        )
    cache.mkdir(parents=True, exist_ok=True)

    # Expand every positional path into the list of inputs to process.
    all_inputs = []
    for path in input_path:
        all_inputs.extend(check_inputs(path))

    # Download CCD data if needed
    ccd = cache / "ccd.pkl"
    mol_dir = cache / "mols"

    if model == "boltz1":
        if not ccd.exists():
            click.echo(
                f"Downloading the CCD dictionary to {ccd}. You may "
                "change the cache directory with the --cache flag."
            )
            urllib.request.urlretrieve(CCD_URL, str(ccd))  # noqa: S310
    else:
        tar_mols = cache / "mols.tar"
        if not tar_mols.exists():
            click.echo(
                f"Downloading the CCD data to {tar_mols}. "
                "This may take a bit of time. You may change the cache directory "
                "with the --cache flag."
            )
            urllib.request.urlretrieve(MOL_URL, str(tar_mols))  # noqa: S310
        if not mol_dir.exists():
            click.echo(
                f"Extracting the CCD data to {mol_dir}. "
                "This may take a bit of time. You may change the cache directory "
                "with the --cache flag."
            )
            # NOTE(review): extractall trusts the member paths inside the
            # archive; this is only acceptable while MOL_URL is a trusted source.
            with tarfile.open(str(tar_mols), "r") as tar:
                tar.extractall(cache)  # noqa: S202

    # Run preprocessing with the MSA server forced on; this command never
    # proceeds to structure prediction.
    process_inputs(
        data=all_inputs,
        out_dir=out_dir,
        ccd_path=ccd,
        mol_dir=mol_dir,
        use_msa_server=True,
        msa_server_url=msa_server_url,
        msa_pairing_strategy=msa_pairing_strategy,
        max_msa_seqs=max_msa_seqs,
        boltz2=model == "boltz2",
        preprocessing_threads=preprocessing_threads,
        msa_server_username=msa_server_username,
        msa_server_password=msa_server_password,
        api_key_header=api_key_header,
        api_key_value=api_key_value,
    )

    click.echo(f"MSAs written to: {out_dir / 'msa'}")

# Invoke the `cli` entry point only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    cli()