Skip to content

Commit 9be3bd6

Browse files
iqtree benchmark had missing rule in snakemake
1 parent 2f6c414 commit 9be3bd6

4 files changed

Lines changed: 40 additions & 42 deletions

File tree

Readme.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ Just `cargo add phylo_grad` (will work after publishing)
3939
Create and activate a conda environment:
4040

4141
```
42-
conda env create -n phylo_grad -c conda-forge python=3.11 pytorch bioconda::snakemake pytest bioconda::newick_utils
42+
conda env create -n phylo_grad -c conda-forge python=3.11
4343
source activate phylo_grad
4444
```
4545

@@ -50,17 +50,17 @@ pip install ./phylo_grad_py
5050
pip install ./phylo_grad_gpu
5151
```
5252

53-
Run Tests
54-
53+
Install other dependecies:
5554
```
56-
cd benchmark_test
57-
pytest test.py
55+
conda install -c conda-forge pytorch bioconda::snakemake pytest bioconda::newick_utils bioconda::iqtree bioconda::emboss
56+
cargo install phylotree
5857
```
5958

60-
Install phylotree (to generate random trees)
59+
Run Tests
6160

6261
```
63-
cargo install phylotree
62+
cd benchmark_test
63+
pytest test.py
6464
```
6565

6666
Run Benchmarks

benchmark_test/Snakefile

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,22 @@ rule benchmark_iqtree:
5050
slurm_extra="--exclusive",
5151
mem_mb=800000,
5252
runtime=1000
53-
shell: "/usr/bin/time -o {output[1]} -f '%E %P' iqtree3 -s {input.alignment} -nt {threads} --model-joint GTR20+FO --redo -te {input.tree} --show-lh -v -quiet"
53+
shell: "/usr/bin/time -o {output[1]} -f '%E %P' iqtree -s {input.alignment} -nt {threads} --model-joint GTR20+FO --redo -te {input.tree} --show-lh -v -quiet"
54+
55+
rule benchmark_phylograd:
56+
input: tree = "data/random/tree_{num_leafs}.nwk",
57+
alignment = "data/sim/alignment_{num_leafs}_{L}.fasta"
58+
output: "data/sim/alignment_{num_leafs}_{L}.fasta.phylograd.npz",
59+
"data/sim/alignment_{num_leafs}_{L}.fasta.phylograd.time",
60+
threads: 64
61+
resources:
62+
slurm_extra="--exclusive",
63+
mem_mb=800000,
64+
runtime=1000
65+
shell: "/usr/bin/time -o {output[1]} -f '%E %P' python optimize_global_model.py --fasta_amino {input.alignment} --newick {input.tree} --rust --f64 --output {output[0]}"
5466

5567
rule collect_iqtree_benchmark:
56-
input: iqtree = expand("data/sim/alignment_{num_leafs}_{L}.fasta.iqtree.time", num_leafs=[2**n for n in [4,6,8,10,12]], L=[100,500]),
57-
phylograd = expand("data/sim/alignment_{num_leafs}_{L}.fasta.phylograd.time", num_leafs=[2**n for n in [4,6,8,10,12]], L=[100,500])
68+
input: iqtree = expand("data/sim/alignment_{num_leafs}_{L}.fasta.iqtree.time", num_leafs=[2**n for n in [4,6,8]], L=[100,500]),
69+
phylograd = expand("data/sim/alignment_{num_leafs}_{L}.fasta.phylograd.time", num_leafs=[2**n for n in [4,6,8]], L=[100,500])
5870
output: "data/iqtree_benchmark.txt"
5971
shell: "tail -n +1 {input.iqtree} {input.phylograd} > {output}"

benchmark_test/input.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ def read_fasta(fasta_file: str) -> dict:
2828

2929
return seq_dict
3030

31+
def read_fasta_numeric(fasta_file: str) -> dict:
32+
alignment = AlignIO.read(fasta_file, "fasta")
33+
assert isinstance(alignment, MultipleSeqAlignment)
34+
35+
seq_dict = {seq.id: np.array([amino_mapping[c] for c in seq.seq], dtype=np.int32) for seq in alignment}
36+
37+
return seq_dict
38+
3139
def read_newick(newick_file: str) -> dict:
3240
"""
3341
Reads a newick file and returns a parent_list, branch_lengths and number of leaf nodes.

benchmark_test/optimize_global_model.py

Lines changed: 10 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
1-
"""
2-
This script is the benchmark, it supports the phylo_grad gradients and pytorch gradients.
3-
"""
4-
51
import sys
62

73
import torch
84

95
import input
106
import phylo_grad
11-
import felsenstein
127
import numpy as np
138
import argparse
149

@@ -27,8 +22,6 @@
2722
backend = parser.add_argument_group('Backend')
2823
exclusive_group = backend.add_mutually_exclusive_group(required=True)
2924
exclusive_group.add_argument('--rust', action='store_true')
30-
exclusive_group.add_argument('--pytorch', action='store_true')
31-
exclusive_group.add_argument('--pytorch_gpu', action='store_true')
3225

3326
fp_precision = parser.add_argument_group('fp precision')
3427
exclusive_group = fp_precision.add_mutually_exclusive_group(required=True)
@@ -45,42 +38,27 @@
4538
np_dtype = np.float32
4639

4740
if args.fasta_amino is not None:
48-
tree, L = input.read_newick_fasta(args.newick, args.fasta_amino)
41+
alignment = input.read_fasta_numeric(args.fasta_amino)
4942
# Counts amino acids
5043
counts = torch.zeros(21, dtype=torch.int64)
51-
for _, _, seq in tree:
52-
if seq is not None:
53-
numeric = [input.amino_mapping[c] for c in seq]
54-
for i in numeric:
55-
counts[i] += 1
44+
for seq in alignment.values():
45+
L = len(seq)
46+
for i in seq:
47+
counts[i] += 1
5648

5749
initial_energies = torch.log(counts[:-1])
58-
59-
if args.pytorch_gpu:
60-
tree = [(par, dist, input.amino_to_embedding(seq).cuda() if seq else None) for par, dist, seq in tree]
61-
else:
62-
tree = [(par, dist, input.amino_to_embedding(seq)) for par, dist, seq in tree]
63-
6450

6551

6652

6753
#Init random parameters
6854
torch.manual_seed(0)
6955

70-
if args.pytorch_gpu:
71-
shared = torch.zeros(190, requires_grad=True, dtype=dtype, device="cuda")
72-
energies = torch.tensor(initial_energies, requires_grad=True, dtype=dtype, device="cuda")
73-
else:
74-
shared = torch.zeros(190, requires_grad=True, dtype=dtype)
75-
energies = torch.tensor(initial_energies, requires_grad=True, dtype=dtype)
56+
shared = torch.zeros(190, requires_grad=True, dtype=dtype)
57+
energies = initial_energies.clone().to( dtype=dtype).requires_grad_(True)
7658

7759
if args.rust:
78-
leaf_log_p = torch.stack([seq for _,_, seq in tree if seq is not None]).transpose(1,0)
79-
tree = np.array([(par, dist) for par, dist, _ in tree], dtype=np_dtype)
80-
tree = phylo_grad.FelsensteinTree(tree, leaf_log_p.type(dtype).numpy(), 1e-4)
81-
82-
else:
83-
tree = felsenstein.FelsensteinTree(tree)
60+
leaf_log_p = input.read_fasta(args.fasta_amino)
61+
tree = phylo_grad.FelsensteinTree.from_newick(args.newick, leaf_log_p, np_dtype, 1e-4, gpu = False)
8462

8563
optimizer = torch.optim.Adam([shared, energies], lr=0.1)
8664

@@ -133,7 +111,7 @@ def rate_matrix(shared, energies, L):
133111

134112
S, sqrt_pi = rate_matrix(shared, energies, L)
135113

136-
np.savez(args.output, S=S.detach().cpu().numpy()[0], sqrt_pi=sqrt_pi.detach().cpu().numpy()[0], shared=shared.detach().cpu().numpy(), energies=energies.detach().cpu().numpy())
114+
np.savez(args.output, S=S.detach().numpy()[0], sqrt_pi=sqrt_pi.detach().numpy()[0], shared=shared.detach().numpy(), energies=energies.detach().numpy())
137115

138116
# Print peak memory usage
139117
import resource

0 commit comments

Comments
 (0)