Skip to content

Commit 2e780f3

Browse files
logsumexp reuse implementation
1 parent 00adcf2 commit 2e780f3

4 files changed

Lines changed: 177 additions & 77 deletions

File tree

phylo_grad/src/backward.rs

Lines changed: 25 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -141,69 +141,63 @@ pub fn d_param<F: FloatTrait, const DIM: usize>(
141141
(grad_s, grad_sqrt_pi)
142142
}
143143

144-
fn child_input_forward_data<F: FloatTrait, const DIM: usize>(
145-
log_p: na::SVectorView<F, DIM>,
146-
log_transition_T: &na::SMatrix<F, DIM, DIM>,
147-
output: &mut na::SMatrix<F, DIM, DIM>,
148-
) {
149-
/* result = log_p[None, :] + log_transition */
150-
for i in 0..DIM {
151-
for j in 0..DIM {
152-
output[(i, j)] = log_p[j] + log_transition_T[(j, i)];
153-
}
154-
}
155-
}
156-
157144
fn d_broadcast_vjp<F: FloatTrait, const DIM: usize>(
158145
cotangent_vector: na::SMatrixView<F, DIM, DIM>,
159146
) -> na::SVector<F, DIM> {
160147
/* sum(cotangent_vector, dim=1) */
161148
na::SVector::<F, DIM>::from_iterator(cotangent_vector.column_iter().map(|col| col.sum()))
162149
}
163150

151+
/// Main part of the backward pass, where we go back through one Felsenstein step: it takes the cotangent of the parent log_p and calculates the cotangents of the child log_p and the parameters
152+
/// forward_exp_save will be the output cotangent for the log_transition matrix
164153
pub fn d_log_transition_child_input_vjp<F: FloatTrait, const DIM: usize>(
165154
cotangent_vector: na::SVectorView<F, DIM>,
166-
log_p: na::SVectorView<F, DIM>,
167-
forward: &LogTransitionForwardData<F, DIM>,
155+
forward_exp_save: &mut na::SMatrix<F, DIM, DIM>,
156+
forward_sum_save: &mut na::SVector<F, DIM>,
168157
compute_grad_log_p: bool,
169-
output: &mut na::SMatrix<F, DIM, DIM>,
170158
) -> Option<na::SVector<F, DIM>> {
171-
child_input_forward_data(log_p, &forward.log_transition_T, output);
172-
173-
/* d_lse */
174-
for mut row in output.row_iter_mut() {
175-
row.copy_from(&softmax(&row.transpose()).transpose());
159+
160+
let forward_exp_save_data = &mut forward_exp_save.data.0;
161+
162+
// Does the softmax, which is the gradient of the logsumexp
163+
for a in 0..DIM {
164+
for b in 0..DIM {
165+
forward_exp_save_data[a][b] /= forward_sum_save[b];
166+
}
176167
}
177-
diag_times_assign(output.as_view_mut(), cotangent_vector.iter().copied());
168+
169+
forward_exp_save.transpose_mut();
170+
171+
diag_times_assign(forward_exp_save.as_view_mut(), cotangent_vector.iter().copied());
178172

179173
let grad_log_p = if compute_grad_log_p {
180-
Some(d_broadcast_vjp(output.as_view()))
174+
Some(d_broadcast_vjp(forward_exp_save.as_view()))
181175
} else {
182176
None
183177
};
184178

185179
grad_log_p
186180
}
187181

182+
/// forward_exp_save will be the output cotangent for Q
188183
pub fn d_child_input_param<F: FloatTrait, const DIM: usize>(
189184
cotangent_vector: na::SVectorView<F, DIM>,
190185
distance: F,
191186
param: &ParamPrecomp<F, DIM>,
192-
log_p: na::SVectorView<F, DIM>,
193187
forward: &LogTransitionForwardData<F, DIM>,
188+
forward_exp_save: &mut na::SMatrix<F, DIM, DIM>,
189+
forward_sum_save: &mut na::SVector<F, DIM>,
194190
compute_grad_log_p: bool,
195-
output: &mut na::SMatrix<F, DIM, DIM>,
196191
) -> Option<na::SVector<F, DIM>> {
197192
let grad_log_p = d_log_transition_child_input_vjp(
198193
cotangent_vector,
199-
log_p,
200-
forward,
201-
compute_grad_log_p,
202-
output,
194+
forward_exp_save,
195+
forward_sum_save,
196+
compute_grad_log_p
203197
);
204-
d_ln_vjp(output, &forward.matrix_exp);
198+
d_ln_vjp(forward_exp_save, &forward.matrix_exp);
205199

206-
d_expm_vjp(output, distance, param, &forward.exp_t_lambda);
200+
d_expm_vjp(forward_exp_save, distance, param, &forward.exp_t_lambda);
207201

208202
grad_log_p
209203
}

phylo_grad/src/data_types.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ where
2626
fn scalar_exp(self) -> Self;
2727
fn vec_exp<const N: usize>(x: &mut [Self; N]);
2828
fn vec_logsumexp<const N: usize>(x: &[Self; N]) -> Self;
29+
// Saves exp(x - max) into exp_save to avoid recomputation for the softmax in the backward pass
30+
fn vec_logsumexp_save<const N: usize>(x: &[Self; N], exp_save: &mut [Self; N], exp_sum: &mut Self) -> Self;
2931
fn symmetric_eigen<const N: usize>(
3032
matrix: na::SMatrix<Self, N, N>,
3133
) -> Option<(SVector<Self, N>, SMatrix<Self, N, N>)>;
@@ -57,6 +59,41 @@ impl FloatTrait for f32 {
5759
x[i] = x[i].scalar_exp();
5860
}
5961
}
62+
fn vec_logsumexp_save<const N: usize>(x: &[Self; N], exp_save: &mut [Self; N], exp_sum: &mut Self) -> Self {
63+
let blocks = N / 8;
64+
65+
let mut max = simd::f32x8::splat(f32::NEG_INFINITY);
66+
for i in 0..blocks {
67+
let a = simd::f32x8::from_slice(&x[i * 8..]);
68+
max = max.simd_max(a);
69+
}
70+
71+
if N % 8 != 0 {
72+
let last_elements =
73+
simd::f32x8::load_or(&x[blocks * 8..], simd::f32x8::splat(f32::NEG_INFINITY));
74+
max = max.simd_max(last_elements);
75+
}
76+
let max = max.reduce_max();
77+
78+
let mut sum = simd::f32x8::splat(0.0);
79+
for i in 0..blocks {
80+
let a = simd::f32x8::from_slice(&x[i * 8..]);
81+
let b = a - simd::f32x8::splat(max);
82+
let c = sleef::f32x::exp_u10(b);
83+
simd::f32x8::copy_to_slice(c, &mut exp_save[i * 8..]);
84+
sum += c;
85+
}
86+
if N % 8 != 0 {
87+
let last_elements =
88+
simd::f32x8::load_or(&x[blocks * 8..], simd::f32x8::splat(f32::NEG_INFINITY));
89+
let c = sleef::f32x::exp_u10(last_elements - simd::f32x8::splat(max));
90+
simd::f32x8::store_select(c, &mut exp_save[blocks * 8..], std::simd::Mask::splat(true));
91+
sum += c;
92+
}
93+
let sum = sum.reduce_sum();
94+
*exp_sum = sum;
95+
max + (sum).ln()
96+
}
6097
fn vec_logsumexp<const N: usize>(x: &[Self; N]) -> Self {
6198
let blocks = N / 8;
6299

@@ -124,6 +161,41 @@ impl FloatTrait for f64 {
124161
x[i] = x[i].scalar_exp();
125162
}
126163
}
164+
fn vec_logsumexp_save<const N: usize>(x: &[Self; N], exp_save: &mut [Self; N], exp_sum: &mut Self) -> Self {
165+
let blocks = N / 4;
166+
167+
let mut max = simd::f64x4::splat(f64::NEG_INFINITY);
168+
for i in 0..blocks {
169+
let a = simd::f64x4::from_slice(&x[i * 4..]);
170+
max = max.simd_max(a);
171+
}
172+
173+
if N % 4 != 0 {
174+
let last_elements =
175+
simd::f64x4::load_or(&x[blocks * 4..], simd::f64x4::splat(f64::NEG_INFINITY));
176+
max = max.simd_max(last_elements);
177+
}
178+
let max = max.reduce_max();
179+
180+
let mut sum = simd::f64x4::splat(0.0);
181+
for i in 0..blocks {
182+
let a = simd::f64x4::from_slice(&x[i * 4..]);
183+
let b = a - simd::f64x4::splat(max);
184+
let c = sleef::f64x::exp_u10(b);
185+
simd::f64x4::copy_to_slice(c, &mut exp_save[i * 4..]);
186+
sum += c;
187+
}
188+
if N % 4 != 0 {
189+
let last_elements =
190+
simd::f64x4::load_or(&x[blocks * 4..], simd::f64x4::splat(f64::NEG_INFINITY));
191+
let c = sleef::f64x::exp_u10(last_elements - simd::f64x4::splat(max));
192+
simd::f64x4::store_select(c, &mut exp_save[blocks * 4..], std::simd::Mask::splat(true));
193+
sum += c;
194+
}
195+
let sum = sum.reduce_sum();
196+
*exp_sum = sum;
197+
max + (sum).ln()
198+
}
127199
fn vec_logsumexp<const N: usize>(x: &[Self; N]) -> Self {
128200
let blocks = N / 4;
129201

phylo_grad/src/forward.rs

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,32 @@ use crate::data_types::*;
22

33
use nalgebra as na;
44

5+
/// Forward data precomputed before the forward pass
56
pub struct ForwardData<F, const DIM: usize> {
67
pub log_transition: Vec<LogTransitionForwardData<F, DIM>>,
78
}
89

10+
/// Forward data which is saved during the forward pass
11+
pub struct ForwardDataSave<F, const DIM: usize> {
12+
pub logsumexp_exp_save: Vec<na::SMatrix<F, DIM, DIM>>,
13+
pub logsumexp_sum_save: Vec<na::SVector<F, DIM>>,
14+
}
15+
16+
impl<F : FloatTrait, const DIM: usize> ForwardDataSave<F, DIM> {
17+
pub fn new(capacity: usize) -> Self {
18+
Self {
19+
logsumexp_exp_save: vec![
20+
na::SMatrix::<F, DIM, DIM>::zeros();
21+
capacity
22+
],
23+
logsumexp_sum_save: vec![
24+
na::SVector::<F, DIM>::zeros();
25+
capacity
26+
],
27+
}
28+
}
29+
}
30+
931
impl<F, const DIM: usize> ForwardData<F, DIM> {
1032
pub fn with_capacity(capacity: usize) -> Self {
1133
Self {
@@ -152,15 +174,17 @@ pub fn forward_node<F: FloatTrait, const DIM: usize>(
152174
parent: usize,
153175
log_p: &mut [na::SVector<F, DIM>],
154176
forward_data: &ForwardData<F, DIM>,
177+
forward_data_save: &mut ForwardDataSave<F, DIM>,
155178
) {
179+
let logsumexp_exp_save = &mut forward_data_save.logsumexp_exp_save[child].data.0;
180+
let logsumexp_sum_save = forward_data_save.logsumexp_sum_save[child].as_mut_slice();
156181
/* log_p[parent]_a = logsumexp_b(log_p[child](b) + log_transition(rate_matrix, distance)(a, b) ) */
157182
for a in 0..DIM {
158183
let row_a = forward_data.log_transition[child].log_transition_T.column(a);
159184
let tmp = log_p[child] + row_a;
160185
unsafe {
161-
log_p[parent][a] += F::vec_logsumexp(std::mem::transmute::<&[[F; DIM]; 1], &[F; DIM]>(
162-
&tmp.data.0,
163-
))
186+
log_p[parent][a] += F::vec_logsumexp_save(std::mem::transmute::<&[[F; DIM]; 1], &[F; DIM]>(
187+
&tmp.data.0), &mut logsumexp_exp_save[a], &mut logsumexp_sum_save[a]);
164188
}
165189
}
166190
}

0 commit comments

Comments
 (0)