Inline low level multiplication and reduction functions (#776)

fjarri · web-flow · commit 8fe22cba8196 · 2025-02-22T23:43:47.000+01:00
Fixes a performance regression introduced in #667. Evidently, the compiler relies a lot on knowing the slice sizes at compile time, so I'm inlining `schoolbook_multiplication()`, `schoolbook_squaring()`, and `montgomery_reduction_inner()` so that the compiler can optimize in the case of `Uint`s. Benchmarks: - `wrapping ops/split_mul, U256xU256` - from 26ns to 9ns - `Const Montgomery arithmetic/multiplication, U256*U256` - from 41ns to 21ns - `Dynamic Montgomery arithmetic/multiplication, U256*U256` - from 62ns to 44ns The effect is less pronounced for longer integers, but still amounts to a 5-10% speedup for U4096. On a higher level, this affects many `crypto-primes` benchmarks, e.g. it doubles the speed of the Lucas test for U128. Possible addition: I think the `panic!`s in these functions can be replaced with `debug_assert!`, but I don't insist on it.
diff --git a/src/modular/reduction.rs b/src/modular/reduction.rs
@@ -6,6 +6,7 @@ use crate::{Limb, Odd, Uint};
 use {crate::BoxedUint, subtle::Choice};
 
 /// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>
+#[inline(always)]
 const fn montgomery_reduction_inner(
     upper: &mut [Limb],
     lower: &mut [Limb],
diff --git a/src/uint/mul.rs b/src/uint/mul.rs
@@ -17,6 +17,7 @@ pub(crate) mod karatsuba;
 /// schools.
 ///
 /// The most efficient method for small numbers.
+#[inline(always)]
 const fn schoolbook_multiplication(lhs: &[Limb], rhs: &[Limb], lo: &mut [Limb], hi: &mut [Limb]) {
     if lhs.len() != lo.len() || rhs.len() != hi.len() {
         panic!("schoolbook multiplication length mismatch");
@@ -52,6 +53,7 @@ const fn schoolbook_multiplication(lhs: &[Limb], rhs: &[Limb], lo: &mut [Limb],
 /// Schoolbook method of squaring.
 ///
 /// Like schoolbook multiplication, but only considering half of the multiplication grid.
+#[inline(always)]
 pub(crate) const fn schoolbook_squaring(limbs: &[Limb], lo: &mut [Limb], hi: &mut [Limb]) {
     // Translated from https://github.com/ucbrise/jedi-pairing/blob/c4bf151/include/core/bigint.hpp#L410
     //