diff --git a/blake2/Cargo.toml b/blake2/Cargo.toml index b7bcc7ad1..86df95367 100644 --- a/blake2/Cargo.toml +++ b/blake2/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "blake2" -version = "0.8.0" +version = "0.8.2" authors = ["RustCrypto Developers"] license = "MIT OR Apache-2.0" description = "BLAKE2 hash functions" @@ -26,6 +26,7 @@ std = ["digest/std", "crypto-mac/std"] simd = [] simd_opt = ["simd"] simd_asm = ["simd_opt"] +coresimd = [] [badges] travis-ci = { repository = "RustCrypto/hashes" } diff --git a/blake2/benches/blake2bp.rs b/blake2/benches/blake2bp.rs new file mode 100644 index 000000000..cd92a8854 --- /dev/null +++ b/blake2/benches/blake2bp.rs @@ -0,0 +1,7 @@ +#![no_std] +#![feature(test)] +#[macro_use] +extern crate digest; +extern crate blake2; + +bench!(blake2::Blake2bp); diff --git a/blake2/benches/blake2sp.rs b/blake2/benches/blake2sp.rs new file mode 100644 index 000000000..ab1eb153d --- /dev/null +++ b/blake2/benches/blake2sp.rs @@ -0,0 +1,7 @@ +#![no_std] +#![feature(test)] +#[macro_use] +extern crate digest; +extern crate blake2; + +bench!(blake2::Blake2sp); diff --git a/blake2/src/as_bytes.rs b/blake2/src/as_bytes.rs index ae1fcfcef..7a1284e3e 100644 --- a/blake2/src/as_bytes.rs +++ b/blake2/src/as_bytes.rs @@ -41,3 +41,5 @@ unsafe impl Safe for i8 {} unsafe impl Safe for i16 {} unsafe impl Safe for i32 {} unsafe impl Safe for i64 {} +unsafe impl Safe for [T; 8] {} +unsafe impl Safe for [T; 16] {} diff --git a/blake2/src/blake2.rs b/blake2/src/blake2.rs index 47e85bd82..9b57b9edb 100644 --- a/blake2/src/blake2.rs +++ b/blake2/src/blake2.rs @@ -1,34 +1,96 @@ -macro_rules! blake2_impl { +macro_rules! blake2_compressor_impl { ( - $state:ident, $fix_state:ident, $word:ident, $vec:ident, $bytes:ident, + $compressor:ident, $builder:ident, $word:ident, $vec:ident, $bytes:ident, $R1:expr, $R2:expr, $R3:expr, $R4:expr, $IV:expr, - $vardoc:expr, $doc:expr, + $XofLen:ident, $reserved_len:expr, $salt_len:expr, ) => { use $crate::as_bytes::AsBytes; + #[allow(unused_imports)] use $crate::simd::{Vector4, $vec}; - use digest::{Input, BlockInput, FixedOutput, VariableOutput, Reset}; - use digest::InvalidOutputSize; + use byte_tools::copy; + use core::{mem, u8, u32}; use digest::generic_array::GenericArray; use digest::generic_array::typenum::Unsigned; - use core::cmp; - use byte_tools::{copy, zero}; - use crypto_mac::{Mac, MacResult, InvalidKeyLength}; - type Output = GenericArray; + #[derive(Clone, Copy)] + #[repr(packed)] + #[allow(unused)] + pub struct $builder { + digest_len: u8, + key_len: u8, + fanout: u8, + depth: u8, + leaf_len: u32, + node_offs: u32, + xof_len: $XofLen, + node_depth: u8, + inner_len: u8, + reserved: [u8; $reserved_len], + salt: [u8; $salt_len], + personal: [u8; $salt_len], + } - #[derive(Clone)] - #[doc=$vardoc] - pub struct $state { - m: [$word; 16], - h: [$vec; 2], - t: u64, - n: usize, + impl $builder { + pub fn new() -> Self { + Self { + digest_len: 0, + key_len: 0, + fanout: 1, + depth: 1, + leaf_len: 0, + node_offs: 0, + xof_len: 0, + node_depth: 0, + inner_len: 0, + reserved: Default::default(), + salt: Default::default(), + personal: Default::default(), + } + } - h0: [$vec; 2], - m0: [$word; 16], - t0: u64, + pub fn out(&mut self, out: usize) { + assert!(out <= usize::from(u8::MAX)); + self.digest_len = out as u8; + } + + pub fn key(&mut self, kk: usize) { + assert!(kk as usize <= $bytes::to_usize()); + self.key_len = kk as u8; + } + + pub fn fanout(&mut self, fanout: u8) { + self.fanout = fanout; + } + + pub fn depth(&mut self, depth: u8) { + self.depth = depth; + } + + pub fn node_depth(&mut self, node_depth: u8) { + self.node_depth = node_depth; + } + + pub fn node_offset(&mut self, node_offs: usize) { + assert!(node_offs <= u32::MAX as usize); + assert!(node_offs as u32 <= u32::MAX); + self.node_offs = u32::to_le(node_offs as u32); + } + + pub fn inner_length(&mut self, inner_len: u8) { + self.inner_len = inner_len; + } + + pub fn build(&self) -> $compressor { + assert!(self.digest_len > 0); + // All fields of both types are Copy. + // Field endianness is handled at field-setting time. + let h0: [$vec; 2] = unsafe { mem::transmute(*self) }; + $compressor { + h: [iv0() ^ h0[0].to_le(), iv1() ^ h0[1].to_le()], + } + } } #[inline(always)] @@ -36,6 +98,19 @@ macro_rules! blake2_impl { #[inline(always)] fn iv1() -> $vec { $vec::new($IV[4], $IV[5], $IV[6], $IV[7]) } + #[derive(Clone)] + pub struct $compressor { + h: [$vec; 2], + } + + impl Default for $compressor { + fn default() -> Self { + Self { + h: [$vec::new(0, 0, 0, 0), $vec::new(0, 0, 0, 0)] + } + } + } + #[inline(always)] fn quarter_round(v: &mut [$vec; 4], rd: u32, rb: u32, m: $vec) { v[0] = v[0].wrapping_add(v[1]).wrapping_add(m.from_le()); @@ -53,9 +128,9 @@ macro_rules! blake2_impl { #[inline(always)] fn unshuffle(v: &mut [$vec; 4]) { - v[1] = v[1].shuffle_right_1(); - v[2] = v[2].shuffle_right_2(); - v[3] = v[3].shuffle_right_3(); + v[1] = v[1].shuffle_left_3(); + v[2] = v[2].shuffle_left_2(); + v[3] = v[3].shuffle_left_1(); } #[inline(always)] @@ -73,6 +148,104 @@ macro_rules! blake2_impl { unshuffle(v); } + impl $compressor { + pub fn with_parameter_block(p: &[$word; 8]) -> Self { + let h0 = [ + iv0() ^ $vec::new(p[0], p[1], p[2], p[3]), + iv1() ^ $vec::new(p[4], p[5], p[6], p[7]), + ]; + Self { + h: h0, + } + } + + pub fn compress(&mut self, m: &[$word; 16], f0: $word, f1: $word, t: u64) { + use $crate::consts::SIGMA; + + let h = &mut self.h; + + let t0 = t as $word; + let t1 = match $bytes::to_u8() { + 64 => 0, + 32 => (t >> 32) as $word, + _ => unreachable!(), + }; + + let mut v = [ + h[0], + h[1], + iv0(), + iv1() ^ $vec::new(t0, t1, f0, f1), + ]; + + round(&mut v, m, &SIGMA[0]); + round(&mut v, m, &SIGMA[1]); + round(&mut v, m, &SIGMA[2]); + round(&mut v, m, &SIGMA[3]); + round(&mut v, m, &SIGMA[4]); + round(&mut v, m, &SIGMA[5]); + round(&mut v, m, &SIGMA[6]); + round(&mut v, m, &SIGMA[7]); + round(&mut v, m, &SIGMA[8]); + round(&mut v, m, &SIGMA[9]); + if $bytes::to_u8() == 64 { + round(&mut v, m, &SIGMA[0]); + round(&mut v, m, &SIGMA[1]); + } + + h[0] = h[0] ^ (v[0] ^ v[2]); + h[1] = h[1] ^ (v[1] ^ v[3]); + } + + pub fn finalize(&mut self, out: &mut GenericArray, m: &[$word; 16], f1: $word, t: u64) { + self.compress(m, !0, f1, t); + let buf = [self.h[0].to_le(), self.h[1].to_le()]; + copy(buf.as_bytes(), out); + } + + pub fn finalize_into_slice(&mut self, out: &mut [u8], m: &[$word; 16], f1: $word, t: u64) { + self.compress(m, !0, f1, t); + let buf = [self.h[0].to_le(), self.h[1].to_le()]; + out.copy_from_slice(buf.as_bytes()); + } + + pub fn builder() -> $builder { + $builder::new() + } + } + } +} + +macro_rules! blake2_impl { + ( + $state:ident, $fix_state:ident, $compressor:ident, $word:ident, $bytes:ident, + $vardoc:expr, $doc:expr, + ) => { + + use $crate::as_bytes::AsBytes; + + use digest::{Input, BlockInput, FixedOutput, VariableOutput, Reset}; + use digest::InvalidOutputSize; + use digest::generic_array::GenericArray; + use digest::generic_array::typenum::Unsigned; + use core::cmp; + use byte_tools::{copy, zero}; + use crypto_mac::{Mac, MacResult, InvalidKeyLength}; + + type Output = GenericArray; + + #[derive(Clone)] + #[doc=$vardoc] + pub struct $state { + n: usize, + h: $compressor, + m: [$word; 16], + h0: $compressor, + m0: [$word; 16], + t: u64, + t0: u64, + } + impl $state { /// Creates a new hashing context with a key. /// @@ -80,32 +253,26 @@ macro_rules! blake2_impl { /// make sure to compare codes in constant time! It can be done /// for example by using `subtle` crate. pub fn new_keyed(key: &[u8], output_size: usize) -> Self { - let kk = key.len(); - assert!(kk <= $bytes::to_usize()); - assert!(output_size <= $bytes::to_usize()); - - let p0 = 0x0101_0000 ^ ((kk as $word) << 8) ^ - (output_size as $word); - let h0 = [iv0() ^ $vec::new(p0, 0, 0, 0), iv1()]; - let mut state = $state { - m: [0; 16], - h: h0, - t: 0, + let mut h0 = $compressor::builder(); + h0.key(key.len()); + h0.out(output_size); + let h0 = h0.build(); + let mut m = [0; 16]; + let mut t = 0; + if !key.is_empty() { + copy(key, m.as_mut_bytes()); + t = 2 * $bytes::to_u64(); + } + $state { + m, + h: h0.clone(), + t, n: output_size, - t0: 0, - m0: [0; 16], + t0: t, + m0: m, h0: h0, - }; - - if kk > 0 { - copy(key, state.m.as_mut_bytes()); - state.t = 2 * $bytes::to_u64(); } - - state.t0 = state.t; - state.m0 = state.m; - state } #[doc(hidden)] @@ -114,15 +281,10 @@ macro_rules! blake2_impl { let kk = (p[0] >> 8) as u8 as usize; assert!(nn >= 1 && nn <= $bytes::to_usize()); assert!(kk <= $bytes::to_usize()); - - let h0 = [ - iv0() ^ $vec::new(p[0], p[1], p[2], p[3]), - iv1() ^ $vec::new(p[4], p[5], p[6], p[7]), - ]; - + let h0 = $compressor::with_parameter_block(p); $state { m: [0; 16], - h: h0, + h: h0.clone(), t: 0, n: nn, @@ -150,25 +312,13 @@ macro_rules! blake2_impl { .expect("hash data length overflow"); } - while rest.len() >= block { - self.compress(0, 0); - - let part = &rest[..block]; - rest = &rest[part.len()..]; + for part in rest.chunks(block) { + self.h.compress(&self.m, 0, 0, self.t); copy(part, &mut self.m.as_mut_bytes()); self.t = self.t.checked_add(part.len() as u64) .expect("hash data length overflow"); } - - let n = rest.len(); - if n > 0 { - self.compress(0, 0); - - copy(rest, &mut self.m.as_mut_bytes()); - self.t = self.t.checked_add(rest.len() as u64) - .expect("hash data length overflow"); - } } #[doc(hidden)] @@ -182,53 +332,249 @@ macro_rules! blake2_impl { if off != 0 { zero(&mut self.m.as_mut_bytes()[off..]); } + let mut out = GenericArray::default(); + self.h.finalize(&mut out, &self.m, f1, self.t); + out + } + } - self.compress(!0, f1); + impl Default for $state { + fn default() -> Self { Self::new_keyed(&[], $bytes::to_usize()) } + } - let buf = [self.h[0].to_le(), self.h[1].to_le()]; + impl BlockInput for $state { + type BlockSize = $bytes; + } - let mut out = GenericArray::default(); - copy(buf.as_bytes(), &mut out); - out + impl Input for $state { + fn input>(&mut self, data: B) { + self.update(data.as_ref()); } + } - fn compress(&mut self, f0: $word, f1: $word) { - use $crate::consts::SIGMA; + impl VariableOutput for $state { + fn new(output_size: usize) -> Result { + if output_size == 0 || output_size > $bytes::to_usize() { + return Err(InvalidOutputSize); + } + Ok(Self::new_keyed(&[], output_size)) + } - let m = &self.m; - let h = &mut self.h; + fn output_size(&self) -> usize { + self.n + } - let t0 = self.t as $word; - let t1 = match $bytes::to_u8() { - 64 => 0, - 32 => (self.t >> 32) as $word, - _ => unreachable!(), + fn variable_result(self, f: F) { + let n = self.n; + let res = self.finalize_with_flag(0); + f(&res[..n]); + } + } + + impl Reset for $state { + fn reset(&mut self) { + self.t = self.t0; + self.m = self.m0; + self.h = self.h0.clone(); + } + } + + impl_opaque_debug!($state); + impl_write!($state); + + + #[derive(Clone)] + #[doc=$doc] + pub struct $fix_state { + state: $state, + } + + impl Default for $fix_state { + fn default() -> Self { + let state = $state::new_keyed(&[], $bytes::to_usize()); + Self { state } + } + } + + impl BlockInput for $fix_state { + type BlockSize = $bytes; + } + + impl Input for $fix_state { + fn input>(&mut self, data: B) { + self.state.update(data.as_ref()); + } + } + + impl FixedOutput for $fix_state { + type OutputSize = $bytes; + + fn fixed_result(self) -> Output { + self.state.finalize_with_flag(0) + } + } + + impl Reset for $fix_state { + fn reset(&mut self) { + self.state.reset() + } + } + + impl Mac for $fix_state { + type OutputSize = $bytes; + type KeySize = $bytes; + + fn new(key: &GenericArray) -> Self { + let state = $state::new_keyed(key, $bytes::to_usize()); + Self { state } + } + + fn new_varkey(key: &[u8]) -> Result { + if key.len() > $bytes::to_usize() { + Err(InvalidKeyLength) + } else { + let state = $state::new_keyed(key, $bytes::to_usize()); + Ok(Self { state }) + } + } + + fn input(&mut self, data: &[u8]) { self.state.update(data); } + + fn reset(&mut self) { + ::reset(self) + } + + fn result(self) -> MacResult { + MacResult::new(self.state.finalize_with_flag(0)) + } + } + + impl_opaque_debug!($fix_state); + impl_write!($fix_state); + } +} + +macro_rules! blake2_p_impl { + ( + $state:ident, $fix_state:ident, $compressor:ident, $builder:ident, $word:ident, $bytes:ident, $fanout:expr, + $vardoc:expr, $doc:expr, + ) => { + + use $crate::as_bytes::AsBytes; + + use digest::{Input, BlockInput, FixedOutput, VariableOutput, Reset}; + use digest::InvalidOutputSize; + use digest::generic_array::GenericArray; + use digest::generic_array::typenum::Unsigned; + use core::cmp; + use byte_tools::{copy, zero}; + use crypto_mac::{Mac, MacResult, InvalidKeyLength}; + + type Output = GenericArray; + + #[derive(Clone)] + #[doc=$vardoc] + pub struct $state { + n: usize, + m0: [$word; 16], + t0: u64, + h0: $builder, + h: [$compressor; $fanout], + m: [[$word; 16]; $fanout], + t: u64, + } + + impl $state { + /// Creates a new hashing context with a key. + /// + /// **WARNING!** If you plan to use it for variable output MAC, then + /// make sure to compare codes in constant time! It can be done + /// for example by using `subtle` crate. + pub fn new_keyed(key: &[u8], output_size: usize) -> Self { + let mut h0 = $builder::new(); + h0.key(key.len()); + h0.out(output_size); + h0.fanout($fanout); + h0.depth(2); + h0.inner_length($bytes::to_u8()); + let mut m0 = [0; 16]; + let mut t0 = 0; + if !key.is_empty() { + copy(key, m0.as_mut_bytes()); + t0 = 2 * $bytes::to_u64() * $fanout; + } + let mut state = $state { + n: output_size, + h0, + t0, + m0, + // everything else set up by reset() + h: Default::default(), + m: Default::default(), + t: Default::default(), }; + state.reset(); + state + } - let mut v = [ - h[0], - h[1], - iv0(), - iv1() ^ $vec::new(t0, t1, f0, f1), - ]; + /// Updates the hashing context with more data. + fn update(&mut self, mut data: &[u8]) { + const BLOCK: usize = 2 * $bytes::USIZE; + const RING: usize = BLOCK * $fanout; + + if self.t < RING as u64 { + // initial ring fill + let (d0, d1) = data.split_at(cmp::min(data.len(), RING - self.t as usize)); + self.m.as_mut_bytes()[self.t as usize..self.t as usize + d0.len()].copy_from_slice(d0); + self.t += d0.len() as u64; + data = d1; + } else if self.t as usize % BLOCK != 0 { + // complete partial block + let (d0, d1) = data.split_at(cmp::min(data.len(), BLOCK - self.t as usize % BLOCK)); + let ri = self.t as usize % RING; + self.m.as_mut_bytes()[ri..ri + d0.len()].copy_from_slice(d0); + self.t += d0.len() as u64; + data = d1; + } - round(&mut v, m, &SIGMA[0]); - round(&mut v, m, &SIGMA[1]); - round(&mut v, m, &SIGMA[2]); - round(&mut v, m, &SIGMA[3]); - round(&mut v, m, &SIGMA[4]); - round(&mut v, m, &SIGMA[5]); - round(&mut v, m, &SIGMA[6]); - round(&mut v, m, &SIGMA[7]); - round(&mut v, m, &SIGMA[8]); - round(&mut v, m, &SIGMA[9]); - if $bytes::to_u8() == 64 { - round(&mut v, m, &SIGMA[0]); - round(&mut v, m, &SIGMA[1]); + // if there's data remaining, the ring is full of whole blocks + for b in data.chunks(BLOCK) { + let i = self.t as usize / BLOCK % $fanout; + self.h[i].compress(&mut self.m[i], 0, 0, self.t / RING as u64 * BLOCK as u64); + self.m[i].as_mut_bytes()[..b.len()].copy_from_slice(b); + self.t += b.len() as u64; } + } - h[0] = h[0] ^ (v[0] ^ v[2]); - h[1] = h[1] ^ (v[1] ^ v[3]); + fn finalize(mut self) -> Output { + const BLOCK: usize = 2 * $bytes::USIZE; + const RING: usize = BLOCK * $fanout; + + self.h0.node_offset(0); + self.h0.node_depth(1); + let mut root = self.h0.build(); + + let mut ri = self.t as usize % RING; + let trb = self.t / RING as u64 * BLOCK as u64; + if ri % BLOCK != 0 { + let ni = ((self.t as usize & !(BLOCK - 1)) + BLOCK) % RING; + zero(&mut self.m.as_mut_bytes()[ri..ni]); + } + let mut inter = [0; 16]; + for i in 0..$fanout { + if i != 0 && i & 1 == 0 { + root.compress(&inter, 0, 0, i as u64 * $bytes::to_u64()); + } + let len = cmp::min(ri, BLOCK); + ri -= len; + let f1 = if i == $fanout - 1 { !0 } else { 0 }; + let ix0 = (i & 1) * $bytes::to_usize(); + let ix1 = ((i & 1) + 1) * $bytes::to_usize(); + self.h[i].finalize_into_slice(&mut inter.as_mut_bytes()[ix0..ix1], &self.m[i], f1, trb + len as u64); + } + let mut out = GenericArray::default(); + root.finalize(&mut out, &inter, !0, $fanout * $bytes::to_u64()); + out } } @@ -260,16 +606,24 @@ macro_rules! blake2_impl { fn variable_result(self, f: F) { let n = self.n; - let res = self.finalize_with_flag(0); + let res = self.finalize(); f(&res[..n]); } } - impl Reset for $state { + impl Reset for $state { fn reset(&mut self) { + self.h0.node_depth(0); + for (i, h) in self.h.iter_mut().enumerate() { + self.h0.node_offset(i); + *h = self.h0.build(); + } + + for m in self.m.iter_mut() { + m.copy_from_slice(&self.m0); + } + self.t = self.t0; - self.m = self.m0; - self.h = self.h0; } } @@ -304,7 +658,7 @@ macro_rules! blake2_impl { type OutputSize = $bytes; fn fixed_result(self) -> Output { - self.state.finalize_with_flag(0) + self.state.finalize() } } @@ -339,7 +693,7 @@ macro_rules! blake2_impl { } fn result(self) -> MacResult { - MacResult::new(self.state.finalize_with_flag(0)) + MacResult::new(self.state.finalize()) } } diff --git a/blake2/src/blake2b.rs b/blake2/src/blake2b.rs index 3a2a7a312..cc1913ad4 100644 --- a/blake2/src/blake2b.rs +++ b/blake2/src/blake2b.rs @@ -1,8 +1,7 @@ use digest::generic_array::typenum::U64; -use consts::BLAKE2B_IV; +use compressor_b::CompressorB; -blake2_impl!(VarBlake2b, Blake2b, u64, u64x4, U64, - 32, 24, 16, 63, BLAKE2B_IV, +blake2_impl!(VarBlake2b, Blake2b, CompressorB, u64, U64, "Blake2b instance with a variable output.", "Blake2b instance with a fixed output.", ); diff --git a/blake2/src/blake2bp.rs b/blake2/src/blake2bp.rs new file mode 100644 index 000000000..d5678c17e --- /dev/null +++ b/blake2/src/blake2bp.rs @@ -0,0 +1,7 @@ +use digest::generic_array::typenum::U64; +use compressor_b::{CompressorB, CompressorBBuilder}; + +blake2_p_impl!(VarBlake2bp, Blake2bp, CompressorB, CompressorBBuilder, u64, U64, 4, + "Blake2bp instance with a variable output.", + "Blake2bp instance with a fixed output.", +); diff --git a/blake2/src/blake2s.rs b/blake2/src/blake2s.rs index b6c7ae41c..071cfe649 100644 --- a/blake2/src/blake2s.rs +++ b/blake2/src/blake2s.rs @@ -1,8 +1,7 @@ use digest::generic_array::typenum::U32; -use consts::BLAKE2S_IV; +use compressor_s::CompressorS; -blake2_impl!(VarBlake2s, Blake2s, u32, u32x4, U32, - 16, 12, 8, 7, BLAKE2S_IV, +blake2_impl!(VarBlake2s, Blake2s, CompressorS, u32, U32, "Blake2s instance with a variable output.", "Blake2s instance with a fixed output.", ); diff --git a/blake2/src/blake2sp.rs b/blake2/src/blake2sp.rs new file mode 100644 index 000000000..aa7dd8a15 --- /dev/null +++ b/blake2/src/blake2sp.rs @@ -0,0 +1,7 @@ +use digest::generic_array::typenum::U32; +use compressor_s::{CompressorS, CompressorSBuilder}; + +blake2_p_impl!(VarBlake2sp, Blake2sp, CompressorS, CompressorSBuilder, u32, U32, 8, + "Blake2sp instance with a variable output.", + "Blake2sp instance with a fixed output.", +); diff --git a/blake2/src/compressor_b.rs b/blake2/src/compressor_b.rs new file mode 100644 index 000000000..bfea35a6d --- /dev/null +++ b/blake2/src/compressor_b.rs @@ -0,0 +1,4 @@ +use digest::generic_array::typenum::U64; +use consts::BLAKE2B_IV; + +blake2_compressor_impl!(CompressorB, CompressorBBuilder, u64, u64x4, U64, 32, 24, 16, 63, BLAKE2B_IV, u32, 14, 16, ); diff --git a/blake2/src/compressor_s.rs b/blake2/src/compressor_s.rs new file mode 100644 index 000000000..748e0b183 --- /dev/null +++ b/blake2/src/compressor_s.rs @@ -0,0 +1,4 @@ +use digest::generic_array::typenum::U32; +use consts::BLAKE2S_IV; + +blake2_compressor_impl!(CompressorS, CompressorSBuilder, u32, u32x4, U32, 16, 12, 8, 7, BLAKE2S_IV, u16, 0, 8, ); diff --git a/blake2/src/coresimd/mod.rs b/blake2/src/coresimd/mod.rs new file mode 100644 index 000000000..3b44911c7 --- /dev/null +++ b/blake2/src/coresimd/mod.rs @@ -0,0 +1,246 @@ +use as_bytes::Safe; + +pub trait Vector4 {} + +#[cfg(target_feature = "sse2")] +mod sse2 { + use core::ops::BitXor; + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + #[cfg(not(target_feature = "avx2"))] + use core::mem; + + #[cfg(not(target_feature = "avx2"))] + #[allow(non_camel_case_types)] + #[repr(C)] + #[derive(Copy, Clone)] + pub struct u64x4(__m128i, __m128i); + + #[cfg(not(target_feature = "avx2"))] + impl u64x4 { + #[inline(always)] + pub fn new(a: u64, b: u64, c: u64, d: u64) -> Self { + unsafe { u64x4(_mm_set_epi64x(b as i64, a as i64), _mm_set_epi64x(d as i64, c as i64)) } + } + + #[inline(always)] + pub fn gather(src: &[u64], i0: usize, i1: usize, i2: usize, i3: usize) -> Self { + Self::new(src[i0], src[i1], src[i2], src[i3]) + } + + #[inline(always)] + pub fn shuffle_left_1(mut self) -> Self { + unsafe { + let epi: &mut [u64; 4] = mem::transmute(&mut self); + let tmp = epi[0]; + epi[0] = epi[1]; + epi[1] = epi[2]; + epi[2] = epi[3]; + epi[3] = tmp; + self + } + } + + #[inline(always)] + pub fn shuffle_left_2(self) -> Self { u64x4(self.1, self.0) } + + #[inline(always)] + pub fn shuffle_left_3(mut self) -> Self { + unsafe { + let epi: &mut [u64; 4] = mem::transmute(&mut self); + let tmp = epi[3]; + epi[3] = epi[2]; + epi[2] = epi[1]; + epi[1] = epi[0]; + epi[0] = tmp; + self + } + } + + #[cfg(not(target_feature = "ssse3"))] + #[inline(always)] + pub fn rotate_right_const(self, i: u32) -> Self { + unsafe { + match i { + 16 => u64x4( + _mm_or_si128(_mm_slli_epi64(self.0, 48), _mm_srli_epi64(self.0, 16)), + _mm_or_si128(_mm_slli_epi64(self.1, 48), _mm_srli_epi64(self.1, 16)), + ), + 24 => u64x4( + _mm_or_si128(_mm_slli_epi64(self.0, 40), _mm_srli_epi64(self.0, 24)), + _mm_or_si128(_mm_slli_epi64(self.1, 40), _mm_srli_epi64(self.1, 24)), + ), + 32 => u64x4(_mm_shuffle_epi32(self.0, 0b10110001), _mm_shuffle_epi32(self.1, 0b10110001)), + 63 => u64x4( + _mm_or_si128(_mm_slli_epi64(self.0, 1), _mm_srli_epi64(self.0, 63)), + _mm_or_si128(_mm_slli_epi64(self.1, 1), _mm_srli_epi64(self.1, 63)), + ), + _ => unreachable!(), + } + } + } + + #[cfg(target_feature = "ssse3")] + #[inline(always)] + pub fn rotate_right_const(self, i: u32) -> Self { + unsafe { + let b16 = _mm_set_epi64x(0x09080f0e_0d0c0b0a, 0x01000706_05040302); + let b24 = _mm_set_epi64x(0x0a09080f_0e0d0c0b, 0x02010007_06050403); + match i { + 16 => u64x4(_mm_shuffle_epi8(self.0, b16), _mm_shuffle_epi8(self.1, b16)), + 24 => u64x4(_mm_shuffle_epi8(self.0, b24), _mm_shuffle_epi8(self.1, b24)), + 32 => u64x4(_mm_shuffle_epi32(self.0, 0b10110001), _mm_shuffle_epi32(self.1, 0b10110001)), + 63 => u64x4( + _mm_or_si128(_mm_slli_epi64(self.0, 1), _mm_srli_epi64(self.0, 63)), + _mm_or_si128(_mm_slli_epi64(self.1, 1), _mm_srli_epi64(self.1, 63)), + ), + _ => unreachable!(), + } + } + } + + #[inline(always)] pub fn wrapping_add(self, rhs: Self) -> Self { + unsafe { u64x4(_mm_add_epi64(self.0, rhs.0), _mm_add_epi64(self.1, rhs.1)) } + } + + #[inline(always)] pub fn to_le(self) -> Self { self } + #[inline(always)] pub fn from_le(self) -> Self { self } + } + + #[cfg(not(target_feature = "avx2"))] + impl BitXor for u64x4 { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + unsafe { u64x4(_mm_xor_si128(self.0, rhs.0), _mm_xor_si128(self.1, rhs.1)) } + } + } + + #[allow(non_camel_case_types)] + #[repr(C)] + #[derive(Copy, Clone)] + pub struct u32x4(__m128i); + + impl u32x4 { + #[inline(always)] + pub fn new(a: u32, b: u32, c: u32, d: u32) -> Self { + unsafe { u32x4(_mm_set_epi32(d as i32, c as i32, b as i32, a as i32)) } + } + + #[inline(always)] + pub fn gather(src: &[u32], i0: usize, i1: usize, i2: usize, i3: usize) -> Self { + Self::new(src[i0], src[i1], src[i2], src[i3]) + } + + #[inline(always)] pub fn shuffle_left_1(self) -> Self { unsafe { u32x4(_mm_shuffle_epi32(self.0, 0b00111001)) } } + #[inline(always)] pub fn shuffle_left_2(self) -> Self { unsafe { u32x4(_mm_shuffle_epi32(self.0, 0b01001110)) } } + #[inline(always)] pub fn shuffle_left_3(self) -> Self { unsafe { u32x4(_mm_shuffle_epi32(self.0, 0b10010011)) } } + + #[cfg(not(target_feature = "ssse3"))] + #[inline(always)] + pub fn rotate_right_const(self, i: u32) -> Self { + unsafe { + match i { + 7 => u32x4(_mm_or_si128(_mm_slli_epi32(self.0, 25), _mm_srli_epi32(self.0, 7))), + 8 => u32x4(_mm_or_si128(_mm_slli_epi32(self.0, 24), _mm_srli_epi32(self.0, 8))), + 12 => u32x4(_mm_or_si128(_mm_slli_epi32(self.0, 20), _mm_srli_epi32(self.0, 12))), + 16 => u32x4(_mm_or_si128(_mm_slli_epi32(self.0, 16), _mm_srli_epi32(self.0, 16))), + _ => unreachable!(), + } + } + } + + #[cfg(target_feature = "ssse3")] + #[inline(always)] + pub fn rotate_right_const(self, i: u32) -> Self { + unsafe { + match i { + 7 => u32x4(_mm_or_si128(_mm_slli_epi32(self.0, 32 - 7), _mm_srli_epi32(self.0, 7))), + 8 => u32x4(_mm_shuffle_epi8(self.0, _mm_set_epi64x(0x0c0f0e0d_080b0a09, 0x04070605_00030201))), + 12 => u32x4(_mm_or_si128(_mm_slli_epi32(self.0, 32 - 12), _mm_srli_epi32(self.0, 12))), + 16 => u32x4(_mm_shuffle_epi8(self.0, _mm_set_epi64x(0x0d0c0f0e_09080b0a, 0x05040706_01000302))), + _ => unreachable!(), + } + } + } + + #[inline(always)] pub fn wrapping_add(self, rhs: Self) -> Self { unsafe { u32x4(_mm_add_epi32(self.0, rhs.0)) } } + + #[inline(always)] pub fn to_le(self) -> Self { self } + #[inline(always)] pub fn from_le(self) -> Self { self } + } + + impl BitXor for u32x4 { + type Output = Self; + #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { unsafe { u32x4(_mm_xor_si128(self.0, rhs.0)) } } + } +} + +#[cfg(target_feature = "avx2")] +mod avx2 { + use core::ops::BitXor; + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + #[allow(non_camel_case_types)] + #[repr(C)] + #[derive(Copy, Clone)] + pub struct u64x4(__m256i); + + impl u64x4 { + #[inline(always)] + pub fn new(a: u64, b: u64, c: u64, d: u64) -> Self { + unsafe { u64x4(_mm256_set_epi64x(d as i64, c as i64, b as i64, a as i64),) } + } + + #[inline(always)] + pub fn gather(src: &[u64], i0: usize, i1: usize, i2: usize, i3: usize) -> Self { + Self::new(src[i0], src[i1], src[i2], src[i3]) + } + + #[inline(always)] pub fn shuffle_left_1(self) -> Self { u64x4(unsafe { _mm256_permute4x64_epi64(self.0, 0b00111001) }) } + #[inline(always)] pub fn shuffle_left_2(self) -> Self { u64x4(unsafe { _mm256_permute4x64_epi64(self.0, 0b01001110) }) } + #[inline(always)] pub fn shuffle_left_3(self) -> Self { u64x4(unsafe { _mm256_permute4x64_epi64(self.0, 0b10010011) }) } + + #[inline(always)] + pub fn rotate_right_const(self, i: u32) -> Self { + unsafe { + let b16 = _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302); + let b24 = _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403); + match i { + 16 => u64x4(_mm256_shuffle_epi8(self.0, b16)), + 24 => u64x4(_mm256_shuffle_epi8(self.0, b24)), + 32 => u64x4(_mm256_shuffle_epi32(self.0, 0b10110001)), + 63 => u64x4(_mm256_or_si256(_mm256_slli_epi64(self.0, 1), _mm256_srli_epi64(self.0, 63))), + _ => unreachable!(), + } + } + } + + #[inline(always)] pub fn wrapping_add(self, rhs: Self) -> Self { unsafe { u64x4(_mm256_add_epi64(self.0, rhs.0)) } } + + #[inline(always)] pub fn to_le(self) -> Self { self } + #[inline(always)] pub fn from_le(self) -> Self { self } + } + + impl BitXor for u64x4 { + type Output = Self; + #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { u64x4(unsafe { _mm256_xor_si256(self.0, rhs.0) }) } + } +} + +#[cfg(all(target_feature = "sse2", not(target_feature = "avx2")))] +pub use self::sse2::u64x4; +#[cfg(target_feature = "avx2")] +pub use self::avx2::u64x4; + +#[cfg(target_feature = "sse2")] +pub use self::sse2::u32x4; + +unsafe impl Safe for u64x4 {} +unsafe impl Safe for u32x4 {} diff --git a/blake2/src/lib.rs b/blake2/src/lib.rs index 6d645d688..e4cc6abbf 100644 --- a/blake2/src/lib.rs +++ b/blake2/src/lib.rs @@ -88,8 +88,8 @@ "https://raw.githubusercontent.com/RustCrypto/meta/master/logo_small.png")] #![warn(missing_docs)] -#![cfg_attr(feature = "simd", feature(platform_intrinsics, repr_simd))] -#![cfg_attr(feature = "simd_asm", feature(asm))] +#![cfg_attr(all(feature = "simd", not(feature = "coresimd")), feature(platform_intrinsics, repr_simd))] +#![cfg_attr(all(feature = "simd_asm", not(feature = "coresimd")), feature(asm))] #[macro_use] extern crate opaque_debug; #[macro_use] pub extern crate digest; @@ -102,15 +102,26 @@ extern crate std; mod consts; mod as_bytes; +#[cfg(feature = "coresimd")] +mod coresimd; +#[cfg(feature = "coresimd")] +mod simd { pub use coresimd::*; } +#[cfg(not(feature = "coresimd"))] mod simd; #[macro_use] mod blake2; +mod compressor_b; +mod compressor_s; mod blake2b; mod blake2s; +mod blake2bp; +mod blake2sp; pub use digest::Digest; pub use blake2b::{Blake2b, VarBlake2b}; pub use blake2s::{Blake2s, VarBlake2s}; +pub use blake2bp::{Blake2bp, VarBlake2bp}; +pub use blake2sp::{Blake2sp, VarBlake2sp}; diff --git a/blake2/src/simd/mod.rs b/blake2/src/simd/mod.rs index bb999c223..661e93e68 100644 --- a/blake2/src/simd/mod.rs +++ b/blake2/src/simd/mod.rs @@ -18,7 +18,6 @@ pub trait Vector4: Copy { fn gather(src: &[T], i0: usize, i1: usize, i2: usize, i3: usize) -> Self; fn from_le(self) -> Self; - fn to_le(self) -> Self; fn wrapping_add(self, rhs: Self) -> Self; @@ -28,9 +27,7 @@ pub trait Vector4: Copy { fn shuffle_left_2(self) -> Self; fn shuffle_left_3(self) -> Self; - #[inline(always)] fn shuffle_right_1(self) -> Self { self.shuffle_left_3() } - #[inline(always)] fn shuffle_right_2(self) -> Self { self.shuffle_left_2() } - #[inline(always)] fn shuffle_right_3(self) -> Self { self.shuffle_left_1() } + #[inline(always)] fn to_le(self) -> Self { self.from_le() } } macro_rules! impl_vector4 { @@ -42,11 +39,6 @@ macro_rules! impl_vector4 { $vec::new(src[i0], src[i1], src[i2], src[i3]) } - #[cfg(target_endian = "little")] - #[inline(always)] - fn from_le(self) -> Self { self } - - #[cfg(not(target_endian = "little"))] #[inline(always)] fn from_le(self) -> Self { $vec::new($word::from_le(self.0), @@ -55,19 +47,6 @@ macro_rules! impl_vector4 { $word::from_le(self.3)) } - #[cfg(target_endian = "little")] - #[inline(always)] - fn to_le(self) -> Self { self } - - #[cfg(not(target_endian = "little"))] - #[inline(always)] - fn to_le(self) -> Self { - $vec::new(self.0.to_le(), - self.1.to_le(), - self.2.to_le(), - self.3.to_le()) - } - #[inline(always)] fn wrapping_add(self, rhs: Self) -> Self { self + rhs } diff --git a/blake2/tests/data/blake2bp.blb b/blake2/tests/data/blake2bp.blb new file mode 100644 index 000000000..1b0ba9fec Binary files /dev/null and b/blake2/tests/data/blake2bp.blb differ diff --git a/blake2/tests/data/blake2sp.blb b/blake2/tests/data/blake2sp.blb new file mode 100644 index 000000000..bcc4cd9d4 Binary files /dev/null and b/blake2/tests/data/blake2sp.blb differ diff --git a/blake2/tests/lib.rs b/blake2/tests/lib.rs index 452c618f7..2f15886e7 100644 --- a/blake2/tests/lib.rs +++ b/blake2/tests/lib.rs @@ -2,9 +2,12 @@ #[macro_use] extern crate digest; extern crate blake2; +extern crate hex_literal; use digest::dev::{digest_test, variable_test}; new_test!(blake2b_fixed, "blake2b/fixed", blake2::Blake2b, digest_test); new_test!(blake2b_variable, "blake2b/variable", blake2::VarBlake2b, variable_test); new_test!(blake2s_variable, "blake2s/variable", blake2::VarBlake2s, variable_test); +new_test!(blake2bp, "blake2bp", blake2::Blake2bp, digest_test); +new_test!(blake2sp, "blake2sp", blake2::Blake2sp, digest_test); diff --git a/test_features.sh b/test_features.sh index 03747f87b..650fce517 100755 --- a/test_features.sh +++ b/test_features.sh @@ -3,4 +3,7 @@ cd sha1 && cargo test --features asm && cd .. && cd whirlpool && cargo test --features asm && cd .. && cd blake2 && cargo test --features simd && cargo test --features simd_opt && - cargo test --features simd_asm + cargo test --features simd_asm && + RUSTFLAGS="-Ctarget-cpu=native" cargo test --features coresimd && + RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-ssse3" cargo test --features coresimd && + RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-avx2" cargo test --features coresimd