diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 2588a72ea3f..12838dd138d 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -52,36 +52,59 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
     }
 }
 
-macro_rules! fill_via_chunks {
-    ($src:expr, $dst:expr, $ty:ty) => {{
-        const SIZE: usize = core::mem::size_of::<$ty>();
-        let chunk_size_u8 = min($src.len() * SIZE, $dst.len());
-        let chunk_size = (chunk_size_u8 + SIZE - 1) / SIZE;
-
-        // The following can be replaced with safe code, but unfortunately it's
-        // ca. 8% slower.
-        if cfg!(target_endian = "little") {
-            unsafe {
-                core::ptr::copy_nonoverlapping(
-                    $src.as_ptr() as *const u8,
-                    $dst.as_mut_ptr(),
-                    chunk_size_u8);
-            }
-        } else {
-            for (&n, chunk) in $src.iter().zip($dst.chunks_mut(SIZE)) {
-                let tmp = n.to_le();
-                let src_ptr = &tmp as *const $ty as *const u8;
-                unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        src_ptr,
-                        chunk.as_mut_ptr(),
-                        chunk.len());
-                }
-            }
+trait Observable: Copy {
+    type Bytes: AsRef<[u8]>;
+    fn to_le_bytes(self) -> Self::Bytes;
+
+    // Contract: observing self is memory-safe (implies no uninitialised padding)
+    fn as_byte_slice(x: &[Self]) -> &[u8];
+}
+impl Observable for u32 {
+    type Bytes = [u8; 4];
+    fn to_le_bytes(self) -> Self::Bytes {
+        self.to_le_bytes()
+    }
+    fn as_byte_slice(x: &[Self]) -> &[u8] {
+        let ptr = x.as_ptr() as *const u8;
+        let len = x.len() * core::mem::size_of::<u32>();
+        unsafe { core::slice::from_raw_parts(ptr, len) }
+    }
+}
+impl Observable for u64 {
+    type Bytes = [u8; 8];
+    fn to_le_bytes(self) -> Self::Bytes {
+        self.to_le_bytes()
+    }
+    fn as_byte_slice(x: &[Self]) -> &[u8] {
+        let ptr = x.as_ptr() as *const u8;
+        let len = x.len() * core::mem::size_of::<u64>();
+        unsafe { core::slice::from_raw_parts(ptr, len) }
+    }
+}
+
+fn fill_via_chunks<T: Observable>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
+    let size = core::mem::size_of::<T>();
+    let byte_len = min(src.len() * size, dest.len());
+    let num_chunks = (byte_len + size - 1) / size;
+
+    if cfg!(target_endian = "little") {
+        // On LE we can do a simple copy, which is 25-50% faster:
+        dest[..byte_len].copy_from_slice(&T::as_byte_slice(&src[..num_chunks])[..byte_len]);
+    } else {
+        // This code is valid on all arches, but slower than the above:
+        let mut i = 0;
+        let mut iter = dest[..byte_len].chunks_exact_mut(size);
+        while let Some(chunk) = iter.next() {
+            chunk.copy_from_slice(src[i].to_le_bytes().as_ref());
+            i += 1;
         }
+        let chunk = iter.into_remainder();
+        if !chunk.is_empty() {
+            chunk.copy_from_slice(&src[i].to_le_bytes().as_ref()[..chunk.len()]);
+        }
+    }
 
-        (chunk_size, chunk_size_u8)
-    }};
+    (num_chunks, byte_len)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block
@@ -115,7 +138,7 @@ macro_rules! fill_via_chunks {
 /// }
 /// ```
 pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
-    fill_via_chunks!(src, dest, u32)
+    fill_via_chunks(src, dest)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block
@@ -129,7 +152,7 @@ pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
 ///
 /// See `fill_via_u32_chunks` for an example.
 pub fn fill_via_u64_chunks(src: &[u64], dest: &mut [u8]) -> (usize, usize) {
-    fill_via_chunks!(src, dest, u64)
+    fill_via_chunks(src, dest)
 }
 
 /// Implement `next_u32` via `fill_bytes`, little-endian order.
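
For reviewers, a minimal usage sketch of the new generic path through the existing public wrapper `fill_via_u32_chunks`. The input values and expected bytes below are illustrative, chosen by hand; they are not taken from the PR's test suite.

```rust
// Illustrative only: exercises the partial-last-chunk case of the new
// fill_via_chunks path via the public rand_core::impls wrapper.
use rand_core::impls::fill_via_u32_chunks;

fn main() {
    // 3 source words, 7 destination bytes: the last chunk is partial.
    let src = [0x1122_3344u32, 0x5566_7788, 0x99aa_bbcc];
    let mut dest = [0u8; 7];

    let (consumed_u32, filled_u8) = fill_via_u32_chunks(&src, &mut dest);

    // byte_len = min(3 * 4, 7) = 7; num_chunks = ceil(7 / 4) = 2.
    assert_eq!((consumed_u32, filled_u8), (2, 7));
    // Bytes are written little-endian regardless of target endianness.
    assert_eq!(dest, [0x44, 0x33, 0x22, 0x11, 0x88, 0x77, 0x66]);
}
```

The partial final chunk is exactly what the `chunks_exact_mut` remainder branch handles on big-endian targets, and what the rounded-up `num_chunks` accounts for in the return value; both code paths should produce the same output.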