Skip to content

Commit 7089766

Browse files
authored
Add/Rework benchmarks to track initialization cost (#272)
This PR adds more benchmarks so we can get an accurate idea about two things:

- What is the cost of having to zero the buffer before calling `getrandom`?
- What is the performance on aligned, 32-byte buffers?
  - This is by far the most common use, as it's used to seed userspace CSPRNGs.

I ran the benchmarks on my system:

- CPU: AMD Ryzen 7 5700G
- OS: Linux 5.15.52-1-lts
- Rust Version: 1.62.0-nightly (ea92b0838 2022-05-07)

I got the following results:

```
test bench_large ... bench: 3,759,323 ns/iter (+/- 177,100) = 557 MB/s
test bench_large_init ... bench: 3,821,229 ns/iter (+/- 39,132) = 548 MB/s
test bench_page ... bench: 7,281 ns/iter (+/- 59) = 562 MB/s
test bench_page_init ... bench: 7,290 ns/iter (+/- 69) = 561 MB/s
test bench_seed ... bench: 206 ns/iter (+/- 3) = 155 MB/s
test bench_seed_init ... bench: 206 ns/iter (+/- 1) = 155 MB/s
```

These results were very consistent across multiple runs, and roughly behave as we would expect:

- The throughput is highest with a buffer large enough to amortize the syscall overhead, but small enough to stay in the L1D cache.
- There is a _very_ small cost to zeroing the buffer beforehand.
  - This cost is imperceptible in the common 32-byte use case, where the syscall overhead dominates.
  - The cost is slightly higher (1%) with multi-megabyte buffers, as the data gets evicted from the L1 cache between the `memset` and the call to `getrandom`.

I would love to see results for other platforms. Could we get someone to run this on an M1 Mac?

Signed-off-by: Joe Richey <[email protected]>
1 parent 3d818a6 commit 7089766

File tree

1 file changed

+80
-8
lines changed

1 file changed

+80
-8
lines changed

benches/mod.rs

Lines changed: 80 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,94 @@
11
#![feature(test)]
22
extern crate test;
33

4-
#[bench]
5-
fn bench_64(b: &mut test::Bencher) {
6-
let mut buf = [0u8; 64];
4+
use std::{
5+
alloc::{alloc_zeroed, dealloc, Layout},
6+
ptr::NonNull,
7+
};
8+
9+
// AlignedBuffer is like a Box<[u8; N]> except that it is always N-byte aligned
10+
struct AlignedBuffer<const N: usize>(NonNull<[u8; N]>);
11+
12+
impl<const N: usize> AlignedBuffer<N> {
13+
fn layout() -> Layout {
14+
Layout::from_size_align(N, N).unwrap()
15+
}
16+
17+
fn new() -> Self {
18+
let p = unsafe { alloc_zeroed(Self::layout()) } as *mut [u8; N];
19+
Self(NonNull::new(p).unwrap())
20+
}
21+
22+
fn buf(&mut self) -> &mut [u8; N] {
23+
unsafe { self.0.as_mut() }
24+
}
25+
}
26+
27+
impl<const N: usize> Drop for AlignedBuffer<N> {
28+
fn drop(&mut self) {
29+
unsafe { dealloc(self.0.as_ptr() as *mut u8, Self::layout()) }
30+
}
31+
}
32+
33+
// Used to benchmark the throughput of getrandom in an optimal scenario.
34+
// The buffer is hot, and does not require initialization.
35+
#[inline(always)]
36+
fn bench<const N: usize>(b: &mut test::Bencher) {
37+
let mut ab = AlignedBuffer::<N>::new();
38+
let buf = ab.buf();
739
b.iter(|| {
840
getrandom::getrandom(&mut buf[..]).unwrap();
941
test::black_box(&buf);
1042
});
11-
b.bytes = buf.len() as u64;
43+
b.bytes = N as u64;
1244
}
1345

14-
#[bench]
15-
fn bench_65536(b: &mut test::Bencher) {
16-
let mut buf = [0u8; 65536];
46+
// Used to benchmark the throughput of getrandom in a slightly less optimal
47+
// scenario. The buffer is still hot, but requires initialization.
48+
#[inline(always)]
49+
fn bench_with_init<const N: usize>(b: &mut test::Bencher) {
50+
let mut ab = AlignedBuffer::<N>::new();
51+
let buf = ab.buf();
1752
b.iter(|| {
53+
for byte in buf.iter_mut() {
54+
*byte = 0;
55+
}
1856
getrandom::getrandom(&mut buf[..]).unwrap();
1957
test::black_box(&buf);
2058
});
21-
b.bytes = buf.len() as u64;
59+
b.bytes = N as u64;
60+
}
61+
62+
// 32 bytes (256-bit) is the seed size used for rand::thread_rng
63+
const SEED: usize = 32;
64+
// Common size of a page, 4 KiB
65+
const PAGE: usize = 4096;
66+
// Large buffer to get asymptotic performance, 2 MiB
67+
const LARGE: usize = 1 << 21;
68+
69+
#[bench]
70+
fn bench_seed(b: &mut test::Bencher) {
71+
bench::<SEED>(b);
72+
}
73+
#[bench]
74+
fn bench_seed_init(b: &mut test::Bencher) {
75+
bench_with_init::<SEED>(b);
76+
}
77+
78+
#[bench]
79+
fn bench_page(b: &mut test::Bencher) {
80+
bench::<PAGE>(b);
81+
}
82+
#[bench]
83+
fn bench_page_init(b: &mut test::Bencher) {
84+
bench_with_init::<PAGE>(b);
85+
}
86+
87+
#[bench]
88+
fn bench_large(b: &mut test::Bencher) {
89+
bench::<LARGE>(b);
90+
}
91+
#[bench]
92+
fn bench_large_init(b: &mut test::Bencher) {
93+
bench_with_init::<LARGE>(b);
2294
}

0 commit comments

Comments
 (0)