Skip to content

Commit b7b1a3f

Browse files
alexcrichtongnzlbg
authored andcommitted
Add a x86_64::cmpxchg16b intrinsic
This intrinsic isn't actually specified by Intel, but it's something gated with CPUID and can otherwise be a useful thing to have when building primitives! There exists an `AtomicU128` type in the standard library but it's only exposed currently (and it's unstable) when a platform fully supports 128-bit atomics. The x86_64 architecture does not support it *unless* the `cmpxchg16b` instruction is available, and it isn't always available! This commit is also a proposal for how we can include support for 128-bit atomics in the standard library on relevant platforms. I'm thinking that we'll expose this one low-level intrinsic in `std::arch::x86_64`, and then if desired a crate on crates.io can build `AtomicU128` from this API. In any case this is all unstable regardless!
1 parent a7a1c79 commit b7b1a3f

File tree

8 files changed

+105
-1
lines changed

8 files changed

+105
-1
lines changed

coresimd/x86_64/cmpxchg16b.rs

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
use sync::atomic::Ordering;
2+
3+
#[cfg(test)]
4+
use stdsimd_test::assert_instr;
5+
6+
/// Compare and exchange 16 bytes (128 bits) of data atomically.
7+
///
8+
/// This intrinsic corresponds to the `cmpxchg16b` instruction on x86_64
9+
/// processors. It performs an atomic compare-and-swap, updating the `dst`
10+
/// memory location to `new` if the current value in memory equals `old`.
11+
///
12+
/// # Return value
13+
///
14+
/// This function returns the previous value at the memory location. If it is
15+
/// equal to `old` then the memory was updated to `new`.
16+
///
17+
/// # Memory Orderings
18+
///
19+
/// This atomic operation has the same semantics of memory orderings as
20+
/// `AtomicUsize::compare_exchange` does, only operating on 16 bytes of memory
21+
/// instead of just a pointer.
22+
///
23+
/// For more information on memory orderings here see the `compare_exchange`
24+
/// documentation for other `Atomic*` types in the standard library.
25+
///
26+
/// # Unsafety
27+
///
28+
/// This method is unsafe because it takes a raw pointer and will attempt to
29+
/// read and possibly write the memory at the pointer. The pointer must also be
30+
/// aligned on a 16-byte boundary.
31+
///
32+
/// This method also requires the `cmpxchg16b` CPU feature to be available at
33+
/// runtime to work correctly. If the CPU running the binary does not actually
34+
/// support `cmpxchg16b` and the program enters an execution path that
35+
/// eventually would reach this function the behavior is undefined.
36+
///
37+
/// The `success` ordering must also be stronger or equal to `failure`, or this
38+
/// function call is undefined. See the `Atomic*` documentation's
39+
/// `compare_exchange` function for more information. When `compare_exchange`
40+
/// panics, this is undefined behavior. Currently this function aborts the
41+
/// process with an undefined instruction.
42+
#[inline]
43+
#[cfg_attr(test, assert_instr(cmpxchg16b, success = Ordering::SeqCst, failure = Ordering::SeqCst))]
44+
#[target_feature(enable = "cmpxchg16b")]
45+
pub unsafe fn cmpxchg16b(
46+
dst: *mut u128,
47+
old: u128,
48+
new: u128,
49+
success: Ordering,
50+
failure: Ordering,
51+
) -> u128 {
52+
use intrinsics;
53+
use sync::atomic::Ordering::*;
54+
55+
// Debug-build-only check that `dst` satisfies the 16-byte alignment
// requirement; release builds do not perform this check.
debug_assert!(dst as usize % 16 == 0);
56+
57+
// Dispatch on the (success, failure) ordering pair to the matching
// compare-exchange intrinsic. Each intrinsic yields a tuple; only the first
// element (`val`) is used and the second (`_ok`) is deliberately ignored.
let (val, _ok) = match (success, failure) {
58+
(Acquire, Acquire) => intrinsics::atomic_cxchg_acq(dst, old, new),
59+
(Release, Relaxed) => intrinsics::atomic_cxchg_rel(dst, old, new),
60+
(AcqRel, Acquire) => intrinsics::atomic_cxchg_acqrel(dst, old, new),
61+
(Relaxed, Relaxed) => intrinsics::atomic_cxchg_relaxed(dst, old, new),
62+
(SeqCst, SeqCst) => intrinsics::atomic_cxchg(dst, old, new),
63+
(Acquire, Relaxed) => intrinsics::atomic_cxchg_acq_failrelaxed(dst, old, new),
64+
(AcqRel, Relaxed) => intrinsics::atomic_cxchg_acqrel_failrelaxed(dst, old, new),
65+
(SeqCst, Relaxed) => intrinsics::atomic_cxchg_failrelaxed(dst, old, new),
66+
(SeqCst, Acquire) => intrinsics::atomic_cxchg_failacq(dst, old, new),
67+
68+
// The above block is all copied from libcore, and this statement is
69+
// also copied from libcore except that it's a panic in libcore and we
70+
// have a little bit more of a lightweight panic here.
71+
// Any ordering pair not listed above falls through to `ud2()` rather than
// a formatted panic.
_ => ::coresimd::x86::ud2(),
72+
};
73+
// Hand back the value component of the intrinsic's result to the caller.
val
74+
}

coresimd/x86_64/mod.rs

+3
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,6 @@ pub use self::bswap::*;
3838

3939
mod rdrand;
4040
pub use self::rdrand::*;
41+
42+
mod cmpxchg16b;
43+
pub use self::cmpxchg16b::*;

crates/coresimd/src/lib.rs

+5
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
sse4a_target_feature,
3333
arm_target_feature,
3434
aarch64_target_feature,
35+
cmpxchg16b_target_feature,
3536
avx512_target_feature,
3637
mips_target_feature,
3738
powerpc_target_feature,
@@ -74,6 +75,8 @@
7475
test(attr(allow(dead_code, deprecated, unused_variables, unused_mut)))
7576
)]
7677

78+
#[macro_use]
79+
#[allow(unused_imports)]
7780
extern crate core as _core;
7881
#[cfg(test)]
7982
#[macro_use]
@@ -129,6 +132,8 @@ use _core::result;
129132
#[allow(unused_imports)]
130133
use _core::slice;
131134
#[allow(unused_imports)]
135+
use _core::sync;
136+
#[allow(unused_imports)]
132137
use _core::u128;
133138
#[allow(unused_imports)]
134139
use _core::u8;

crates/stdsimd-test/src/disassembly.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,9 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
101101
.skip_while(|s| {
102102
s.len() == expected_len
103103
&& usize::from_str_radix(s, 16).is_ok()
104-
}).map(|s| s.to_string())
104+
})
105+
.skip_while(|s| *s == "lock") // skip x86-specific prefix
106+
.map(|s| s.to_string())
105107
.collect::<Vec<String>>();
106108
instructions.push(Instruction { parts });
107109
}
@@ -198,6 +200,7 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
198200
.skip_while(|s| {
199201
s.len() == 2 && usize::from_str_radix(s, 16).is_ok()
200202
}).map(|s| s.to_string())
203+
.skip_while(|s| *s == "lock") // skip x86-specific prefix
201204
.collect::<Vec<String>>();
202205
instructions.push(Instruction { parts });
203206
}

crates/stdsimd-verify/src/lib.rs

+2
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,9 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
124124
"u16" => quote! { &U16 },
125125
"u32" => quote! { &U32 },
126126
"u64" => quote! { &U64 },
127+
"u128" => quote! { &U128 },
127128
"u8" => quote! { &U8 },
129+
"Ordering" => quote! { &ORDERING },
128130
"CpuidResult" => quote! { &CPUID },
129131

130132
// arm ...

crates/stdsimd-verify/tests/x86-intel.rs

+5
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ static I8: Type = Type::PrimSigned(8);
3737
static U16: Type = Type::PrimUnsigned(16);
3838
static U32: Type = Type::PrimUnsigned(32);
3939
static U64: Type = Type::PrimUnsigned(64);
40+
static U128: Type = Type::PrimUnsigned(128);
4041
static U8: Type = Type::PrimUnsigned(8);
42+
static ORDERING: Type = Type::Ordering;
4143

4244
static M64: Type = Type::M64;
4345
static M128: Type = Type::M128;
@@ -75,6 +77,7 @@ enum Type {
7577
Tuple,
7678
CpuidResult,
7779
Never,
80+
Ordering,
7881
}
7982

8083
stdsimd_verify::x86_functions!(static FUNCTIONS);
@@ -145,6 +148,8 @@ fn verify_all_signatures() {
145148
"__cpuid_count" |
146149
"__cpuid" |
147150
"__get_cpuid_max" |
151+
// Not listed with intel, but manually verified
152+
"cmpxchg16b" |
148153
// The UD2 intrinsic is not defined by Intel, but it was agreed on
149154
// in the RFC Issue 2512:
150155
// https://github.com/rust-lang/rfcs/issues/2512

stdsimd/arch/detect/arch/x86.rs

+6
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,10 @@ macro_rules! is_x86_feature_detected {
226226
cfg!(target_feature = "xsavec") || $crate::arch::detect::check_for(
227227
$crate::arch::detect::Feature::xsavec)
228228
};
229+
("cmpxchg16b") => {
230+
cfg!(target_feature = "cmpxchg16b") || $crate::arch::detect::check_for(
231+
$crate::arch::detect::Feature::cmpxchg16b)
232+
};
229233
($t:tt) => {
230234
compile_error!(concat!("unknown target feature: ", $t))
231235
};
@@ -316,4 +320,6 @@ pub enum Feature {
316320
xsaves,
317321
/// XSAVEC (Save Processor Extended States Compacted)
318322
xsavec,
323+
/// CMPXCHG16B, a 16-byte compare-and-swap instruction
324+
cmpxchg16b,
319325
}

stdsimd/arch/detect/os/x86.rs

+6
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ fn detect_features() -> cache::Initializer {
116116

117117
enable(proc_info_ecx, 0, Feature::sse3);
118118
enable(proc_info_ecx, 9, Feature::ssse3);
119+
enable(proc_info_ecx, 13, Feature::cmpxchg16b);
119120
enable(proc_info_ecx, 19, Feature::sse4_1);
120121
enable(proc_info_ecx, 20, Feature::sse4_2);
121122
enable(proc_info_ecx, 23, Feature::popcnt);
@@ -288,6 +289,7 @@ mod tests {
288289
println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
289290
println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
290291
println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
292+
println!("cmpxchg16b: {:?}", is_x86_feature_detected!("cmpxchg16b"));
291293
}
292294

293295
#[test]
@@ -344,5 +346,9 @@ mod tests {
344346
is_x86_feature_detected!("xsaves"),
345347
information.xsaves_xrstors_and_ia32_xss()
346348
);
349+
assert_eq!(
350+
is_x86_feature_detected!("cmpxchg16b"),
351+
information.cmpxchg16b(),
352+
);
347353
}
348354
}

0 commit comments

Comments
 (0)