From 233e651e31165c76f5930e7d2b27554950b7486b Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Fri, 4 Nov 2022 15:44:26 -0400 Subject: [PATCH 01/11] feat: add 256bit vpclmulqdq support Signed-off-by: Schrodinger ZHU Yifan --- Cargo.toml | 1 + src/lib.rs | 15 ++- src/pclmulqdq/mod.rs | 54 +++++++--- src/pclmulqdq/{x86.rs => x86/mod.rs} | 19 ++-- src/pclmulqdq/x86/vpclmulqdq.rs | 153 +++++++++++++++++++++++++++ 5 files changed, 220 insertions(+), 22 deletions(-) rename src/pclmulqdq/{x86.rs => x86/mod.rs} (73%) create mode 100644 src/pclmulqdq/x86/vpclmulqdq.rs diff --git a/Cargo.toml b/Cargo.toml index 5832066..403a2ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ rand = "0.8" [features] pmull = [] +vpclmulqdq = [] fake-simd = [] [[bench]] diff --git a/src/lib.rs b/src/lib.rs index 847f2c1..17ac0e3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,11 +22,20 @@ feature = "pmull", feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm) )] +#![cfg_attr( + feature = "vpclmulqdq", + feature( + simd_ffi, + link_llvm_intrinsics, + avx512_target_feature, + target_feature_11 + ) +)] mod pclmulqdq; mod table; -type UpdateFn = fn(u64, &[u8]) -> u64; +type UpdateFn = unsafe fn(u64, &[u8]) -> u64; /// Represents an in-progress CRC-64 computation. #[derive(Clone)] @@ -57,7 +66,9 @@ impl Digest { /// Writes some data into the digest. pub fn write(&mut self, bytes: &[u8]) { - self.state = (self.computer)(self.state, bytes); + unsafe { + self.state = (self.computer)(self.state, bytes); + } } /// Computes the current CRC-64-ECMA value. diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs index b329808..c709aec 100644 --- a/src/pclmulqdq/mod.rs +++ b/src/pclmulqdq/mod.rs @@ -7,21 +7,23 @@ //! //! [white paper]: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +use std::{ + fmt::Debug, + ops::{BitXor, BitXorAssign}, +}; + +use super::table; + +use self::arch::Simd; + #[cfg(not(feature = "fake-simd"))] -#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")] +#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86/mod.rs")] #[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")] mod arch; #[cfg(feature = "fake-simd")] mod arch; -use self::arch::Simd; -use super::table; -use std::{ - fmt::Debug, - ops::{BitXor, BitXorAssign}, -}; - /// This trait must be implemented on `self::arch::Simd` to provide the /// platform-specific SIMD implementations. trait SimdExt: Copy + Debug + BitXor { @@ -70,24 +72,47 @@ impl BitXorAssign for Simd { } pub fn get_update() -> super::UpdateFn { + #[cfg(all(feature = "vpclmulqdq"))] + { + use arch::vpclmulqdq::*; + if Simd256::is_supported() { + return update_256_batch; + } + } + if Simd::is_supported() { - update + update_128_batch } else { table::update } } -fn update(mut state: u64, bytes: &[u8]) -> u64 { - let (left, middle, right) = unsafe { bytes.align_to::<[Simd; 8]>() }; +// This function is unsafe because it uses platform dependent functions. +unsafe fn update_128_batch(mut state: u64, bytes: &[u8]) -> u64 { + let (left, middle, right) = bytes.align_to::<[Simd; 8]>(); if let Some((first, rest)) = middle.split_first() { state = table::update(state, left); - state = unsafe { update_simd(state, first, rest) }; + state = update_simd(state, first, rest); table::update(state, right) } else { table::update(state, bytes) } } +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")] +unsafe fn update_256_batch(mut state: u64, bytes: &[u8]) -> u64 { + use arch::vpclmulqdq::*; + let (left, middle, right) = bytes.align_to::<[[Simd256; 4]; 2]>(); + if let Some((first, rest)) = middle.split_first() { + state = update_128_batch(state, left); + state = update_vpclmulqdq(state, first, rest); + update_128_batch(state, right) + } else { + update_128_batch(state, bytes) + } +} + #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1") @@ -111,6 +136,11 @@ unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64 } } + fold_tail(x) +} + +#[inline(always)] +unsafe fn fold_tail(x: [Simd; 8]) -> u64 { let coeffs = [ Simd::new(table::K_895, table::K_959), // fold by distance of 112 bytes Simd::new(table::K_767, table::K_831), // fold by distance of 96 bytes diff --git a/src/pclmulqdq/x86.rs b/src/pclmulqdq/x86/mod.rs similarity index 73% rename from src/pclmulqdq/x86.rs rename to src/pclmulqdq/x86/mod.rs index 2fce99a..2e18de8 100644 --- a/src/pclmulqdq/x86.rs +++ b/src/pclmulqdq/x86/mod.rs @@ -8,6 +8,9 @@ use std::arch::x86::*; use std::arch::x86_64::*; use std::ops::BitXor; +#[cfg(all(feature = "vpclmulqdq"))] +pub mod vpclmulqdq; + #[repr(transparent)] #[derive(Copy, Clone, Debug)] pub struct Simd(__m128i); @@ -28,8 +31,8 @@ impl super::SimdExt for Simd { #[inline] #[target_feature(enable = "sse2", enable = "pclmulqdq")] unsafe fn fold_16(self, coeff: Self) -> Self { - let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x11)); - let l = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00)); + let h = Self(_mm_clmulepi64_si128::<0x11>(self.0, coeff.0)); + let l = Self(_mm_clmulepi64_si128::<0x00>(self.0, coeff.0)); h ^ l } @@ -37,8 +40,8 @@ impl super::SimdExt for Simd { #[target_feature(enable = "sse2", enable = "pclmulqdq")] unsafe fn fold_8(self, coeff: u64) -> Self { let coeff = Self::new(0, coeff); - let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00)); - let l = Self(_mm_srli_si128(self.0, 8)); + let h = Self(_mm_clmulepi64_si128::<0x00>(self.0, coeff.0)); + let l = Self(_mm_srli_si128::<8>(self.0)); h ^ l } @@ -46,11 +49,11 @@ impl super::SimdExt for Simd { #[target_feature(enable = "sse2", enable = "sse4.1", enable = "pclmulqdq")] unsafe fn barrett(self, poly: u64, mu: u64) -> u64 { let polymu = Self::new(poly, mu); - let t1 = _mm_clmulepi64_si128(self.0, polymu.0, 0x00); - let h = Self(_mm_slli_si128(t1, 8)); - let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10)); + let t1 = _mm_clmulepi64_si128::<0x00>(self.0, polymu.0); + let h = Self(_mm_slli_si128::<8>(t1)); + let l = Self(_mm_clmulepi64_si128::<0x10>(t1, polymu.0)); let reduced = h ^ l ^ self; - _mm_extract_epi64(reduced.0, 1) as u64 + _mm_extract_epi64::<1>(reduced.0) as u64 } } diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86/vpclmulqdq.rs new file mode 100644 index 0000000..cdf9e6c --- /dev/null +++ b/src/pclmulqdq/x86/vpclmulqdq.rs @@ -0,0 +1,153 @@ +use core::ops::BitXor; + +use super::{__cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256, Simd, super::fold_tail}; + +// PCLMULQDQ can be used without avx512vl. However, this is only addressed by rust recently --- so we +// need to manually specify the intrinsic, otherwise rustc will inline it poorly. +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.pclmulqdq.256"] + fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i; +} + +#[derive(Clone, Copy, Debug)] +pub struct Simd256(__m256i); + +impl Simd256 { + #[inline] + pub fn is_supported() -> bool { + let avx2 = is_x86_feature_detected!("avx2"); + // Rust is very confused about VPCLMULQDQ + // Let us detect it use CPUID directly + let leaf_7 = unsafe { __cpuid_count(7, 0) }; + let vpclmulqdq = (leaf_7.ecx & (1u32 << 10)) != 0; + avx2 && vpclmulqdq + } + + #[inline] + #[target_feature(enable = "avx2")] + pub unsafe fn new(x3: u64, x2: u64, x1: u64, x0: u64) -> Self { + Self(_mm256_set_epi64x(x3 as _, x2 as _, x1 as _, x0 as _)) + } + + #[inline] + #[target_feature(enable = "avx2")] + pub unsafe fn to_simd_x8(self4: [Self; 4]) -> [Simd; 8] { + core::mem::transmute(self4) + } + + #[inline] + #[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")] + pub unsafe fn fold_32(self, coeff: Self) -> Self { + let h = pclmulqdq_256(self.0, coeff.0, 0x11); + let l = pclmulqdq_256(self.0, coeff.0, 0x00); + Self(h) ^ Self(l) + } +} + +impl BitXor for Simd256 { + type Output = Self; + + #[inline(always)] + fn bitxor(self, other: Self) -> Self { + Self(unsafe { _mm256_xor_si256(self.0, other.0) }) + } +} + +#[inline] +#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")] +pub(crate) unsafe fn update_vpclmulqdq( + state: u64, + first: &[[Simd256; 4]; 2], + rest: &[[[Simd256; 4]; 2]], +) -> u64 { + // receive the initial 128 bytes of data + let (mut x, y) = (first[0], first[1]); + + // xor the initial CRC value + x[0] = x[0] ^ Simd256::new(0, 0, 0, state); + + let coeff = Simd256::new( + crate::table::K_1023, + crate::table::K_1087, + crate::table::K_1023, + crate::table::K_1087, + ); + + x[0] = x[0].fold_32(coeff) ^ y[0]; + x[1] = x[1].fold_32(coeff) ^ y[1]; + x[2] = x[2].fold_32(coeff) ^ y[2]; + x[3] = x[3].fold_32(coeff) ^ y[3]; + + // perform 256-byte folding. + for chunk in rest { + let chunk = *chunk; + x[0] = x[0].fold_32(coeff) ^ chunk[0][0]; + x[0] = x[0].fold_32(coeff) ^ chunk[1][0]; + x[1] = x[1].fold_32(coeff) ^ chunk[0][1]; + x[1] = x[1].fold_32(coeff) ^ chunk[1][1]; + x[2] = x[2].fold_32(coeff) ^ chunk[0][2]; + x[2] = x[2].fold_32(coeff) ^ chunk[1][2]; + x[3] = x[3].fold_32(coeff) ^ chunk[0][3]; + x[3] = x[3].fold_32(coeff) ^ chunk[1][3]; + } + + let x = Simd256::to_simd_x8(x); + fold_tail(x) +} + +impl PartialEq for Simd256 { + fn eq(&self, other: &Self) -> bool { + unsafe { + use core::mem::transmute; + let a: [u128; 2] = transmute(*self); + let b: [u128; 2] = transmute(*other); + a == b + } + } +} + +impl Eq for Simd256 {} + +#[cfg(target_feature = "avx2")] +#[test] +fn test_size_and_alignment() { + assert_eq!(std::mem::size_of::(), 32); + assert_eq!(std::mem::align_of::(), 32); +} + +#[cfg(target_feature = "avx2")] +#[test] +fn test_new() { + unsafe { + let x = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804); + let y = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804); + let z = Simd256::new(0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e, 0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e); + assert_eq!(x, y); + assert_ne!(x, z); + } +} + +#[cfg(target_feature = "avx2")] +#[test] +fn test_xor() { + unsafe { + let x = Simd256::new(0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63, 0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63); + let y = Simd256::new(0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4, 0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4); + let mut z = x ^ y; + assert_eq!(z, Simd256::new(0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297, 0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297)); + z = z ^ Simd256::new(0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51, 0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51); + assert_eq!(z, Simd256::new(0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6, 0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6)); + } +} + +#[cfg(all(target_feature = "avx2", target_feature = "avx512vpclmulqdq"))] +#[test] +fn test_fold_32() { + unsafe { + let x = Simd256::new(0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21, 0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21); + let f = x.fold_32(Simd256::new(0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5, 0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5)); + assert_eq!(f, Simd256::new(0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd, 0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd)); + } +} + From 806ea64033a52466dd7cae351c1d93da55216c98 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Fri, 4 Nov 2022 16:12:04 -0400 Subject: [PATCH 02/11] switch to lazy static to avoid extra overhead Signed-off-by: Schrodinger ZHU Yifan --- Cargo.toml | 3 +- src/pclmulqdq/x86/vpclmulqdq.rs | 103 ++++++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 403a2ea..3ce4d92 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ readme = "README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +lazy_static = { version = "1.4.0", optional = true } [dev-dependencies] crc = "1" @@ -22,7 +23,7 @@ rand = "0.8" [features] pmull = [] -vpclmulqdq = [] +vpclmulqdq = ["lazy_static"] fake-simd = [] [[bench]] diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86/vpclmulqdq.rs index cdf9e6c..7295ecc 100644 --- a/src/pclmulqdq/x86/vpclmulqdq.rs +++ b/src/pclmulqdq/x86/vpclmulqdq.rs @@ -1,6 +1,6 @@ +use super::{super::fold_tail, Simd, __cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256}; use core::ops::BitXor; - -use super::{__cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256, Simd, super::fold_tail}; +use lazy_static::lazy_static; // PCLMULQDQ can be used without avx512vl. However, this is only addressed by rust recently --- so we // need to manually specify the intrinsic, otherwise rustc will inline it poorly. @@ -13,15 +13,21 @@ extern "C" { #[derive(Clone, Copy, Debug)] pub struct Simd256(__m256i); -impl Simd256 { - #[inline] - pub fn is_supported() -> bool { +lazy_static! { + static ref VPCLMULQDQ_SUPPORTED : bool = { let avx2 = is_x86_feature_detected!("avx2"); // Rust is very confused about VPCLMULQDQ // Let us detect it use CPUID directly let leaf_7 = unsafe { __cpuid_count(7, 0) }; let vpclmulqdq = (leaf_7.ecx & (1u32 << 10)) != 0; avx2 && vpclmulqdq + }; +} + +impl Simd256 { + #[inline] + pub fn is_supported() -> bool { + *VPCLMULQDQ_SUPPORTED } #[inline] @@ -120,9 +126,24 @@ fn test_size_and_alignment() { #[test] fn test_new() { unsafe { - let x = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804); - let y = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804); - let z = Simd256::new(0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e, 0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e); + let x = Simd256::new( + 0xd7c8_11cf_e5c5_c792, + 0x86e6_5c36_e68b_4804, + 0xd7c8_11cf_e5c5_c792, + 0x86e6_5c36_e68b_4804, + ); + let y = Simd256::new( + 0xd7c8_11cf_e5c5_c792, + 0x86e6_5c36_e68b_4804, + 0xd7c8_11cf_e5c5_c792, + 0x86e6_5c36_e68b_4804, + ); + let z = Simd256::new( + 0xfa3e_0099_cd5e_d60d, + 0xad71_9ee6_57d1_498e, + 0xfa3e_0099_cd5e_d60d, + 0xad71_9ee6_57d1_498e, + ); assert_eq!(x, y); assert_ne!(x, z); } @@ -132,12 +153,43 @@ fn test_new() { #[test] fn test_xor() { unsafe { - let x = Simd256::new(0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63, 0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63); - let y = Simd256::new(0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4, 0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4); + let x = Simd256::new( + 0xe450_87f9_b031_0d47, + 0x3d72_e92a_96c7_4c63, + 0xe450_87f9_b031_0d47, + 0x3d72_e92a_96c7_4c63, + ); + let y = Simd256::new( + 0x7ed8_ae0a_dfbd_89c0, + 0x1c9b_dfaa_953e_0ef4, + 0x7ed8_ae0a_dfbd_89c0, + 0x1c9b_dfaa_953e_0ef4, + ); let mut z = x ^ y; - assert_eq!(z, Simd256::new(0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297, 0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297)); - z = z ^ Simd256::new(0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51, 0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51); - assert_eq!(z, Simd256::new(0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6, 0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6)); + assert_eq!( + z, + Simd256::new( + 0x9a88_29f3_6f8c_8487, + 0x21e9_3680_03f9_4297, + 0x9a88_29f3_6f8c_8487, + 0x21e9_3680_03f9_4297 + ) + ); + z = z ^ Simd256::new( + 0x57a2_0f44_c005_b2ea, + 0x7056_bde9_9303_aa51, + 0x57a2_0f44_c005_b2ea, + 0x7056_bde9_9303_aa51, + ); + assert_eq!( + z, + Simd256::new( + 0xcd2a_26b7_af89_366d, + 0x51bf_8b69_90fa_e8c6, + 0xcd2a_26b7_af89_366d, + 0x51bf_8b69_90fa_e8c6 + ) + ); } } @@ -145,9 +197,26 @@ fn test_xor() { #[test] fn test_fold_32() { unsafe { - let x = Simd256::new(0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21, 0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21); - let f = x.fold_32(Simd256::new(0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5, 0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5)); - assert_eq!(f, Simd256::new(0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd, 0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd)); + let x = Simd256::new( + 0xb5f1_2590_5645_0b6c, + 0x333a_2c49_c361_9e21, + 0xb5f1_2590_5645_0b6c, + 0x333a_2c49_c361_9e21, + ); + let f = x.fold_32(Simd256::new( + 0xbecc_9dd9_038f_c366, + 0x5ba9_365b_e2e9_5bf5, + 0xbecc_9dd9_038f_c366, + 0x5ba9_365b_e2e9_5bf5, + )); + assert_eq!( + f, + Simd256::new( + 0x4f55_42df_ef35_1810, + 0x0c03_5bd6_70fc_5abd, + 0x4f55_42df_ef35_1810, + 0x0c03_5bd6_70fc_5abd + ) + ); } } - From 83ecd0b1ec79caf4951d22457820a67e7395e48e Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Wed, 11 Dec 2024 23:55:24 -0800 Subject: [PATCH 03/11] Adapt to use latest Rust SIMD changes --- src/lib.rs | 7 +------ src/pclmulqdq/mod.rs | 2 +- src/pclmulqdq/x86/vpclmulqdq.rs | 6 +++--- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5925d18..22b14b7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,12 +20,7 @@ #![cfg_attr( feature = "vpclmulqdq", - feature( - simd_ffi, - link_llvm_intrinsics, - avx512_target_feature, - target_feature_11 - ) + feature(simd_ffi, link_llvm_intrinsics, avx512_target_feature,) )] mod pclmulqdq; diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs index f483760..67888e4 100644 --- a/src/pclmulqdq/mod.rs +++ b/src/pclmulqdq/mod.rs @@ -100,7 +100,7 @@ unsafe fn update_128_batch(mut state: u64, bytes: &[u8]) -> u64 { } #[cfg(feature = "vpclmulqdq")] -#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")] +#[target_feature(enable = "avx2", enable = "vpclmulqdq")] unsafe fn update_256_batch(mut state: u64, bytes: &[u8]) -> u64 { use arch::vpclmulqdq::*; let (left, middle, right) = bytes.align_to::<[[Simd256; 4]; 2]>(); diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86/vpclmulqdq.rs index 7295ecc..e085e24 100644 --- a/src/pclmulqdq/x86/vpclmulqdq.rs +++ b/src/pclmulqdq/x86/vpclmulqdq.rs @@ -43,7 +43,7 @@ impl Simd256 { } #[inline] - #[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")] + #[target_feature(enable = "avx2", enable = "vpclmulqdq")] pub unsafe fn fold_32(self, coeff: Self) -> Self { let h = pclmulqdq_256(self.0, coeff.0, 0x11); let l = pclmulqdq_256(self.0, coeff.0, 0x00); @@ -61,7 +61,7 @@ impl BitXor for Simd256 { } #[inline] -#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")] +#[target_feature(enable = "avx2", enable = "vpclmulqdq")] pub(crate) unsafe fn update_vpclmulqdq( state: u64, first: &[[Simd256; 4]; 2], @@ -193,7 +193,7 @@ fn test_xor() { } } -#[cfg(all(target_feature = "avx2", target_feature = "avx512vpclmulqdq"))] +#[cfg(all(target_feature = "avx2", target_feature = "vpclmulqdq"))] #[test] fn test_fold_32() { unsafe { From 2b401e124d099572f3af7925edc308a556841160 Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 12 Dec 2024 00:07:15 -0800 Subject: [PATCH 04/11] Add links to issues for tracking unstable features --- src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 22b14b7..00e9c7e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,12 @@ //! let checksum = c.sum64(); //! assert_eq!(checksum, 0xd9160d1fa8e418e3); //! ``` +//! +//! Tracking links for unstable features used here (which require nightly builds): +//! +//! - simd_ffi: https://github.com/rust-lang/rust/issues/27731 +//! - link_llvm_intrinsics: https://github.com/rust-lang/rust/issues/29602 +//! - avx512_target_feature: https://github.com/rust-lang/rust/issues/111137 #![cfg_attr( feature = "vpclmulqdq", From 5f6346d09c1ffc78051c0e0fe2a43723481811a6 Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 12 Dec 2024 12:43:22 -0800 Subject: [PATCH 05/11] Update README with VPCLMULQDQ details --- README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 912f673..d33a3c7 100644 --- a/README.md +++ b/README.md @@ -50,15 +50,18 @@ be chosen based on CPU feature at runtime. [crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html -## TODO +## Experimental VPCLMULQDQ support -This crate is mainly intended for use in TiKV only. -Features beyond AArch64 are unlikely to be implemented. +Using Rust's support for [AVX512 intrinsics](https://github.com/rust-lang/rust/issues/111137), specifically [VPCLMULQDQ](https://doc.rust-lang.org/src/core/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs.html), we can massively improve throughput for x86_64 processors which support them (Intel Ice Lake+ and AMD Zen4+). -* [x] AArch64 support based on PMULL -* [ ] `no_std` support -* [x] Fuzz test -* [ ] Custom polynomial +Specifically, on an `m7i.8xlarge` EC2 instance (4th gen Xeon, aka Sapphire Rapids), throughput approximately _doubles_ from ~26GiB/s to ~52GiB/s. + +Since these are currently marked as unstable features in Rust, you'll need to build with `nightly` and enable the `vpclmulqdq` feature: + +``` +rustup toolchain install nightly +cargo +nightly build --features="vpclmulqdq" -r +``` ## License From 732a20d18dd961d668c1846d5af3821670cb1c81 Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 12 Dec 2024 14:35:43 -0800 Subject: [PATCH 06/11] Incorporate x86 changes into vpclmulqdq --- src/pclmulqdq/mod.rs | 2 +- src/pclmulqdq/{x86_64.rs => x86.rs} | 29 ++++++++++++++++----- src/pclmulqdq/{x86 => x86_64}/mod.rs | 13 ++------- src/pclmulqdq/{x86 => x86_64}/vpclmulqdq.rs | 0 4 files changed, 25 insertions(+), 19 deletions(-) rename src/pclmulqdq/{x86_64.rs => x86.rs} (61%) rename src/pclmulqdq/{x86 => x86_64}/mod.rs (82%) rename src/pclmulqdq/{x86 => x86_64}/vpclmulqdq.rs (100%) diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs index 9f85219..ca4ed00 100644 --- a/src/pclmulqdq/mod.rs +++ b/src/pclmulqdq/mod.rs @@ -17,7 +17,7 @@ use super::table; use self::arch::Simd; #[cfg(not(feature = "fake-simd"))] -#[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")] +#[cfg_attr(target_arch = "x86_64", path = "x86_64/mod.rs")] #[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")] #[cfg_attr(target_arch = "x86", path = "x86.rs")] mod arch; diff --git a/src/pclmulqdq/x86_64.rs b/src/pclmulqdq/x86.rs similarity index 61% rename from src/pclmulqdq/x86_64.rs rename to src/pclmulqdq/x86.rs index 63abd80..8dc41b6 100644 --- a/src/pclmulqdq/x86_64.rs +++ b/src/pclmulqdq/x86.rs @@ -1,9 +1,9 @@ // Copyright 2020 TiKV Project Authors. Licensed under MIT or Apache-2.0. -//! x86_64 implementation of the PCLMULQDQ-based CRC calculation. +//! x86 (32-bit) implementation of the PCLMULQDQ-based CRC calculation. -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; use std::ops::BitXor; #[repr(transparent)] @@ -14,13 +14,20 @@ impl super::SimdExt for Simd { fn is_supported() -> bool { is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128 && is_x86_feature_detected!("sse2") // (all other _mm_*) - && is_x86_feature_detected!("sse4.1") // _mm_extract_epi64 + && is_x86_feature_detected!("sse4.1") } #[inline] #[target_feature(enable = "sse2")] unsafe fn new(high: u64, low: u64) -> Self { - Self(_mm_set_epi64x(high as i64, low as i64)) + // On 32-bit systems, we need to split u64 into low and high 32-bit parts + let high_low = (high & 0xFFFFFFFF) as i32; + let high_high = ((high >> 32) & 0xFFFFFFFF) as i32; + let low_low = (low & 0xFFFFFFFF) as i32; + let low_high = ((low >> 32) & 0xFFFFFFFF) as i32; + + // Create the 128-bit vector using 32-bit parts + Self(_mm_set_epi32(high_high, high_low, low_high, low_low)) } #[inline] @@ -48,7 +55,15 @@ impl super::SimdExt for Simd { let h = Self(_mm_slli_si128(t1, 8)); let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10)); let reduced = h ^ l ^ self; - _mm_extract_epi64(reduced.0, 1) as u64 + + // Store the result in memory and read it back as u64 + // This approach is more reliable for handling 64-bit values on 32-bit systems + let mut result: [u32; 4] = [0; 4]; + _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, reduced.0); + + // Combine the two 32-bit values into a 64-bit result + // We want the high 64 bits (indices 2 and 3) + ((result[3] as u64) << 32) | (result[2] as u64) } } @@ -58,4 +73,4 @@ impl BitXor for Simd { fn bitxor(self, other: Self) -> Self { Self(unsafe { _mm_xor_si128(self.0, other.0) }) } -} +} \ No newline at end of file diff --git a/src/pclmulqdq/x86/mod.rs b/src/pclmulqdq/x86_64/mod.rs similarity index 82% rename from src/pclmulqdq/x86/mod.rs rename to src/pclmulqdq/x86_64/mod.rs index 6181a0d..401a619 100644 --- a/src/pclmulqdq/x86/mod.rs +++ b/src/pclmulqdq/x86_64/mod.rs @@ -2,8 +2,8 @@ //! x86 (32-bit) implementation of the PCLMULQDQ-based CRC calculation. -#[cfg(target_arch = "x86")] -use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; use std::ops::BitXor; #[cfg(all(feature = "vpclmulqdq"))] @@ -59,15 +59,6 @@ impl super::SimdExt for Simd { let l = Self(_mm_clmulepi64_si128::<0x10>(t1, polymu.0)); let reduced = h ^ l ^ self; _mm_extract_epi64::<1>(reduced.0) as u64 - - // Store the result in memory and read it back as u64 - // This approach is more reliable for handling 64-bit values on 32-bit systems - let mut result: [u32; 4] = [0; 4]; - _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, reduced.0); - - // Combine the two 32-bit values into a 64-bit result - // We want the high 64 bits (indices 2 and 3) - ((result[3] as u64) << 32) | (result[2] as u64) } } diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86_64/vpclmulqdq.rs similarity index 100% rename from src/pclmulqdq/x86/vpclmulqdq.rs rename to src/pclmulqdq/x86_64/vpclmulqdq.rs From 5bcf1187d35caeb54146ba150b33c31b85937e77 Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 12 Dec 2024 15:23:45 -0800 Subject: [PATCH 07/11] Fix formatting --- src/pclmulqdq/x86.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pclmulqdq/x86.rs b/src/pclmulqdq/x86.rs index 8dc41b6..55532c1 100644 --- a/src/pclmulqdq/x86.rs +++ b/src/pclmulqdq/x86.rs @@ -73,4 +73,4 @@ impl BitXor for Simd { fn bitxor(self, other: Self) -> Self { Self(unsafe { _mm_xor_si128(self.0, other.0) }) } -} \ No newline at end of file +} From 602e0e91be94d327249986cc75a78c9c96a8b01e Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 12 Dec 2024 15:24:00 -0800 Subject: [PATCH 08/11] Improve docs with more VPCLMULQDQ details --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4516b1f..368013d 100644 --- a/README.md +++ b/README.md @@ -48,11 +48,12 @@ be chosen based on CPU feature at runtime. |:----------------------------|--------------------:|---------------------:| | [crc 3.0.1] | 0.5 GiB/s | 0.3 GiB/s | | crc64fast-nvme (table) | 2.3 GiB/s | 1.8 GiB/s | -| crc64fast-nvme (simd) | 28.2 GiB/s | 20.0 GiB/s | +| crc64fast-nvme (SIMD) | 28.2 GiB/s | 20.0 GiB/s | +| crc64fast-nvme (VPCLMULQDQ) | 52 GiB/s | n/a | [crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html -## Experimental VPCLMULQDQ support +## Experimental "Vector Carry-Less Multiplication of Quadwords" (VPCLMULQDQ) support Using Rust's support for [AVX512 intrinsics](https://github.com/rust-lang/rust/issues/111137), specifically [VPCLMULQDQ](https://doc.rust-lang.org/src/core/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs.html), we can massively improve throughput for x86_64 processors which support them (Intel Ice Lake+ and AMD Zen4+). @@ -79,6 +80,7 @@ cargo +nightly build --features="vpclmulqdq" -r * [StackOverflow PCLMULQDQ CRC32 question](https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq) - Insightful question & answer to CRC32 implementation details. * [AWS S3 announcement about CRC64-NVME support](https://aws.amazon.com/blogs/aws/introducing-default-data-integrity-protections-for-new-objects-in-amazon-s3/) * [AWS S3 docs on checking object integrity using CRC64-NVME](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) +* [Vector Carry-Less Multiplication of Quadwords (VPCLMULQDQ) details](https://en.wikichip.org/wiki/x86/vpclmulqdq) ## License From a945288d1aa9fec71f25f71b418d6ad92791daec Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 19 Dec 2024 18:48:14 -0800 Subject: [PATCH 09/11] Use internal Rust functions --- src/lib.rs | 2 +- src/pclmulqdq/x86_64/vpclmulqdq.rs | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 00e9c7e..b306ce3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,7 @@ #![cfg_attr( feature = "vpclmulqdq", - feature(simd_ffi, link_llvm_intrinsics, avx512_target_feature,) + feature(avx512_target_feature, stdarch_x86_avx512) )] mod pclmulqdq; diff --git a/src/pclmulqdq/x86_64/vpclmulqdq.rs b/src/pclmulqdq/x86_64/vpclmulqdq.rs index e085e24..d25900e 100644 --- a/src/pclmulqdq/x86_64/vpclmulqdq.rs +++ b/src/pclmulqdq/x86_64/vpclmulqdq.rs @@ -1,18 +1,12 @@ use super::{super::fold_tail, Simd, __cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256}; use core::ops::BitXor; use lazy_static::lazy_static; - -// PCLMULQDQ can be used without avx512vl. However, this is only addressed by rust recently --- so we -// need to manually specify the intrinsic, otherwise rustc will inline it poorly. -#[allow(improper_ctypes)] -extern "C" { - #[link_name = "llvm.x86.pclmulqdq.256"] - fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i; -} +use std::arch::x86_64::_mm256_clmulepi64_epi128; #[derive(Clone, Copy, Debug)] pub struct Simd256(__m256i); +// this lazy_static bit takes throughput from ~39GiB/s to ~52GiB/s lazy_static! { static ref VPCLMULQDQ_SUPPORTED : bool = { let avx2 = is_x86_feature_detected!("avx2"); @@ -45,8 +39,8 @@ impl Simd256 { #[inline] #[target_feature(enable = "avx2", enable = "vpclmulqdq")] pub unsafe fn fold_32(self, coeff: Self) -> Self { - let h = pclmulqdq_256(self.0, coeff.0, 0x11); - let l = pclmulqdq_256(self.0, coeff.0, 0x00); + let h = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x11); + let l = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x00); Self(h) ^ Self(l) } } From f30e24cc4573df94e4b366c67c2d1bf9aaa50c49 Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 19 Dec 2024 18:48:25 -0800 Subject: [PATCH 10/11] Improve docs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 368013d..1519b2a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ crc64fast-nvme SIMD-accelerated carryless-multiplication [CRC-64/NVME](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-nvme) checksum computation (similar to [crc32fast](https://crates.io/crates/crc32fast) and forked from [crc64fast](https://github.com/tikv/crc64fast) which calculates [CRC-64/XZ](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-xz) [a.k.a `CRC-64/GO-ECMA`]). -`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023) and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) (where it's called `CRC-64/Rocksoft`) and [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) as `CRC64-NVME`. (Note that the Check value in the spec uses incorrect endianness (Section 5.2.1.3.4, Figure 120, page 83). +`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023) and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) (where it's called `CRC-64/Rocksoft`) and is [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) as `CRC64-NVME`. (Note that the Check value in the spec uses incorrect endianness [Section 5.2.1.3.4, Figure 120, page 83]). SIMD-accelerated carryless-multiplication is based on the Intel [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) paper. From 7edf216bfb4c0784e0c1a9adc183b20fe6de96df Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 26 Dec 2024 15:38:47 -0800 Subject: [PATCH 11/11] Improve cfg() usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There’s only one feature… --- src/pclmulqdq/mod.rs | 2 +- src/pclmulqdq/x86_64/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs index ca4ed00..a19c888 100644 --- a/src/pclmulqdq/mod.rs +++ b/src/pclmulqdq/mod.rs @@ -73,7 +73,7 @@ impl BitXorAssign for Simd { } pub fn get_update() -> super::UpdateFn { - #[cfg(all(feature = "vpclmulqdq"))] + #[cfg(feature = "vpclmulqdq")] { use arch::vpclmulqdq::*; if Simd256::is_supported() { diff --git a/src/pclmulqdq/x86_64/mod.rs b/src/pclmulqdq/x86_64/mod.rs index 401a619..e09c802 100644 --- a/src/pclmulqdq/x86_64/mod.rs +++ b/src/pclmulqdq/x86_64/mod.rs @@ -6,7 +6,7 @@ use std::arch::x86_64::*; use std::ops::BitXor; -#[cfg(all(feature = "vpclmulqdq"))] +#[cfg(feature = "vpclmulqdq")] pub mod vpclmulqdq; #[repr(transparent)]