From 233e651e31165c76f5930e7d2b27554950b7486b Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Fri, 4 Nov 2022 15:44:26 -0400
Subject: [PATCH 01/11] feat: add 256bit vpclmulqdq support

Signed-off-by: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
---
 Cargo.toml                           |   1 +
 src/lib.rs                           |  15 ++-
 src/pclmulqdq/mod.rs                 |  54 +++++++---
 src/pclmulqdq/{x86.rs => x86/mod.rs} |  19 ++--
 src/pclmulqdq/x86/vpclmulqdq.rs      | 153 +++++++++++++++++++++++++++
 5 files changed, 220 insertions(+), 22 deletions(-)
 rename src/pclmulqdq/{x86.rs => x86/mod.rs} (73%)
 create mode 100644 src/pclmulqdq/x86/vpclmulqdq.rs

diff --git a/Cargo.toml b/Cargo.toml
index 5832066..403a2ea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ rand = "0.8"
 
 [features]
 pmull = []
+vpclmulqdq = []
 fake-simd = []
 
 [[bench]]
diff --git a/src/lib.rs b/src/lib.rs
index 847f2c1..17ac0e3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,11 +22,20 @@
     feature = "pmull",
     feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm)
 )]
+#![cfg_attr(
+    feature = "vpclmulqdq",
+    feature(
+        simd_ffi,
+        link_llvm_intrinsics,
+        avx512_target_feature,
+        target_feature_11
+    )
+)]
 
 mod pclmulqdq;
 mod table;
 
-type UpdateFn = fn(u64, &[u8]) -> u64;
+type UpdateFn = unsafe fn(u64, &[u8]) -> u64;
 
 /// Represents an in-progress CRC-64 computation.
 #[derive(Clone)]
@@ -57,7 +66,9 @@ impl Digest {
 
     /// Writes some data into the digest.
     pub fn write(&mut self, bytes: &[u8]) {
-        self.state = (self.computer)(self.state, bytes);
+        unsafe {
+            self.state = (self.computer)(self.state, bytes);
+        }
     }
 
     /// Computes the current CRC-64-ECMA value.
diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index b329808..c709aec 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -7,21 +7,23 @@
 //!
 //! [white paper]: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 
+use std::{
+    fmt::Debug,
+    ops::{BitXor, BitXorAssign},
+};
+
+use super::table;
+
+use self::arch::Simd;
+
 #[cfg(not(feature = "fake-simd"))]
-#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")]
+#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86/mod.rs")]
 #[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")]
 mod arch;
 
 #[cfg(feature = "fake-simd")]
 mod arch;
 
-use self::arch::Simd;
-use super::table;
-use std::{
-    fmt::Debug,
-    ops::{BitXor, BitXorAssign},
-};
-
 /// This trait must be implemented on `self::arch::Simd` to provide the
 /// platform-specific SIMD implementations.
 trait SimdExt: Copy + Debug + BitXor {
@@ -70,24 +72,47 @@ impl BitXorAssign for Simd {
 }
 
 pub fn get_update() -> super::UpdateFn {
+    #[cfg(all(feature = "vpclmulqdq"))]
+    {
+        use arch::vpclmulqdq::*;
+        if Simd256::is_supported() {
+            return update_256_batch;
+        }
+    }
+
     if Simd::is_supported() {
-        update
+        update_128_batch
     } else {
         table::update
     }
 }
 
-fn update(mut state: u64, bytes: &[u8]) -> u64 {
-    let (left, middle, right) = unsafe { bytes.align_to::<[Simd; 8]>() };
+// This function is unsafe because it uses platform dependent functions.
+unsafe fn update_128_batch(mut state: u64, bytes: &[u8]) -> u64 {
+    let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
     if let Some((first, rest)) = middle.split_first() {
         state = table::update(state, left);
-        state = unsafe { update_simd(state, first, rest) };
+        state = update_simd(state, first, rest);
         table::update(state, right)
     } else {
         table::update(state, bytes)
     }
 }
 
+#[cfg(feature = "vpclmulqdq")]
+#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")]
+unsafe fn update_256_batch(mut state: u64, bytes: &[u8]) -> u64 {
+    use arch::vpclmulqdq::*;
+    let (left, middle, right) = bytes.align_to::<[[Simd256; 4]; 2]>();
+    if let Some((first, rest)) = middle.split_first() {
+        state = update_128_batch(state, left);
+        state = update_vpclmulqdq(state, first, rest);
+        update_128_batch(state, right)
+    } else {
+        update_128_batch(state, bytes)
+    }
+}
+
 #[cfg_attr(
     any(target_arch = "x86", target_arch = "x86_64"),
     target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")
@@ -111,6 +136,11 @@ unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64
         }
     }
 
+    fold_tail(x)
+}
+
+#[inline(always)]
+unsafe fn fold_tail(x: [Simd; 8]) -> u64 {
     let coeffs = [
         Simd::new(table::K_895, table::K_959), // fold by distance of 112 bytes
         Simd::new(table::K_767, table::K_831), // fold by distance of 96 bytes
diff --git a/src/pclmulqdq/x86.rs b/src/pclmulqdq/x86/mod.rs
similarity index 73%
rename from src/pclmulqdq/x86.rs
rename to src/pclmulqdq/x86/mod.rs
index 2fce99a..2e18de8 100644
--- a/src/pclmulqdq/x86.rs
+++ b/src/pclmulqdq/x86/mod.rs
@@ -8,6 +8,9 @@ use std::arch::x86::*;
 use std::arch::x86_64::*;
 use std::ops::BitXor;
 
+#[cfg(all(feature = "vpclmulqdq"))]
+pub mod vpclmulqdq;
+
 #[repr(transparent)]
 #[derive(Copy, Clone, Debug)]
 pub struct Simd(__m128i);
@@ -28,8 +31,8 @@ impl super::SimdExt for Simd {
     #[inline]
     #[target_feature(enable = "sse2", enable = "pclmulqdq")]
     unsafe fn fold_16(self, coeff: Self) -> Self {
-        let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x11));
-        let l = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
+        let h = Self(_mm_clmulepi64_si128::<0x11>(self.0, coeff.0));
+        let l = Self(_mm_clmulepi64_si128::<0x00>(self.0, coeff.0));
         h ^ l
     }
 
@@ -37,8 +40,8 @@ impl super::SimdExt for Simd {
     #[target_feature(enable = "sse2", enable = "pclmulqdq")]
     unsafe fn fold_8(self, coeff: u64) -> Self {
         let coeff = Self::new(0, coeff);
-        let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
-        let l = Self(_mm_srli_si128(self.0, 8));
+        let h = Self(_mm_clmulepi64_si128::<0x00>(self.0, coeff.0));
+        let l = Self(_mm_srli_si128::<8>(self.0));
         h ^ l
     }
 
@@ -46,11 +49,11 @@ impl super::SimdExt for Simd {
     #[target_feature(enable = "sse2", enable = "sse4.1", enable = "pclmulqdq")]
     unsafe fn barrett(self, poly: u64, mu: u64) -> u64 {
         let polymu = Self::new(poly, mu);
-        let t1 = _mm_clmulepi64_si128(self.0, polymu.0, 0x00);
-        let h = Self(_mm_slli_si128(t1, 8));
-        let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10));
+        let t1 = _mm_clmulepi64_si128::<0x00>(self.0, polymu.0);
+        let h = Self(_mm_slli_si128::<8>(t1));
+        let l = Self(_mm_clmulepi64_si128::<0x10>(t1, polymu.0));
         let reduced = h ^ l ^ self;
-        _mm_extract_epi64(reduced.0, 1) as u64
+        _mm_extract_epi64::<1>(reduced.0) as u64
     }
 }
 
diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86/vpclmulqdq.rs
new file mode 100644
index 0000000..cdf9e6c
--- /dev/null
+++ b/src/pclmulqdq/x86/vpclmulqdq.rs
@@ -0,0 +1,153 @@
+use core::ops::BitXor;
+
+use super::{__cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256, Simd, super::fold_tail};
+
+// PCLMULQDQ can be used without avx512vl. However, this is only addressed by rust recently --- so we
+// need to manually specify the intrinsic, otherwise rustc will inline it poorly.
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.pclmulqdq.256"]
+    fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct Simd256(__m256i);
+
+impl Simd256 {
+    #[inline]
+    pub fn is_supported() -> bool {
+        let avx2 = is_x86_feature_detected!("avx2");
+        // Rust is very confused about VPCLMULQDQ
+        // Let us detect it use CPUID directly
+        let leaf_7 = unsafe { __cpuid_count(7, 0) };
+        let vpclmulqdq = (leaf_7.ecx & (1u32 << 10)) != 0;
+        avx2 && vpclmulqdq
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx2")]
+    pub unsafe fn new(x3: u64, x2: u64, x1: u64, x0: u64) -> Self {
+        Self(_mm256_set_epi64x(x3 as _, x2 as _, x1 as _, x0 as _))
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx2")]
+    pub unsafe fn to_simd_x8(self4: [Self; 4]) -> [Simd; 8] {
+        core::mem::transmute(self4)
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")]
+    pub unsafe fn fold_32(self, coeff: Self) -> Self {
+        let h = pclmulqdq_256(self.0, coeff.0, 0x11);
+        let l = pclmulqdq_256(self.0, coeff.0, 0x00);
+        Self(h) ^ Self(l)
+    }
+}
+
+impl BitXor for Simd256 {
+    type Output = Self;
+
+    #[inline(always)]
+    fn bitxor(self, other: Self) -> Self {
+        Self(unsafe { _mm256_xor_si256(self.0, other.0) })
+    }
+}
+
+#[inline]
+#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")]
+pub(crate) unsafe fn update_vpclmulqdq(
+    state: u64,
+    first: &[[Simd256; 4]; 2],
+    rest: &[[[Simd256; 4]; 2]],
+) -> u64 {
+    // receive the initial 128 bytes of data
+    let (mut x, y) = (first[0], first[1]);
+
+    // xor the initial CRC value
+    x[0] = x[0] ^ Simd256::new(0, 0, 0, state);
+
+    let coeff = Simd256::new(
+        crate::table::K_1023,
+        crate::table::K_1087,
+        crate::table::K_1023,
+        crate::table::K_1087,
+    );
+
+    x[0] = x[0].fold_32(coeff) ^ y[0];
+    x[1] = x[1].fold_32(coeff) ^ y[1];
+    x[2] = x[2].fold_32(coeff) ^ y[2];
+    x[3] = x[3].fold_32(coeff) ^ y[3];
+
+    // perform 256-byte folding.
+    for chunk in rest {
+        let chunk = *chunk;
+        x[0] = x[0].fold_32(coeff) ^ chunk[0][0];
+        x[0] = x[0].fold_32(coeff) ^ chunk[1][0];
+        x[1] = x[1].fold_32(coeff) ^ chunk[0][1];
+        x[1] = x[1].fold_32(coeff) ^ chunk[1][1];
+        x[2] = x[2].fold_32(coeff) ^ chunk[0][2];
+        x[2] = x[2].fold_32(coeff) ^ chunk[1][2];
+        x[3] = x[3].fold_32(coeff) ^ chunk[0][3];
+        x[3] = x[3].fold_32(coeff) ^ chunk[1][3];
+    }
+
+    let x = Simd256::to_simd_x8(x);
+    fold_tail(x)
+}
+
+impl PartialEq for Simd256 {
+    fn eq(&self, other: &Self) -> bool {
+        unsafe {
+            use core::mem::transmute;
+            let a: [u128; 2] = transmute(*self);
+            let b: [u128; 2] = transmute(*other);
+            a == b
+        }
+    }
+}
+
+impl Eq for Simd256 {}
+
+#[cfg(target_feature = "avx2")]
+#[test]
+fn test_size_and_alignment() {
+    assert_eq!(std::mem::size_of::<Simd256>(), 32);
+    assert_eq!(std::mem::align_of::<Simd256>(), 32);
+}
+
+#[cfg(target_feature = "avx2")]
+#[test]
+fn test_new() {
+    unsafe {
+        let x = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804);
+        let y = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804);
+        let z = Simd256::new(0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e, 0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e);
+        assert_eq!(x, y);
+        assert_ne!(x, z);
+    }
+}
+
+#[cfg(target_feature = "avx2")]
+#[test]
+fn test_xor() {
+    unsafe {
+        let x = Simd256::new(0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63, 0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63);
+        let y = Simd256::new(0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4, 0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4);
+        let mut z = x ^ y;
+        assert_eq!(z, Simd256::new(0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297, 0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297));
+        z = z ^ Simd256::new(0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51, 0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51);
+        assert_eq!(z, Simd256::new(0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6, 0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6));
+    }
+}
+
+#[cfg(all(target_feature = "avx2", target_feature = "avx512vpclmulqdq"))]
+#[test]
+fn test_fold_32() {
+    unsafe {
+        let x = Simd256::new(0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21, 0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21);
+        let f = x.fold_32(Simd256::new(0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5, 0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5));
+        assert_eq!(f, Simd256::new(0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd, 0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd));
+    }
+}
+

From 806ea64033a52466dd7cae351c1d93da55216c98 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Fri, 4 Nov 2022 16:12:04 -0400
Subject: [PATCH 02/11] switch to lazy static to avoid extra overhead

Signed-off-by: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
---
 Cargo.toml                      |   3 +-
 src/pclmulqdq/x86/vpclmulqdq.rs | 103 ++++++++++++++++++++++++++------
 2 files changed, 88 insertions(+), 18 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 403a2ea..3ce4d92 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,7 @@ readme = "README.md"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+lazy_static = { version = "1.4.0", optional = true }
 
 [dev-dependencies]
 crc = "1"
@@ -22,7 +23,7 @@ rand = "0.8"
 
 [features]
 pmull = []
-vpclmulqdq = []
+vpclmulqdq = ["lazy_static"]
 fake-simd = []
 
 [[bench]]
diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86/vpclmulqdq.rs
index cdf9e6c..7295ecc 100644
--- a/src/pclmulqdq/x86/vpclmulqdq.rs
+++ b/src/pclmulqdq/x86/vpclmulqdq.rs
@@ -1,6 +1,6 @@
+use super::{super::fold_tail, Simd, __cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256};
 use core::ops::BitXor;
-
-use super::{__cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256, Simd, super::fold_tail};
+use lazy_static::lazy_static;
 
 // PCLMULQDQ can be used without avx512vl. However, this is only addressed by rust recently --- so we
 // need to manually specify the intrinsic, otherwise rustc will inline it poorly.
@@ -13,15 +13,21 @@ extern "C" {
 #[derive(Clone, Copy, Debug)]
 pub struct Simd256(__m256i);
 
-impl Simd256 {
-    #[inline]
-    pub fn is_supported() -> bool {
+lazy_static! {
+    static ref VPCLMULQDQ_SUPPORTED : bool = {
         let avx2 = is_x86_feature_detected!("avx2");
         // Rust is very confused about VPCLMULQDQ
         // Let us detect it use CPUID directly
         let leaf_7 = unsafe { __cpuid_count(7, 0) };
         let vpclmulqdq = (leaf_7.ecx & (1u32 << 10)) != 0;
         avx2 && vpclmulqdq
+    };
+}
+
+impl Simd256 {
+    #[inline]
+    pub fn is_supported() -> bool {
+        *VPCLMULQDQ_SUPPORTED
     }
 
     #[inline]
@@ -120,9 +126,24 @@ fn test_size_and_alignment() {
 #[test]
 fn test_new() {
     unsafe {
-        let x = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804);
-        let y = Simd256::new(0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804, 0xd7c8_11cf_e5c5_c792, 0x86e6_5c36_e68b_4804);
-        let z = Simd256::new(0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e, 0xfa3e_0099_cd5e_d60d, 0xad71_9ee6_57d1_498e);
+        let x = Simd256::new(
+            0xd7c8_11cf_e5c5_c792,
+            0x86e6_5c36_e68b_4804,
+            0xd7c8_11cf_e5c5_c792,
+            0x86e6_5c36_e68b_4804,
+        );
+        let y = Simd256::new(
+            0xd7c8_11cf_e5c5_c792,
+            0x86e6_5c36_e68b_4804,
+            0xd7c8_11cf_e5c5_c792,
+            0x86e6_5c36_e68b_4804,
+        );
+        let z = Simd256::new(
+            0xfa3e_0099_cd5e_d60d,
+            0xad71_9ee6_57d1_498e,
+            0xfa3e_0099_cd5e_d60d,
+            0xad71_9ee6_57d1_498e,
+        );
         assert_eq!(x, y);
         assert_ne!(x, z);
     }
@@ -132,12 +153,43 @@ fn test_new() {
 #[test]
 fn test_xor() {
     unsafe {
-        let x = Simd256::new(0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63, 0xe450_87f9_b031_0d47, 0x3d72_e92a_96c7_4c63);
-        let y = Simd256::new(0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4, 0x7ed8_ae0a_dfbd_89c0, 0x1c9b_dfaa_953e_0ef4);
+        let x = Simd256::new(
+            0xe450_87f9_b031_0d47,
+            0x3d72_e92a_96c7_4c63,
+            0xe450_87f9_b031_0d47,
+            0x3d72_e92a_96c7_4c63,
+        );
+        let y = Simd256::new(
+            0x7ed8_ae0a_dfbd_89c0,
+            0x1c9b_dfaa_953e_0ef4,
+            0x7ed8_ae0a_dfbd_89c0,
+            0x1c9b_dfaa_953e_0ef4,
+        );
         let mut z = x ^ y;
-        assert_eq!(z, Simd256::new(0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297, 0x9a88_29f3_6f8c_8487, 0x21e9_3680_03f9_4297));
-        z = z ^ Simd256::new(0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51, 0x57a2_0f44_c005_b2ea, 0x7056_bde9_9303_aa51);
-        assert_eq!(z, Simd256::new(0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6, 0xcd2a_26b7_af89_366d, 0x51bf_8b69_90fa_e8c6));
+        assert_eq!(
+            z,
+            Simd256::new(
+                0x9a88_29f3_6f8c_8487,
+                0x21e9_3680_03f9_4297,
+                0x9a88_29f3_6f8c_8487,
+                0x21e9_3680_03f9_4297
+            )
+        );
+        z = z ^ Simd256::new(
+            0x57a2_0f44_c005_b2ea,
+            0x7056_bde9_9303_aa51,
+            0x57a2_0f44_c005_b2ea,
+            0x7056_bde9_9303_aa51,
+        );
+        assert_eq!(
+            z,
+            Simd256::new(
+                0xcd2a_26b7_af89_366d,
+                0x51bf_8b69_90fa_e8c6,
+                0xcd2a_26b7_af89_366d,
+                0x51bf_8b69_90fa_e8c6
+            )
+        );
     }
 }
 
@@ -145,9 +197,26 @@ fn test_xor() {
 #[test]
 fn test_fold_32() {
     unsafe {
-        let x = Simd256::new(0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21, 0xb5f1_2590_5645_0b6c, 0x333a_2c49_c361_9e21);
-        let f = x.fold_32(Simd256::new(0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5, 0xbecc_9dd9_038f_c366, 0x5ba9_365b_e2e9_5bf5));
-        assert_eq!(f, Simd256::new(0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd, 0x4f55_42df_ef35_1810, 0x0c03_5bd6_70fc_5abd));
+        let x = Simd256::new(
+            0xb5f1_2590_5645_0b6c,
+            0x333a_2c49_c361_9e21,
+            0xb5f1_2590_5645_0b6c,
+            0x333a_2c49_c361_9e21,
+        );
+        let f = x.fold_32(Simd256::new(
+            0xbecc_9dd9_038f_c366,
+            0x5ba9_365b_e2e9_5bf5,
+            0xbecc_9dd9_038f_c366,
+            0x5ba9_365b_e2e9_5bf5,
+        ));
+        assert_eq!(
+            f,
+            Simd256::new(
+                0x4f55_42df_ef35_1810,
+                0x0c03_5bd6_70fc_5abd,
+                0x4f55_42df_ef35_1810,
+                0x0c03_5bd6_70fc_5abd
+            )
+        );
     }
 }
-

From 83ecd0b1ec79caf4951d22457820a67e7395e48e Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Wed, 11 Dec 2024 23:55:24 -0800
Subject: [PATCH 03/11] Adapt to use latest Rust SIMD changes

---
 src/lib.rs                      | 7 +------
 src/pclmulqdq/mod.rs            | 2 +-
 src/pclmulqdq/x86/vpclmulqdq.rs | 6 +++---
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 5925d18..22b14b7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -20,12 +20,7 @@
 
 #![cfg_attr(
     feature = "vpclmulqdq",
-    feature(
-        simd_ffi,
-        link_llvm_intrinsics,
-        avx512_target_feature,
-        target_feature_11
-    )
+    feature(simd_ffi, link_llvm_intrinsics, avx512_target_feature,)
 )]
 
 mod pclmulqdq;
diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index f483760..67888e4 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -100,7 +100,7 @@ unsafe fn update_128_batch(mut state: u64, bytes: &[u8]) -> u64 {
 }
 
 #[cfg(feature = "vpclmulqdq")]
-#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")]
+#[target_feature(enable = "avx2", enable = "vpclmulqdq")]
 unsafe fn update_256_batch(mut state: u64, bytes: &[u8]) -> u64 {
     use arch::vpclmulqdq::*;
     let (left, middle, right) = bytes.align_to::<[[Simd256; 4]; 2]>();
diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86/vpclmulqdq.rs
index 7295ecc..e085e24 100644
--- a/src/pclmulqdq/x86/vpclmulqdq.rs
+++ b/src/pclmulqdq/x86/vpclmulqdq.rs
@@ -43,7 +43,7 @@ impl Simd256 {
     }
 
     #[inline]
-    #[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")]
+    #[target_feature(enable = "avx2", enable = "vpclmulqdq")]
     pub unsafe fn fold_32(self, coeff: Self) -> Self {
         let h = pclmulqdq_256(self.0, coeff.0, 0x11);
         let l = pclmulqdq_256(self.0, coeff.0, 0x00);
@@ -61,7 +61,7 @@ impl BitXor for Simd256 {
 }
 
 #[inline]
-#[target_feature(enable = "avx2", enable = "avx512vpclmulqdq")]
+#[target_feature(enable = "avx2", enable = "vpclmulqdq")]
 pub(crate) unsafe fn update_vpclmulqdq(
     state: u64,
     first: &[[Simd256; 4]; 2],
@@ -193,7 +193,7 @@ fn test_xor() {
     }
 }
 
-#[cfg(all(target_feature = "avx2", target_feature = "avx512vpclmulqdq"))]
+#[cfg(all(target_feature = "avx2", target_feature = "vpclmulqdq"))]
 #[test]
 fn test_fold_32() {
     unsafe {

From 2b401e124d099572f3af7925edc308a556841160 Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 12 Dec 2024 00:07:15 -0800
Subject: [PATCH 04/11] Add links to issues for tracking unstable features

---
 src/lib.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index 22b14b7..00e9c7e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,12 @@
 //! let checksum = c.sum64();
 //! assert_eq!(checksum, 0xd9160d1fa8e418e3);
 //! ```
+//!
+//! Tracking links for unstable features used here (which require nightly builds):
+//!
+//! - simd_ffi: https://github.com/rust-lang/rust/issues/27731
+//! - link_llvm_intrinsics: https://github.com/rust-lang/rust/issues/29602
+//! - avx512_target_feature: https://github.com/rust-lang/rust/issues/111137
 
 #![cfg_attr(
     feature = "vpclmulqdq",

From 5f6346d09c1ffc78051c0e0fe2a43723481811a6 Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 12 Dec 2024 12:43:22 -0800
Subject: [PATCH 05/11] Update README with VPCLMULQDQ details

---
 README.md | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 912f673..d33a3c7 100644
--- a/README.md
+++ b/README.md
@@ -50,15 +50,18 @@ be chosen based on CPU feature at runtime.
 
 [crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html
 
-## TODO
+## Experimental VPCLMULQDQ support
 
-This crate is mainly intended for use in TiKV only.
-Features beyond AArch64 are unlikely to be implemented.
+Using Rust's support for [AVX512 intrinsics](https://github.com/rust-lang/rust/issues/111137), specifically [VPCLMULQDQ](https://doc.rust-lang.org/src/core/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs.html), we can massively improve throughput for x86_64 processors which support them (Intel Ice Lake+ and AMD Zen4+).
 
-* [x] AArch64 support based on PMULL
-* [ ] `no_std` support
-* [x] Fuzz test
-* [ ] Custom polynomial
+Specifically, on an `m7i.8xlarge` EC2 instance (4th gen Xeon, aka Sapphire Rapids), throughput approximately _doubles_ from ~26GiB/s to ~52GiB/s.
+
+Since these are currently marked as unstable features in Rust, you'll need to build with `nightly` and enable the `vpclmulqdq` feature:
+
+``` 
+rustup toolchain install nightly
+cargo +nightly build --features="vpclmulqdq" -r
+```
 
 ## License
 

From 732a20d18dd961d668c1846d5af3821670cb1c81 Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 12 Dec 2024 14:35:43 -0800
Subject: [PATCH 06/11] Incorporate x86 changes into vpclmulqdq

---
 src/pclmulqdq/mod.rs                        |  2 +-
 src/pclmulqdq/{x86_64.rs => x86.rs}         | 29 ++++++++++++++++-----
 src/pclmulqdq/{x86 => x86_64}/mod.rs        | 13 ++-------
 src/pclmulqdq/{x86 => x86_64}/vpclmulqdq.rs |  0
 4 files changed, 25 insertions(+), 19 deletions(-)
 rename src/pclmulqdq/{x86_64.rs => x86.rs} (61%)
 rename src/pclmulqdq/{x86 => x86_64}/mod.rs (82%)
 rename src/pclmulqdq/{x86 => x86_64}/vpclmulqdq.rs (100%)

diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index 9f85219..ca4ed00 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -17,7 +17,7 @@ use super::table;
 use self::arch::Simd;
 
 #[cfg(not(feature = "fake-simd"))]
-#[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")]
+#[cfg_attr(target_arch = "x86_64", path = "x86_64/mod.rs")]
 #[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
 #[cfg_attr(target_arch = "x86", path = "x86.rs")]
 mod arch;
diff --git a/src/pclmulqdq/x86_64.rs b/src/pclmulqdq/x86.rs
similarity index 61%
rename from src/pclmulqdq/x86_64.rs
rename to src/pclmulqdq/x86.rs
index 63abd80..8dc41b6 100644
--- a/src/pclmulqdq/x86_64.rs
+++ b/src/pclmulqdq/x86.rs
@@ -1,9 +1,9 @@
 // Copyright 2020 TiKV Project Authors. Licensed under MIT or Apache-2.0.
 
-//! x86_64 implementation of the PCLMULQDQ-based CRC calculation.
+//! x86 (32-bit) implementation of the PCLMULQDQ-based CRC calculation.
 
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
 use std::ops::BitXor;
 
 #[repr(transparent)]
@@ -14,13 +14,20 @@ impl super::SimdExt for Simd {
     fn is_supported() -> bool {
         is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128
             && is_x86_feature_detected!("sse2") // (all other _mm_*)
-            && is_x86_feature_detected!("sse4.1") // _mm_extract_epi64
+            && is_x86_feature_detected!("sse4.1")
     }
 
     #[inline]
     #[target_feature(enable = "sse2")]
     unsafe fn new(high: u64, low: u64) -> Self {
-        Self(_mm_set_epi64x(high as i64, low as i64))
+        // On 32-bit systems, we need to split u64 into low and high 32-bit parts
+        let high_low = (high & 0xFFFFFFFF) as i32;
+        let high_high = ((high >> 32) & 0xFFFFFFFF) as i32;
+        let low_low = (low & 0xFFFFFFFF) as i32;
+        let low_high = ((low >> 32) & 0xFFFFFFFF) as i32;
+
+        // Create the 128-bit vector using 32-bit parts
+        Self(_mm_set_epi32(high_high, high_low, low_high, low_low))
     }
 
     #[inline]
@@ -48,7 +55,15 @@ impl super::SimdExt for Simd {
         let h = Self(_mm_slli_si128(t1, 8));
         let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10));
         let reduced = h ^ l ^ self;
-        _mm_extract_epi64(reduced.0, 1) as u64
+
+        // Store the result in memory and read it back as u64
+        // This approach is more reliable for handling 64-bit values on 32-bit systems
+        let mut result: [u32; 4] = [0; 4];
+        _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, reduced.0);
+
+        // Combine the two 32-bit values into a 64-bit result
+        // We want the high 64 bits (indices 2 and 3)
+        ((result[3] as u64) << 32) | (result[2] as u64)
     }
 }
 
@@ -58,4 +73,4 @@ impl BitXor for Simd {
     fn bitxor(self, other: Self) -> Self {
         Self(unsafe { _mm_xor_si128(self.0, other.0) })
     }
-}
+}
\ No newline at end of file
diff --git a/src/pclmulqdq/x86/mod.rs b/src/pclmulqdq/x86_64/mod.rs
similarity index 82%
rename from src/pclmulqdq/x86/mod.rs
rename to src/pclmulqdq/x86_64/mod.rs
index 6181a0d..401a619 100644
--- a/src/pclmulqdq/x86/mod.rs
+++ b/src/pclmulqdq/x86_64/mod.rs
@@ -2,8 +2,8 @@
 
 //! x86 (32-bit) implementation of the PCLMULQDQ-based CRC calculation.
 
-#[cfg(target_arch = "x86")]
-use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
 use std::ops::BitXor;
 
 #[cfg(all(feature = "vpclmulqdq"))]
@@ -59,15 +59,6 @@ impl super::SimdExt for Simd {
         let l = Self(_mm_clmulepi64_si128::<0x10>(t1, polymu.0));
         let reduced = h ^ l ^ self;
         _mm_extract_epi64::<1>(reduced.0) as u64
-
-        // Store the result in memory and read it back as u64
-        // This approach is more reliable for handling 64-bit values on 32-bit systems
-        let mut result: [u32; 4] = [0; 4];
-        _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, reduced.0);
-
-        // Combine the two 32-bit values into a 64-bit result
-        // We want the high 64 bits (indices 2 and 3)
-        ((result[3] as u64) << 32) | (result[2] as u64)
     }
 }
 
diff --git a/src/pclmulqdq/x86/vpclmulqdq.rs b/src/pclmulqdq/x86_64/vpclmulqdq.rs
similarity index 100%
rename from src/pclmulqdq/x86/vpclmulqdq.rs
rename to src/pclmulqdq/x86_64/vpclmulqdq.rs

From 5bcf1187d35caeb54146ba150b33c31b85937e77 Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 12 Dec 2024 15:23:45 -0800
Subject: [PATCH 07/11] Fix formatting

---
 src/pclmulqdq/x86.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pclmulqdq/x86.rs b/src/pclmulqdq/x86.rs
index 8dc41b6..55532c1 100644
--- a/src/pclmulqdq/x86.rs
+++ b/src/pclmulqdq/x86.rs
@@ -73,4 +73,4 @@ impl BitXor for Simd {
     fn bitxor(self, other: Self) -> Self {
         Self(unsafe { _mm_xor_si128(self.0, other.0) })
     }
-}
\ No newline at end of file
+}

From 602e0e91be94d327249986cc75a78c9c96a8b01e Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 12 Dec 2024 15:24:00 -0800
Subject: [PATCH 08/11] Improve docs with more VPCLMULQDQ details

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4516b1f..368013d 100644
--- a/README.md
+++ b/README.md
@@ -48,11 +48,12 @@ be chosen based on CPU feature at runtime.
 |:----------------------------|--------------------:|---------------------:|
 | [crc 3.0.1]                 |           0.5 GiB/s |            0.3 GiB/s |
 | crc64fast-nvme (table)      |           2.3 GiB/s |            1.8 GiB/s |
-| crc64fast-nvme (simd)       |          28.2 GiB/s |           20.0 GiB/s |
+| crc64fast-nvme (SIMD)       |          28.2 GiB/s |           20.0 GiB/s |
+| crc64fast-nvme (VPCLMULQDQ) |            52 GiB/s |                 n/a  |
 
 [crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html
 
-## Experimental VPCLMULQDQ support
+## Experimental "Vector Carry-Less Multiplication of Quadwords" (VPCLMULQDQ) support
 
 Using Rust's support for [AVX512 intrinsics](https://github.com/rust-lang/rust/issues/111137), specifically [VPCLMULQDQ](https://doc.rust-lang.org/src/core/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs.html), we can massively improve throughput for x86_64 processors which support them (Intel Ice Lake+ and AMD Zen4+).
 
@@ -79,6 +80,7 @@ cargo +nightly build --features="vpclmulqdq" -r
 * [StackOverflow PCLMULQDQ CRC32 question](https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq) - Insightful question & answer to CRC32 implementation details.
 * [AWS S3 announcement about CRC64-NVME support](https://aws.amazon.com/blogs/aws/introducing-default-data-integrity-protections-for-new-objects-in-amazon-s3/)
 * [AWS S3 docs on checking object integrity using CRC64-NVME](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html)
+* [Vector Carry-Less Multiplication of Quadwords (VPCLMULQDQ) details](https://en.wikichip.org/wiki/x86/vpclmulqdq)
 
 ## License
 

From a945288d1aa9fec71f25f71b418d6ad92791daec Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 19 Dec 2024 18:48:14 -0800
Subject: [PATCH 09/11] Use internal Rust functions

---
 src/lib.rs                         |  2 +-
 src/pclmulqdq/x86_64/vpclmulqdq.rs | 14 ++++----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 00e9c7e..b306ce3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -26,7 +26,7 @@
 
 #![cfg_attr(
     feature = "vpclmulqdq",
-    feature(simd_ffi, link_llvm_intrinsics, avx512_target_feature,)
+    feature(avx512_target_feature, stdarch_x86_avx512)
 )]
 
 mod pclmulqdq;
diff --git a/src/pclmulqdq/x86_64/vpclmulqdq.rs b/src/pclmulqdq/x86_64/vpclmulqdq.rs
index e085e24..d25900e 100644
--- a/src/pclmulqdq/x86_64/vpclmulqdq.rs
+++ b/src/pclmulqdq/x86_64/vpclmulqdq.rs
@@ -1,18 +1,12 @@
 use super::{super::fold_tail, Simd, __cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256};
 use core::ops::BitXor;
 use lazy_static::lazy_static;
-
-// PCLMULQDQ can be used without avx512vl. However, this is only addressed by rust recently --- so we
-// need to manually specify the intrinsic, otherwise rustc will inline it poorly.
-#[allow(improper_ctypes)]
-extern "C" {
-    #[link_name = "llvm.x86.pclmulqdq.256"]
-    fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
-}
+use std::arch::x86_64::_mm256_clmulepi64_epi128;
 
 #[derive(Clone, Copy, Debug)]
 pub struct Simd256(__m256i);
 
+// this lazy_static bit takes throughput from ~39GiB/s to ~52GiB/s
 lazy_static! {
     static ref VPCLMULQDQ_SUPPORTED : bool = {
         let avx2 = is_x86_feature_detected!("avx2");
@@ -45,8 +39,8 @@ impl Simd256 {
     #[inline]
     #[target_feature(enable = "avx2", enable = "vpclmulqdq")]
     pub unsafe fn fold_32(self, coeff: Self) -> Self {
-        let h = pclmulqdq_256(self.0, coeff.0, 0x11);
-        let l = pclmulqdq_256(self.0, coeff.0, 0x00);
+        let h = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x11);
+        let l = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x00);
         Self(h) ^ Self(l)
     }
 }

From f30e24cc4573df94e4b366c67c2d1bf9aaa50c49 Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 19 Dec 2024 18:48:25 -0800
Subject: [PATCH 10/11] Improve docs

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 368013d..1519b2a 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ crc64fast-nvme
 SIMD-accelerated carryless-multiplication [CRC-64/NVME](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-nvme) checksum computation
 (similar to [crc32fast](https://crates.io/crates/crc32fast) and forked from [crc64fast](https://github.com/tikv/crc64fast) which calculates [CRC-64/XZ](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-xz) [a.k.a `CRC-64/GO-ECMA`]).
 
-`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023) and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) (where it's called `CRC-64/Rocksoft`) and [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) as `CRC64-NVME`. (Note that the Check value in the spec uses incorrect endianness (Section 5.2.1.3.4, Figure 120, page 83).
+`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023) and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) (where it's called `CRC-64/Rocksoft`) and is [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) as `CRC64-NVME`. (Note that the Check value in the spec uses incorrect endianness [Section 5.2.1.3.4, Figure 120, page 83]).
 
 SIMD-accelerated carryless-multiplication is based on the Intel [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) paper.
 

From 7edf216bfb4c0784e0c1a9adc183b20fe6de96df Mon Sep 17 00:00:00 2001
From: Don MacAskill <don@smugmug.com>
Date: Thu, 26 Dec 2024 15:38:47 -0800
Subject: [PATCH 11/11] Improve cfg() usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There’s only one feature…
---
 src/pclmulqdq/mod.rs        | 2 +-
 src/pclmulqdq/x86_64/mod.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index ca4ed00..a19c888 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -73,7 +73,7 @@ impl BitXorAssign for Simd {
 }
 
 pub fn get_update() -> super::UpdateFn {
-    #[cfg(all(feature = "vpclmulqdq"))]
+    #[cfg(feature = "vpclmulqdq")]
     {
         use arch::vpclmulqdq::*;
         if Simd256::is_supported() {
diff --git a/src/pclmulqdq/x86_64/mod.rs b/src/pclmulqdq/x86_64/mod.rs
index 401a619..e09c802 100644
--- a/src/pclmulqdq/x86_64/mod.rs
+++ b/src/pclmulqdq/x86_64/mod.rs
@@ -6,7 +6,7 @@
 use std::arch::x86_64::*;
 use std::ops::BitXor;
 
-#[cfg(all(feature = "vpclmulqdq"))]
+#[cfg(feature = "vpclmulqdq")]
 pub mod vpclmulqdq;
 
 #[repr(transparent)]