From 57ffd2e0a9ae4e29eb8d464265e277bcaa960a54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 23 Jan 2022 19:10:28 +0100 Subject: [PATCH 1/3] Implement avx512 compressstore intrinsics --- crates/core_arch/src/x86/avx512f.rs | 280 ++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 97c6f6c4d6..325304bf0c 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -16156,6 +16156,126 @@ pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d { transmute(vcompresspd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k)) } +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask16, a: __m512i) { + vcompressd_mem(base_addr, a.as_i32x16(), k) +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask8, a: __m256i) { + vcompressd256_mem(base_addr, a.as_i32x8(), k) +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask8, a: __m128i) { + vcompressd128_mem(base_addr, a.as_i32x4(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, a: __m512i) { + vcompressq_mem(base_addr, a.as_i64x8(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, a: __m256i) { + vcompressq256_mem(base_addr, a.as_i64x4(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, a: __m128i) { + vcompressq128_mem(base_addr, a.as_i64x2(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask16, a: __m512) { + vcompressps_mem(base_addr, a.as_f32x16(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask8, a: __m256) { + vcompressps256_mem(base_addr, a.as_f32x8(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask8, a: __m128) { + vcompressps128_mem(base_addr, a.as_f32x4(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: __m512d) { + vcompresspd_mem(base_addr, a.as_f64x8(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: __m256d) { + vcompresspd256_mem(base_addr, a.as_f64x4(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: __m128d) { + vcompresspd128_mem(base_addr, a.as_f64x2(), k) +} + /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316) @@ -38007,6 +38127,34 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.compress.pd.128"] fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"] + fn vcompressd_mem(mem: *mut i8, data: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"] + fn vcompressd256_mem(mem: *mut i8, data: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"] + fn vcompressd128_mem(mem: *mut i8, data: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"] + fn vcompressq_mem(mem: *mut i8, data: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.q.256"] + fn vcompressq256_mem(mem: *mut i8, data: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"] + fn vcompressq128_mem(mem: *mut i8, data: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"] + fn vcompressps_mem(mem: *mut i8, data: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"] + fn vcompressps256_mem(mem: *mut i8, data: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"] + fn vcompressps128_mem(mem: *mut i8, data: f32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"] + fn vcompresspd_mem(mem: *mut i8, data: f64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"] + fn vcompresspd256_mem(mem: *mut i8, data: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"] + fn vcompresspd128_mem(mem: *mut i8, data: f64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.expand.d.512"] fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16; #[link_name = "llvm.x86.avx512.mask.expand.d.256"] @@ -51357,6 +51505,138 @@ mod tests { assert_eq_m128(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let mut r = [0_i32; 16]; + _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_i32; 16]); + _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1111000011001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi32() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let mut r = [0_i32; 8]; + _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_i32; 8]); + _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi32() { + let a 
= _mm_setr_epi32(1, 2, 3, 4); + let mut r = [0_i32; 4]; + _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_i32; 4]); + _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1011, a); + assert_eq!(&r, &[1, 2, 4, 0]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_epi64() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let mut r = [0_i64; 8]; + _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_i64; 8]); + _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi64() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let mut r = [0_i64; 4]; + _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_i64; 4]); + _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b1011, a); + assert_eq!(&r, &[1, 2, 4, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi64() { + let a = _mm_setr_epi64x(1, 2); + let mut r = [0_i64; 2]; + _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_i64; 2]); + _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b10, a); + assert_eq!(&r, &[2, 0]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_ps() { + let a = _mm512_setr_ps( + 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32, + 13_f32, 14_f32, 15_f32, 16_f32, + ); + let mut r = [0_f32; 16]; + _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_f32; 16]); + _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1111000011001010, a); + assert_eq!( + &r, + &[ + 2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32, + 0_f32, 0_f32, 0_f32, 0_f32, 0_f32 + ] + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_ps() { + let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32); + let mut r = [0_f32; 8]; + _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0_f32; 8]); + _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b11001010, a); + assert_eq!( + &r, + &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32] + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_ps() { + let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32); + let mut r = [0.; 4]; + _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0.; 4]); + _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1011, a); + assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let mut r = [0.; 8]; + _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0.; 8]); + _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b11001010, a); + assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let mut r = [0.; 4]; + _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0.; 4]); + _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b1011, a); + 
assert_eq!(&r, &[1., 2., 4., 0.]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_pd() { + let a = _mm_setr_pd(1., 2.); + let mut r = [0.; 2]; + _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a); + assert_eq!(&r, &[0.; 2]); + _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b10, a); + assert_eq!(&r, &[2., 0.]); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_expand_epi32() { let src = _mm512_set1_epi32(200); From 7fcf1f940d376d66c3d92a3997528ae80adb33f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 23 Jan 2022 19:28:11 +0100 Subject: [PATCH 2/3] Mark avx512f compressstore as implemented --- crates/core_arch/avx512f.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 9d95f0c492..997a04b38f 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1629,18 +1629,18 @@ * [x] [`_mm_maskz_compress_pd`] * [x] [`_mm256_mask_compress_pd`] * [x] [`_mm256_maskz_compress_pd`] - * [ ] [`_mm512_mask_compressstoreu_epi32`] //need i1 - * [_] [`_mm_mask_compressstoreu_epi32`] //need i1 - * [_] [`_mm256_mask_compressstoreu_epi32`] //need i1 - * [ ] [`_mm512_mask_compressstoreu_epi64`] //need i1 - * [_] [`_mm_mask_compressstoreu_epi64`] //need i1 - * [_] [`_mm256_mask_compressstoreu_epi64`] //need i1 - * [ ] [`_mm512_mask_compressstoreu_ps`] //need i1 - * [_] [`_mm_mask_compressstoreu_ps`] //need i1 - * [_] [`_mm256_mask_compressstoreu_ps`] //need i1 - * [ ] [`_mm512_mask_compressstoreu_pd`] //need i1 - * [_] [`_mm_mask_compressstoreu_pd`] //need i1 - * [_] [`_mm256_mask_compressstoreu_pd`] //need i1 + * [x] [`_mm512_mask_compressstoreu_epi32`] //need i1 + * [x] [`_mm_mask_compressstoreu_epi32`] //need i1 + * [x] [`_mm256_mask_compressstoreu_epi32`] //need i1 + * [x] [`_mm512_mask_compressstoreu_epi64`] //need i1 + * [x] [`_mm_mask_compressstoreu_epi64`] //need i1 + * [x] [`_mm256_mask_compressstoreu_epi64`] //need i1 + * [x] [`_mm512_mask_compressstoreu_ps`] //need i1 + * [x] [`_mm_mask_compressstoreu_ps`] //need i1 + * [x] [`_mm256_mask_compressstoreu_ps`] //need i1 + * [x] [`_mm512_mask_compressstoreu_pd`] //need i1 + * [x] [`_mm_mask_compressstoreu_pd`] //need i1 + * [x] [`_mm256_mask_compressstoreu_pd`] //need i1 * [x] [`_mm512_mask_expand_epi32`] * [x] [`_mm512_maskz_expand_epi32`] * [x] [`_mm_mask_expand_epi32`] From de5ef4035375b63fc24aa6a905e8ae466b60df8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Mon, 24 Jan 2022 22:36:04 +0100 Subject: [PATCH 3/3] Change naming convention for llvm intrinsics and use u8 pointers --- crates/core_arch/src/x86/avx512f.rs | 72 ++++++++++++++--------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 325304bf0c..df0a28d5c8 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -16162,8 +16162,8 @@ pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d { #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask16, a: __m512i) { - vcompressd_mem(base_addr, a.as_i32x16(), k) +pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) { + vcompressstored(base_addr as *mut _, a.as_i32x16(), k) } /// Contiguously store the 
active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16172,8 +16172,8 @@ pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask16, #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask8, a: __m256i) { - vcompressd256_mem(base_addr, a.as_i32x8(), k) +pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) { + vcompressstored256(base_addr as *mut _, a.as_i32x8(), k) } /// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16182,8 +16182,8 @@ pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask8, #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask8, a: __m128i) { - vcompressd128_mem(base_addr, a.as_i32x4(), k) +pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) { + vcompressstored128(base_addr as *mut _, a.as_i32x4(), k) } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16192,8 +16192,8 @@ pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut i8, k: __mmask8, a: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, a: __m512i) { - vcompressq_mem(base_addr, a.as_i64x8(), k) +pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) { + vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k) } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16202,8 +16202,8 @@ pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, a: __m256i) { - vcompressq256_mem(base_addr, a.as_i64x4(), k) +pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) { + vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k) } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16212,8 +16212,8 @@ pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, a: __m128i) { - vcompressq128_mem(base_addr, a.as_i64x2(), k) +pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) { + vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k) } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
@@ -16222,8 +16222,8 @@ pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut i8, k: __mmask8, a: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask16, a: __m512) { - vcompressps_mem(base_addr, a.as_f32x16(), k) +pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) { + vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k) } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16232,8 +16232,8 @@ pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask16, a: #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask8, a: __m256) { - vcompressps256_mem(base_addr, a.as_f32x8(), k) +pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) { + vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k) } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16242,8 +16242,8 @@ pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask8, a: #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask8, a: __m128) { - vcompressps128_mem(base_addr, a.as_f32x4(), k) +pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) { + vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k) } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16252,8 +16252,8 @@ pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut i8, k: __mmask8, a: __m #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: __m512d) { - vcompresspd_mem(base_addr, a.as_f64x8(), k) +pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) { + vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k) } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -16262,8 +16262,8 @@ pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: __m256d) { - vcompresspd256_mem(base_addr, a.as_f64x4(), k) +pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) { + vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k) } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
@@ -16272,8 +16272,8 @@ pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut i8, k: __mmask8, a: __m128d) { - vcompresspd128_mem(base_addr, a.as_f64x2(), k) +pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) { + vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k) } /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -38128,32 +38128,32 @@ extern "C" { fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"] - fn vcompressd_mem(mem: *mut i8, data: i32x16, mask: u16); + fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16); #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"] - fn vcompressd256_mem(mem: *mut i8, data: i32x8, mask: u8); + fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"] - fn vcompressd128_mem(mem: *mut i8, data: i32x4, mask: u8); + fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"] - fn vcompressq_mem(mem: *mut i8, data: i64x8, mask: u8); + fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.q.256"] - fn vcompressq256_mem(mem: *mut i8, data: i64x4, mask: u8); + fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"] - fn vcompressq128_mem(mem: *mut i8, data: i64x2, mask: u8); + fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"] - fn vcompressps_mem(mem: *mut i8, data: f32x16, mask: u16); + fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16); #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"] - fn vcompressps256_mem(mem: *mut i8, data: f32x8, mask: u8); + fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"] - fn vcompressps128_mem(mem: *mut i8, data: f32x4, mask: u8); + fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"] - fn vcompresspd_mem(mem: *mut i8, data: f64x8, mask: u8); + fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"] - fn vcompresspd256_mem(mem: *mut i8, data: f64x4, mask: u8); + fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8); #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"] - fn vcompresspd128_mem(mem: *mut i8, data: f64x2, mask: u8); + fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8); #[link_name = "llvm.x86.avx512.mask.expand.d.512"] fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
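For reference, a minimal usage sketch of the new intrinsics (not part of the patch series above): it compacts the strictly positive lanes of an i32 slice by building a writemask with _mm512_cmpgt_epi32_mask and storing the selected lanes contiguously with _mm512_mask_compressstoreu_epi32. The helper name compress_positive and the chunking scheme are illustrative assumptions; it requires an AVX-512F-capable CPU, assumes input.len() is a multiple of 16, and assumes output is at least as long as input.

use core::arch::x86_64::*;

/// Hypothetical helper (illustration only): copy all strictly positive lanes
/// of `input` to the front of `output`, returning how many elements were
/// written. Assumes `input.len()` is a multiple of 16 and `output` is at
/// least as long as `input`.
#[target_feature(enable = "avx512f")]
unsafe fn compress_positive(input: &[i32], output: &mut [i32]) -> usize {
    let zero = _mm512_setzero_si512();
    let mut written = 0usize;
    for chunk in input.chunks_exact(16) {
        // Unaligned load of 16 lanes, then mark the lanes that are > 0.
        let v = _mm512_loadu_si512(chunk.as_ptr() as *const _);
        let m: __mmask16 = _mm512_cmpgt_epi32_mask(v, zero);
        // Contiguously store only the selected lanes at the current offset.
        _mm512_mask_compressstoreu_epi32(output.as_mut_ptr().add(written) as *mut _, m, v);
        written += m.count_ones() as usize;
    }
    written
}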