
Commit f4c5507

Implement avx512 compressstore intrinsics (#1273)
1 parent: af80aef

File tree: 2 files changed (+292, -12 lines)

crates/core_arch/avx512f.md (+12, -12)
@@ -1629,18 +1629,18 @@
 * [x] [`_mm_maskz_compress_pd`]
 * [x] [`_mm256_mask_compress_pd`]
 * [x] [`_mm256_maskz_compress_pd`]
-* [ ] [`_mm512_mask_compressstoreu_epi32`] //need i1
-* [_] [`_mm_mask_compressstoreu_epi32`] //need i1
-* [_] [`_mm256_mask_compressstoreu_epi32`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_epi64`] //need i1
-* [_] [`_mm_mask_compressstoreu_epi64`] //need i1
-* [_] [`_mm256_mask_compressstoreu_epi64`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_ps`] //need i1
-* [_] [`_mm_mask_compressstoreu_ps`] //need i1
-* [_] [`_mm256_mask_compressstoreu_ps`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_pd`] //need i1
-* [_] [`_mm_mask_compressstoreu_pd`] //need i1
-* [_] [`_mm256_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm512_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm256_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm512_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm256_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm512_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm256_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm512_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm256_mask_compressstoreu_pd`] //need i1
 * [x] [`_mm512_mask_expand_epi32`]
 * [x] [`_mm512_maskz_expand_epi32`]
 * [x] [`_mm_mask_expand_epi32`]

crates/core_arch/src/x86/avx512f.rs (+280)
@@ -16156,6 +16156,126 @@ pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d {
     transmute(vcompresspd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
 }

+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) {
+    vcompressstored(base_addr as *mut _, a.as_i32x16(), k)
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) {
+    vcompressstored256(base_addr as *mut _, a.as_i32x8(), k)
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) {
+    vcompressstored128(base_addr as *mut _, a.as_i32x4(), k)
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) {
+    vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k)
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) {
+    vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k)
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) {
+    vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k)
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) {
+    vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k)
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) {
+    vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k)
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) {
+    vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k)
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) {
+    vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k)
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) {
+    vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k)
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) {
+    vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k)
+}
+
 /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316)
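
As a usage illustration (not part of this commit): the compress-store intrinsics pair naturally with a compare mask, packing only the selected lanes at the front of a buffer. A minimal sketch, assuming the new function is re-exported from core::arch::x86_64 like the other avx512f intrinsics; compress_gt_zero is a hypothetical helper name.

// Sketch only: store the lanes of `a` that are greater than zero, packed at the
// start of `out`, and report how many lanes were written. Assumes the caller has
// verified avx512f support at runtime.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn compress_gt_zero(a: core::arch::x86_64::__m512i, out: &mut [i32; 16]) -> usize {
    use core::arch::x86_64::*;
    // One mask bit per 32-bit lane where a[i] > 0.
    let k: __mmask16 = _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512());
    // Contiguously store only the selected lanes; the rest of `out` is untouched.
    _mm512_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut u8, k, a);
    k.count_ones() as usize
}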
@@ -38007,6 +38127,34 @@ extern "C" {
     #[link_name = "llvm.x86.avx512.mask.compress.pd.128"]
     fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;

+    #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"]
+    fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"]
+    fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"]
+    fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8);
+
+    #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"]
+    fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.q.256"]
+    fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"]
+    fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8);
+
+    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"]
+    fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"]
+    fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"]
+    fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8);
+
+    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"]
+    fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"]
+    fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"]
+    fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8);
+
     #[link_name = "llvm.x86.avx512.mask.expand.d.512"]
     fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
     #[link_name = "llvm.x86.avx512.mask.expand.d.256"]
@@ -51357,6 +51505,138 @@ mod tests {
         assert_eq_m128(r, e);
     }

+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_compressstoreu_epi32() {
+        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let mut r = [0_i32; 16];
+        _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_i32; 16]);
+        _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
+        assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_mask_compressstoreu_epi32() {
+        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let mut r = [0_i32; 8];
+        _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_i32; 8]);
+        _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b11001010, a);
+        assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm_mask_compressstoreu_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let mut r = [0_i32; 4];
+        _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_i32; 4]);
+        _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1011, a);
+        assert_eq!(&r, &[1, 2, 4, 0]);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_compressstoreu_epi64() {
+        let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+        let mut r = [0_i64; 8];
+        _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_i64; 8]);
+        _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b11001010, a);
+        assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_mask_compressstoreu_epi64() {
+        let a = _mm256_setr_epi64x(1, 2, 3, 4);
+        let mut r = [0_i64; 4];
+        _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_i64; 4]);
+        _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b1011, a);
+        assert_eq!(&r, &[1, 2, 4, 0]);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm_mask_compressstoreu_epi64() {
+        let a = _mm_setr_epi64x(1, 2);
+        let mut r = [0_i64; 2];
+        _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_i64; 2]);
+        _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b10, a);
+        assert_eq!(&r, &[2, 0]);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_compressstoreu_ps() {
+        let a = _mm512_setr_ps(
+            1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32,
+            13_f32, 14_f32, 15_f32, 16_f32,
+        );
+        let mut r = [0_f32; 16];
+        _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_f32; 16]);
+        _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
+        assert_eq!(
+            &r,
+            &[
+                2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32,
+                0_f32, 0_f32, 0_f32, 0_f32, 0_f32
+            ]
+        );
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_mask_compressstoreu_ps() {
+        let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32);
+        let mut r = [0_f32; 8];
+        _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0_f32; 8]);
+        _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b11001010, a);
+        assert_eq!(
+            &r,
+            &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32]
+        );
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm_mask_compressstoreu_ps() {
+        let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32);
+        let mut r = [0.; 4];
+        _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0.; 4]);
+        _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1011, a);
+        assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_compressstoreu_pd() {
+        let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+        let mut r = [0.; 8];
+        _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0.; 8]);
+        _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b11001010, a);
+        assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_mask_compressstoreu_pd() {
+        let a = _mm256_setr_pd(1., 2., 3., 4.);
+        let mut r = [0.; 4];
+        _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0.; 4]);
+        _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b1011, a);
+        assert_eq!(&r, &[1., 2., 4., 0.]);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm_mask_compressstoreu_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let mut r = [0.; 2];
+        _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
+        assert_eq!(&r, &[0.; 2]);
+        _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b10, a);
+        assert_eq!(&r, &[2., 0.]);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_expand_epi32() {
         let src = _mm512_set1_epi32(200);
