Implement avx512 compressstore intrinsics #1273


Merged: 3 commits, Jan 24, 2022

24 changes: 12 additions & 12 deletions crates/core_arch/avx512f.md
@@ -1629,18 +1629,18 @@
* [x] [`_mm_maskz_compress_pd`]
* [x] [`_mm256_mask_compress_pd`]
* [x] [`_mm256_maskz_compress_pd`]
* [ ] [`_mm512_mask_compressstoreu_epi32`] //need i1
* [_] [`_mm_mask_compressstoreu_epi32`] //need i1
* [_] [`_mm256_mask_compressstoreu_epi32`] //need i1
* [ ] [`_mm512_mask_compressstoreu_epi64`] //need i1
* [_] [`_mm_mask_compressstoreu_epi64`] //need i1
* [_] [`_mm256_mask_compressstoreu_epi64`] //need i1
* [ ] [`_mm512_mask_compressstoreu_ps`] //need i1
* [_] [`_mm_mask_compressstoreu_ps`] //need i1
* [_] [`_mm256_mask_compressstoreu_ps`] //need i1
* [ ] [`_mm512_mask_compressstoreu_pd`] //need i1
* [_] [`_mm_mask_compressstoreu_pd`] //need i1
* [_] [`_mm256_mask_compressstoreu_pd`] //need i1
* [x] [`_mm512_mask_compressstoreu_epi32`] //need i1
* [x] [`_mm_mask_compressstoreu_epi32`] //need i1
* [x] [`_mm256_mask_compressstoreu_epi32`] //need i1
* [x] [`_mm512_mask_compressstoreu_epi64`] //need i1
* [x] [`_mm_mask_compressstoreu_epi64`] //need i1
* [x] [`_mm256_mask_compressstoreu_epi64`] //need i1
* [x] [`_mm512_mask_compressstoreu_ps`] //need i1
* [x] [`_mm_mask_compressstoreu_ps`] //need i1
* [x] [`_mm256_mask_compressstoreu_ps`] //need i1
* [x] [`_mm512_mask_compressstoreu_pd`] //need i1
* [x] [`_mm_mask_compressstoreu_pd`] //need i1
* [x] [`_mm256_mask_compressstoreu_pd`] //need i1
* [x] [`_mm512_mask_expand_epi32`]
* [x] [`_mm512_maskz_expand_epi32`]
* [x] [`_mm_mask_expand_epi32`]
280 changes: 280 additions & 0 deletions crates/core_arch/src/x86/avx512f.rs
@@ -16156,6 +16156,126 @@ pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d {
transmute(vcompresspd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) {
Contributor Author:
Intel's intrinsics guide uses a void* for base_addr, while the LLVM intrinsics use an i8*. Using a pointer of the correct data type would be more ergonomic, but I'm not sure whether that might prevent using the intrinsics on actually unaligned data.

Member:
The convention here is to use *mut u8 where C uses void pointers. LLVM's i8 doesn't imply a sign, since LLVM IR integer types don't have signedness: LLVM's i8 is used for both Rust's u8 and i8.

Contributor Author:
I missed that convention when implementing the masked load/store intrinsics, and there are several more intrinsics that don't follow it either. I can take a look at adjusting the stdarch-verify test to catch this and at changing the existing type differences. Since AVX-512 is still unstable, that should be possible, I guess.

vcompressstored(base_addr as *mut _, a.as_i32x16(), k)
}
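
As a usage note for the pointer-type discussion above, here is a minimal, hypothetical caller sketch (not part of the diff): the destination is a typed i32 buffer whose pointer is cast to *mut u8 at the call site, mirroring C's void*. The helper name compressstore_demo is made up, and the intrinsic and SIMD types are assumed to be in scope as in core::arch::x86_64.

// Hypothetical caller sketch, not part of this PR.
#[target_feature(enable = "avx512f")]
unsafe fn compressstore_demo(dst: &mut [i32; 16], k: __mmask16, a: __m512i) {
    // compressstoreu performs an unaligned store, so the cast to *mut u8
    // neither imposes nor relies on any particular alignment of dst.
    _mm512_mask_compressstoreu_epi32(dst.as_mut_ptr() as *mut u8, k, a);
}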

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) {
vcompressstored256(base_addr as *mut _, a.as_i32x8(), k)
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) {
vcompressstored128(base_addr as *mut _, a.as_i32x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) {
vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) {
vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) {
vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) {
vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) {
vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) {
vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) {
vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) {
vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) {
vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k)
}
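
As a reading aid for the doc comments above, a scalar model of the compress-store semantics (illustrative only, not part of the diff): active elements are packed contiguously at the destination in lane order, and nothing is written beyond the popcount of the mask.

// Scalar model of compress-store for the 16-lane i32 case.
fn compressstore_scalar_model(dst: &mut [i32; 16], k: u16, a: [i32; 16]) {
    let mut next = 0;
    for (i, &value) in a.iter().enumerate() {
        if (k >> i) & 1 == 1 {
            // Only active lanes are written, contiguously from the start.
            dst[next] = value;
            next += 1;
        }
    }
    // dst[next..] is left untouched, matching the k = 0 cases in the tests below.
}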

/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316)
@@ -38007,6 +38127,34 @@ extern "C" {
#[link_name = "llvm.x86.avx512.mask.compress.pd.128"]
fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;

#[link_name = "llvm.x86.avx512.mask.compress.store.d.512"]
fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16);
Contributor Author:
Is there a better naming convention for these intrinsics? The asm mnemonic is the same for the memory and register forms.

Member:
The recommended way to figure this out is to look at what IR clang generates: https://godbolt.org/z/nvaxM4MGh

In this case it is calling the llvm.masked.compressstore.v2f64 intrinsic, which unfortunately can't be called directly from Rust because it uses an i1 vector, which can't be represented with Rust types.

This is the reason why #1254 implemented some of the AVX512 intrinsics using inline assembly instead. I think this is the right approach in this case as well.

Contributor Author (@jhorstmann, Jan 24, 2022):
These LLVM intrinsics seem to work, though, and I saw them used with plain integer masks in LLVM test cases: https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll#L8938 (I'm not sure whether there is official documentation for them, though). I wanted to avoid asm unless absolutely necessary.

Test failures in CI seem unrelated to the changes in this PR.
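
For context, a rough sketch (not what this PR does) of what the inline-asm route mentioned above might look like for the 512-bit epi32 case; it assumes Rust's default Intel asm syntax and the kreg/zmm_reg operand classes, and the function name is made up.

#[target_feature(enable = "avx512f")]
unsafe fn compressstoreu_epi32_asm_sketch(base_addr: *mut u8, k: __mmask16, a: __m512i) {
    // vpcompressd with a masked memory destination; the doubled braces around
    // {k} are escapes, so the emitted operand reads like `[rdi]{k1}`.
    core::arch::asm!(
        "vpcompressd [{p}]{{{k}}}, {a}",
        p = in(reg) base_addr,
        k = in(kreg) k,
        a = in(zmm_reg) a,
        options(nostack, preserves_flags),
    );
}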

#[link_name = "llvm.x86.avx512.mask.compress.store.d.256"]
fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.d.128"]
fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8);

#[link_name = "llvm.x86.avx512.mask.compress.store.q.512"]
fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.q.256"]
fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.q.128"]
fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8);

#[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"]
fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16);
#[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"]
fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"]
fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8);

#[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"]
fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"]
fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"]
fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8);

#[link_name = "llvm.x86.avx512.mask.expand.d.512"]
fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.expand.d.256"]
@@ -51357,6 +51505,138 @@ mod tests {
assert_eq_m128(r, e);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_epi32() {
let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let mut r = [0_i32; 16];
_mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i32; 16]);
_mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_epi32() {
let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
let mut r = [0_i32; 8];
_mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i32; 8]);
_mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_epi32() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = [0_i32; 4];
_mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i32; 4]);
_mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1, 2, 4, 0]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_epi64() {
let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
let mut r = [0_i64; 8];
_mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i64; 8]);
_mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_epi64() {
let a = _mm256_setr_epi64x(1, 2, 3, 4);
let mut r = [0_i64; 4];
_mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i64; 4]);
_mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1, 2, 4, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_epi64() {
let a = _mm_setr_epi64x(1, 2);
let mut r = [0_i64; 2];
_mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i64; 2]);
_mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b10, a);
assert_eq!(&r, &[2, 0]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_ps() {
let a = _mm512_setr_ps(
1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32,
13_f32, 14_f32, 15_f32, 16_f32,
);
let mut r = [0_f32; 16];
_mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_f32; 16]);
_mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
assert_eq!(
&r,
&[
2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32,
0_f32, 0_f32, 0_f32, 0_f32, 0_f32
]
);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_ps() {
let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32);
let mut r = [0_f32; 8];
_mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_f32; 8]);
_mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(
&r,
&[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32]
);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_ps() {
let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32);
let mut r = [0.; 4];
_mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 4]);
_mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_pd() {
let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
let mut r = [0.; 8];
_mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 8]);
_mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_pd() {
let a = _mm256_setr_pd(1., 2., 3., 4.);
let mut r = [0.; 4];
_mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 4]);
_mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1., 2., 4., 0.]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_pd() {
let a = _mm_setr_pd(1., 2.);
let mut r = [0.; 2];
_mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 2]);
_mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b10, a);
assert_eq!(&r, &[2., 0.]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_expand_epi32() {
let src = _mm512_set1_epi32(200);