Implement avx512 compressstore intrinsics #1273
@@ -16156,6 +16156,126 @@ pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d {
    transmute(vcompresspd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) {
    vcompressstored(base_addr as *mut _, a.as_i32x16(), k)
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) {
    vcompressstored256(base_addr as *mut _, a.as_i32x8(), k)
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) {
    vcompressstored128(base_addr as *mut _, a.as_i32x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) {
    vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) {
    vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) {
    vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) {
    vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) {
    vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) {
    vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) {
    vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) {
    vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) {
    vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k)
}

/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316)
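To make the doc-comment wording above concrete ("contiguously store the active ... elements"), here is a minimal usage sketch against the signatures added in this hunk. The helper name and the filtering criterion are made up for illustration; the intrinsics themselves (`_mm512_loadu_epi32`, `_mm512_cmpgt_epi32_mask`, `_mm512_setzero_si512`) are existing `core::arch::x86_64` functions.

```rust
use core::arch::x86_64::*;

/// Hypothetical helper (not part of this PR): keep only the positive lanes of a
/// 16-lane i32 vector, pack them to the front of `out`, and return how many
/// lanes were written.
#[target_feature(enable = "avx512f")]
unsafe fn compress_positive(values: &[i32; 16], out: &mut [i32; 16]) -> u32 {
    let v = _mm512_loadu_epi32(values.as_ptr());
    // One mask bit per lane: bit i is set when values[i] > 0.
    let k: __mmask16 = _mm512_cmpgt_epi32_mask(v, _mm512_setzero_si512());
    // Only the selected lanes are stored, contiguously from the start of `out`;
    // the remaining entries of `out` are left untouched.
    _mm512_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, k, v);
    // The number of stored elements equals the popcount of the mask.
    k.count_ones()
}
```

The mask drives both the selection and, via its popcount, the count of elements written, which is the usual pattern for building filtered or compacted arrays with these instructions.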
@@ -38007,6 +38127,34 @@ extern "C" {
    #[link_name = "llvm.x86.avx512.mask.compress.pd.128"]
    fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"]
    fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16);
Review comment: Is there a better naming convention for these intrinsics? The asm mnemonic is the same for the mem and reg operations.

Reply: The recommended way to figure this out is to look at what IR clang generates: https://godbolt.org/z/nvaxM4MGh — in this case it is calling the […]. This is the reason why #1254 implemented some of the AVX512 intrinsics using inline assembly instead. I think this is the right approach in this case as well.

Reply: These LLVM intrinsics seem to work, though, and I saw them used with plain integer masks in LLVM test cases: https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll#L8938 (not sure whether there is official documentation for them, though). I wanted to avoid asm unless absolutely necessary. The test failures in CI seem unrelated to the changes in this PR.
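For comparison, a rough sketch of what the inline-assembly route mentioned above (the approach #1254 took for some AVX512 intrinsics) might look like for the 512-bit epi32 case. This is an illustration, not code from this PR: the function name is hypothetical, and it assumes Rust's `asm!` with the `kreg`/`zmm_reg` operand classes.

```rust
use core::arch::asm;
use core::arch::x86_64::{__m512i, __mmask16};

#[target_feature(enable = "avx512f")]
unsafe fn mm512_mask_compressstoreu_epi32_asm(base_addr: *mut u8, k: __mmask16, a: __m512i) {
    // vpcompressd with a memory destination: lanes whose mask bit is set are
    // packed and written contiguously starting at base_addr; masked-off lanes
    // are not written at all.
    asm!(
        "vpcompressd [{base}]{{{k}}}, {a}",
        base = in(reg) base_addr,
        k = in(kreg) k,
        a = in(zmm_reg) a,
        options(nostack),
    );
}
```

The trade-off in the thread is between relying on the `llvm.x86.avx512.mask.compress.store.*` intrinsic names (the linked test file, `avx512-intrinsics-upgrade.ll`, suggests these are legacy names that LLVM auto-upgrades) and pinning the exact instruction with `asm!`.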
    #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"]
    fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"]
    fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"]
    fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.q.256"]
    fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"]
    fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"]
    fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"]
    fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"]
    fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"]
    fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"]
    fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"]
    fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.expand.d.512"]
    fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.expand.d.256"]
@@ -51357,6 +51505,138 @@ mod tests {
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_epi32() {
        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let mut r = [0_i32; 16];
        _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i32; 16]);
        _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
        assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
    }
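To spell out the expected array in the test above: the mask literal is read with bit 0 as the least significant bit, and bit i selects element i of the setr vector. Worked out for this case:

```rust
// k = 0b1111_0000_1100_1010
// set bits (LSB first): 1, 3, 6, 7, 12, 13, 14, 15
// selected lanes:       a[1]=2, a[3]=4, a[6]=7, a[7]=8, a[12]=13, a[13]=14, a[14]=15, a[15]=16
// packed to the front:  [2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]
// (the trailing zeros are simply the untouched initial contents of r)
```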
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_epi32() {
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = [0_i32; 8];
        _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i32; 8]);
        _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = [0_i32; 4];
        _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i32; 4]);
        _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1, 2, 4, 0]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_epi64() {
        let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = [0_i64; 8];
        _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i64; 8]);
        _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_epi64() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = [0_i64; 4];
        _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i64; 4]);
        _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1, 2, 4, 0]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_epi64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = [0_i64; 2];
        _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i64; 2]);
        _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b10, a);
        assert_eq!(&r, &[2, 0]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_ps() {
        let a = _mm512_setr_ps(
            1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32,
            13_f32, 14_f32, 15_f32, 16_f32,
        );
        let mut r = [0_f32; 16];
        _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_f32; 16]);
        _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
        assert_eq!(
            &r,
            &[
                2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32,
                0_f32, 0_f32, 0_f32, 0_f32, 0_f32
            ]
        );
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_ps() {
        let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32);
        let mut r = [0_f32; 8];
        _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_f32; 8]);
        _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(
            &r,
            &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32]
        );
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_ps() {
        let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32);
        let mut r = [0.; 4];
        _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 4]);
        _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_pd() {
        let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut r = [0.; 8];
        _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 8]);
        _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut r = [0.; 4];
        _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 4]);
        _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1., 2., 4., 0.]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_pd() {
        let a = _mm_setr_pd(1., 2.);
        let mut r = [0.; 2];
        _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 2]);
        _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b10, a);
        assert_eq!(&r, &[2., 0.]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expand_epi32() {
        let src = _mm512_set1_epi32(200);
Review comment: Intel's intrinsics guide uses a `void*` for `base_addr`, while the LLVM intrinsics use an `i8*`. Using a pointer of the correct data type would be more ergonomic, but I'm not sure whether that might prevent using the intrinsics on actually unaligned data.

Reply: The convention here is to use `*mut u8` where C uses void pointers. LLVM's `i8` doesn't mean anything, since LLVM IR types don't have signs: LLVM's `i8` is used for both of Rust's `u8` and `i8`.

Reply: I missed that convention when implementing the masked load/store intrinsics, and there are several more intrinsics that don't follow it either. I can take a look at adjusting the `stdarch-verify` test to catch this and change the existing type differences. Since AVX512 is still unstable, that should be possible.