Implement _mm256_i32scatter_epi64 from AVX512VL

asomers · Amanieu · commit 7e2cdc675b92 · 2023-05-08T07:20:40.000+01:00
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
@@ -1519,7 +1519,7 @@
   * [x] [`_mm512_mask_i32scatter_epi64`]
   * [_] [`_mm_i32scatter_epi64`]//need i1
   * [_] [`_mm_mask_i32scatter_epi64`] //need i1
-  * [_] [`_mm256_i32scatter_epi64`] //need i1
+  * [x] [`_mm256_i32scatter_epi64`]
   * [_] [`_mm256_mask_i32scatter_epi64`] //need i1
   * [x] [`_mm512_i32scatter_ps`]
   * [x] [`_mm512_mask_i32scatter_ps`]
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
@@ -15757,6 +15757,26 @@ pub unsafe fn _mm512_mask_i32scatter_epi64<const SCALE: i32>(
     vpscatterdq(slice, mask, offsets, src, SCALE);
 }
 
+/// Scatter the four 64-bit integers in `src` into memory using the four 32-bit indices in `offsets`. Each element is stored at the address formed from `slice` plus its index multiplied by `SCALE` bytes (Intel's documentation names these `a`, `vindex`, `base_addr` and `scale`). `SCALE` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_i32scatter_epi64<const SCALE: i32>(
+    slice: *mut u8,
+    offsets: __m128i,
+    src: __m256i,
+) {
+    static_assert_imm8_scale!(SCALE); // presumably rejects an unsupported SCALE at compile time (macro defined elsewhere)
+    let src = src.as_i64x4(); // the `i64x4` form that `vpscatterdq256` takes
+    let neg_one = -1; // passed as the `mask: i8` argument with every bit set: this is the unmasked variant
+    let slice = slice as *mut i8; // the LLVM binding declares the base pointer as `*mut i8`
+    let offsets = offsets.as_i32x4(); // the `i32x4` form that `vpscatterdq256` takes
+    vpscatterdq256(slice, neg_one, offsets, src, SCALE); // bound to `llvm.x86.avx512.scattersiv4.di`
+}
+
 /// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116)
@@ -38307,6 +38327,8 @@ extern "C" {
     fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
     #[link_name = "llvm.x86.avx512.scatter.dpq.512"]
     fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);
+    #[link_name = "llvm.x86.avx512.scattersiv4.di"]
+    fn vpscatterdq256(slice: *mut i8, mask: i8, offsets: i32x4, src: i64x4, scale: i32);
 
     #[link_name = "llvm.x86.avx512.scatter.dpi.512"]
     fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32);
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
@@ -7551,6 +7551,20 @@ mod tests {
         assert_eq!(&arr[..], &expected[..],);
     }
 
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_i32scatter_epi64() {
+        let mut arr = [0i64; 64]; // destination buffer, zeroed so untouched slots are detectable
+        let index = _mm_setr_epi32(0, 16, 32, 48); // one index per 64-bit lane of `src`
+        let src = _mm256_setr_epi64x(1, 2, 3, 4);
+        // SCALE = 8 is the size of an i64 in bytes, so each index counts whole elements of `arr`
+        _mm256_i32scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, index, src);
+        let mut expected = [0i64; 64];
+        for i in 0..4 {
+            expected[i * 16] = (i + 1) as i64; // lane i holds i + 1 and its index is i * 16
+        }
+        assert_eq!(&arr[..], &expected[..],); // whole-buffer compare also checks nothing else was written
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_i64scatter_epi64() {
         let mut arr = [0i64; 128];