|
2 | 2 | //!
|
3 | 3 | //! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
|
4 | 4 |
|
| 5 | +use crate::arch::asm; |
5 | 6 | use crate::core_arch::{simd::*, x86::*};
|
6 | 7 | use crate::intrinsics::simd::*;
|
7 | 8 |
|
@@ -490,9 +491,85 @@ pub unsafe fn _mm_cvtsbh_ss(a: u16) -> f32 {
|
490 | 491 | f32::from_bits((a as u32) << 16)
|
491 | 492 | }
|
492 | 493 |
|
/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
#[inline]
#[target_feature(enable = "avx512bf16,avx512vl,sse")]
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
    let mut dst: __m128bh;
    // Emit the 128-bit VCVTNEPS2BF16 directly; the four f32 lanes of `a` are
    // narrowed to BF16 results in the low half of `dst`.
    asm!(
        "vcvtneps2bf16 {dst}, {src}",
        // `lateout`: `dst` is written only after all inputs are consumed, so
        // the register allocator may reuse the source register for it.
        dst = lateout(xmm_reg) dst,
        src = in(xmm_reg) a,
        // No memory access, no stack use, no flag clobbers; `pure` allows the
        // compiler to CSE or drop the asm when its result is unused.
        options(pure, nomem, nostack, preserves_flags)
    );
    dst
}
| 512 | + |
/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
#[inline]
#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")]
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
    // Merge-masking: seed the destination with `src` so lanes whose mask bit
    // is clear keep their original value.
    let mut dst = src;
    asm!(
        // `{{`/`}}` are literal braces in the asm template, producing the
        // AVX-512 write-mask syntax, e.g. `vcvtneps2bf16 xmm0{k1}, xmm1`.
        "vcvtneps2bf16 {dst}{{{k}}},{src}",
        // `inlateout`: the destination register is also an input, because
        // masked-off lanes are read back from it (merge semantics).
        dst = inlateout(xmm_reg) dst,
        src = in(xmm_reg) a,
        k = in(kreg) k,
        options(pure, nomem, nostack, preserves_flags)
    );
    dst
}
| 533 | + |
/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
#[inline]
#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")]
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
    let mut dst: __m128bh;
    asm!(
        // `{{z}}` emits the literal `{z}` zeroing-mask suffix, e.g.
        // `vcvtneps2bf16 xmm0{k1}{z}, xmm1`: lanes with a clear mask bit are
        // zeroed, so — unlike the merge-masking variant — the destination is
        // write-only (`lateout`, not `inlateout`).
        "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}",
        dst = lateout(xmm_reg) dst,
        src = in(xmm_reg) a,
        k = in(kreg) k,
        options(pure, nomem, nostack, preserves_flags)
    );
    dst
}
| 554 | + |
| 555 | +/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point |
| 556 | +/// element, and store the result in dst. |
| 557 | +/// |
| 558 | +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh) |
| 559 | +#[inline] |
| 560 | +#[target_feature(enable = "avx512bf16,avx512vl")] |
| 561 | +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] |
| 562 | +pub unsafe fn _mm_cvtness_sbh(a: f32) -> u16 { |
| 563 | + simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0) |
| 564 | +} |
| 565 | + |
493 | 566 | #[cfg(test)]
|
494 | 567 | mod tests {
|
495 |
| - use crate::{core_arch::x86::*, mem::transmute}; |
| 568 | + use crate::core_arch::simd::u16x4; |
| 569 | + use crate::{ |
| 570 | + core_arch::x86::*, |
| 571 | + mem::{transmute, transmute_copy}, |
| 572 | + }; |
496 | 573 | use stdarch_test::simd_test;
|
497 | 574 |
|
498 | 575 | #[simd_test(enable = "avx512bf16,avx512vl")]
|
@@ -1836,4 +1913,37 @@ mod tests {
|
1836 | 1913 | let r = _mm_cvtsbh_ss(BF16_ONE);
|
1837 | 1914 | assert_eq!(r, 1.);
|
1838 | 1915 | }
|
| 1916 | + |
| 1917 | + #[simd_test(enable = "avx512bf16,avx512vl")] |
| 1918 | + unsafe fn test_mm_cvtneps_pbh() { |
| 1919 | + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
| 1920 | + let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a)); |
| 1921 | + let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); |
| 1922 | + assert_eq!(r, e); |
| 1923 | + } |
| 1924 | + |
| 1925 | + #[simd_test(enable = "avx512bf16,avx512vl")] |
| 1926 | + unsafe fn test_mm_mask_cvtneps_pbh() { |
| 1927 | + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
| 1928 | + let src = __m128bh(5, 6, 7, 8, !0, !0, !0, !0); |
| 1929 | + let k = 0b1010; |
| 1930 | + let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)); |
| 1931 | + let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR); |
| 1932 | + assert_eq!(r, e); |
| 1933 | + } |
| 1934 | + |
| 1935 | + #[simd_test(enable = "avx512bf16,avx512vl")] |
| 1936 | + unsafe fn test_mm_maskz_cvtneps_pbh() { |
| 1937 | + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
| 1938 | + let k = 0b1010; |
| 1939 | + let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)); |
| 1940 | + let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR); |
| 1941 | + assert_eq!(r, e); |
| 1942 | + } |
| 1943 | + |
| 1944 | + #[simd_test(enable = "avx512bf16,avx512vl")] |
| 1945 | + unsafe fn test_mm_cvtness_sbh() { |
| 1946 | + let r = _mm_cvtness_sbh(1.); |
| 1947 | + assert_eq!(r, BF16_ONE); |
| 1948 | + } |
1839 | 1949 | }
|
0 commit comments