Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 183 additions & 1 deletion rpcs3/Emu/RSX/Common/BufferUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,42 +15,57 @@
#define SSE4_1_FUNC
#define AVX2_FUNC
#define AVX3_FUNC
#define AVX512_ICL_FUNC
#else
#define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
#define AVX2_FUNC __attribute__((__target__("avx2")))
#define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl")))
#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq")))
#endif // _MSC_VER

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) && defined(__AVX512BITALG__) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && defined(__AVX512VBMI2__) && defined(__AVX512VNNI__) && defined(__AVX512VPOPCNTDQ__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = true;
[[maybe_unused]] constexpr bool s_use_avx3 = true;
[[maybe_unused]] constexpr bool s_use_avx512_icl = true;
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = true;
[[maybe_unused]] constexpr bool s_use_avx3 = true;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(__AVX2__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = true;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(__SSE4_1__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = false;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(__SSSE3__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = false;
[[maybe_unused]] constexpr bool s_use_avx2 = false;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(ARCH_X64)
[[maybe_unused]] const bool s_use_ssse3 = utils::has_ssse3();
[[maybe_unused]] const bool s_use_sse4_1 = utils::has_sse41();
[[maybe_unused]] const bool s_use_avx2 = utils::has_avx2();
[[maybe_unused]] const bool s_use_avx3 = utils::has_avx512();
[[maybe_unused]] const bool s_use_avx512_icl = utils::has_avx512_icl();
#else
[[maybe_unused]] constexpr bool s_use_ssse3 = true; // Non x86
[[maybe_unused]] constexpr bool s_use_sse4_1 = true; // Non x86
[[maybe_unused]] constexpr bool s_use_avx2 = false;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#endif

const v128 s_bswap_u32_mask = v128::from32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
Expand Down Expand Up @@ -404,6 +419,153 @@ namespace
}
};



#if defined(ARCH_X64)

SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x)
{
return _mm_cvtsi128_si32(_mm_minpos_epu16(x));
}

SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x)
{
return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1))));
}

AVX512_ICL_FUNC
static
std::tuple<u16, u16, u32> upload_u16_swapped_avx512_icl_skip_restart(const void *src, void *dst, u32 count, u16 restart_index)
{
const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask);

auto src_stream = static_cast<const __m512*>(src);
auto dst_stream = static_cast<u16 *>(dst);

const __m512i restart = _mm512_set1_epi16(restart_index);
__m512i min = _mm512_set1_epi16(-1);
__m512i max = _mm512_set1_epi16(0);
const __m512i ones = _mm512_set1_epi16(-1);

int written = 0;

const auto iterations = count / 32;
for (u32 i = 0; i < iterations; i++)
{
const __m512i raw = _mm512_loadu_si512(src_stream++);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
const __mmask32 mask = _mm512_cmpneq_epi16_mask(restart, value);
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);

max = _mm512_mask_max_epu16(max, mask, max, value);
min = _mm512_mask_min_epu16(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);

const int processed = _mm_popcnt_u32(mask);
_mm512_storeu_si512(dst_stream, packed);
dst_stream += processed;
written += processed;
}

u32 remainder = count % 32;
if (remainder > 0)
{
const __mmask32 rem_mask = (1U << remainder) - 1;
const __m512i raw = _mm512_maskz_loadu_epi16(rem_mask, src_stream);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
const __mmask32 mask = _mm512_mask_cmpneq_epi16_mask(rem_mask, restart, value);

const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
max = _mm512_mask_max_epu16(max, mask, max, value);
min = _mm512_mask_min_epu16(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);

const int processed = _mm_popcnt_u32(mask);
const __mmask32 store_mask = (1U << processed) - 1;
_mm512_mask_storeu_epi16(dst_stream, store_mask, packed);
written += processed;
}

__m256i tmp256 = _mm512_extracti64x4_epi64(min, 1);
__m256i min2 = _mm512_castsi512_si256(min);
min2 = _mm256_min_epu16(min2, tmp256);
__m128i tmp = _mm256_extracti128_si256(min2, 1);
__m128i min3 = _mm256_castsi256_si128(min2);
min3 = _mm_min_epu16(min3, tmp);

tmp256 = _mm512_extracti64x4_epi64(max, 1);
__m256i max2 = _mm512_castsi512_si256(max);
max2 = _mm256_max_epu16(max2, tmp256);
tmp = _mm256_extracti128_si256(max2, 1);
__m128i max3 = _mm256_castsi256_si128(max2);
max3 = _mm_max_epu16(max3, tmp);

const u16 min_index = sse41_hmin_epu16(min3);
const u16 max_index = sse41_hmax_epu16(max3);

return std::make_tuple(min_index, max_index, written);
}

AVX3_FUNC
static
std::tuple<u32, u32, u32> upload_u32_swapped_avx3_skip_restart(const void *src, void *dst, u32 count, u32 restart_index)
{
const __m512i s_bswap_u32_mask512 = _mm512_broadcast_i32x4(s_bswap_u32_mask);

auto src_stream = static_cast<const __m512i*>(src);
auto dst_stream = static_cast<u32 *>(dst);

const __m512i restart = _mm512_set1_epi32(restart_index);
__m512i min = _mm512_set1_epi32(-1);
__m512i max = _mm512_set1_epi32(0);
const __m512i ones = _mm512_set1_epi32(-1);

int written = 0;

const u32 iterations = count / 16;
for (u32 i = 0; i < iterations; i++)
{
const __m512i raw = _mm512_loadu_si512(src_stream++);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);
const __mmask16 mask = _mm512_cmpneq_epi32_mask(restart, value);
const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);

max = _mm512_mask_max_epu32(max, mask, max, value);
min = _mm512_mask_min_epu32(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);

const int processed = _mm_popcnt_u32(mask);
_mm512_storeu_si512(dst_stream, packed);
dst_stream += processed;
written += processed;
}

u32 remainder = count % 16;
if (remainder > 0)
{
const __mmask16 rem_mask = (1U << remainder) - 1;
const __m512i raw = _mm512_maskz_loadu_epi32(rem_mask, src_stream);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);

const __mmask16 mask = _mm512_mask_cmpneq_epi32_mask(rem_mask, restart, value);
const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);
max = _mm512_mask_max_epu32(max, mask, max, value);
min = _mm512_mask_min_epu32(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);

const int processed = _mm_popcnt_u32(mask);
const __mmask16 store_mask = (1U << processed) - 1;
_mm512_mask_storeu_epi32(dst_stream, store_mask, packed);
written += processed;
}

u32 min_index = _mm512_reduce_min_epu32(min);
u32 max_index = _mm512_reduce_max_epu32(max);

return std::make_tuple(min_index, max_index, written);
}
#endif

template <typename T>
NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be_t<const T>> src, std::span<T> dst, T restart_index)
{
Expand All @@ -412,6 +574,26 @@ NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be
u32 written = 0;
u32 length = ::size32(src);

#if defined(ARCH_X64)
if constexpr (std::is_same_v<T, u16>)
{
if (s_use_avx512_icl)
{
std::tie(min_index, max_index, written) = upload_u16_swapped_avx512_icl_skip_restart(src.data(), dst.data(), length, restart_index);
return std::make_tuple(min_index, max_index, written);
}
}

if constexpr (std::is_same_v<T, u32>)
{
if (s_use_avx3)
{
std::tie(min_index, max_index, written) = upload_u32_swapped_avx3_skip_restart(src.data(), dst.data(), length, restart_index);
return std::make_tuple(min_index, max_index, written);
}
}
#endif

for (u32 i = written; i < length; ++i)
{
T index = src[i];
Expand Down
Loading