diff --git a/hipamd/src/CMakeLists.txt b/hipamd/src/CMakeLists.txt index 8c25a0abea..acd26adc20 100644 --- a/hipamd/src/CMakeLists.txt +++ b/hipamd/src/CMakeLists.txt @@ -156,6 +156,13 @@ target_include_directories(amdhip64 target_compile_definitions(amdhip64 PRIVATE __HIP_PLATFORM_AMD__) target_link_libraries(amdhip64 PRIVATE ${OPENGL_LIBRARIES}) target_link_libraries(amdhip64 PRIVATE ${CMAKE_DL_LIBS}) + +# Link against SIMDe pseudo-target, even though it is header-only, to ensure +# <> inclusion. +find_package(PkgConfig REQUIRED) +pkg_check_modules(simde REQUIRED IMPORTED_TARGET simde) +target_link_libraries(amdhip64 PRIVATE PkgConfig::simde) + # Add link to comgr, hsa-runtime and other required libraries in target files # This is required for static libraries if(NOT BUILD_SHARED_LIBS) diff --git a/hipamd/src/hip_embed_pch.sh b/hipamd/src/hip_embed_pch.sh index 6c92d43884..4593f76444 100755 --- a/hipamd/src/hip_embed_pch.sh +++ b/hipamd/src/hip_embed_pch.sh @@ -142,19 +142,20 @@ __hip_pch_wave64_size: .long __hip_pch_wave64_size - __hip_pch_wave64 EOF + host_triple="$(uname -m)" set -x $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui && cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui && - $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui && + $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui && $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui && cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui && - $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui && + $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui && $LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj && diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index 20f60a4c3f..14f976402e 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -19,6 +19,9 @@ THE SOFTWARE. */ #include "hip_graph_internal.hpp" + +#include + #include #define CASE_STRING(X, C) \ @@ -806,9 +809,9 @@ void GraphKernelArgManager::ReadBackOrFlush() { address dev_ptr = kernarg_graph_.back().kernarg_pool_addr_ + kernarg_graph_.back().kernarg_pool_size_; auto kSentinel = *reinterpret_cast(dev_ptr - 1); - _mm_sfence(); + simde_mm_sfence(); *(dev_ptr - 1) = kSentinel; - _mm_mfence(); + simde_mm_mfence(); kSentinel = *reinterpret_cast(dev_ptr - 1); } } diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index f4e5799afc..437e01a6dc 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -38,6 +38,10 @@ #include "hsa/amd_hsa_queue.h" #include "hsa/amd_hsa_signal.h" +#include +#include +#include + #include #include #include @@ -45,14 +49,6 @@ #include #include -#if defined(__AVX__) -#if defined(__MINGW64__) -#include -#else -#include -#endif -#endif - /** * HSA image object size in bytes (see HSAIL spec) */ @@ -3261,49 +3257,44 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) __attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy( void* __restrict dst, const void* __restrict src, size_t size) { -#if defined(ATI_ARCH_X86) -#if defined(__AVX512F__) - for (auto i = 0u; i != size / sizeof(__m512i); ++i) { - _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++, - *reinterpret_cast(src)++); +#if defined(__AVX512F__) && false // Disable until SIMDe adds support. + for (auto i = 0u; i != size / sizeof(simde__m512i); ++i) { + simde_mm512_stream_si512(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(__m512i); + size = size % sizeof(simde__m512i); #endif #if defined(__AVX__) - for (auto i = 0u; i != size / sizeof(__m256i); ++i) { - _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(simde__m256i); ++i) { + simde_mm256_stream_si256(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(__m256i); + size = size % sizeof(simde__m256i); #endif - - for (auto i = 0u; i != size / sizeof(__m128i); ++i) { - _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++, - *(reinterpret_cast(src)++)); + for (auto i = 0u; i != size / sizeof(simde__m128i); ++i) { + simde_mm_stream_si128(reinterpret_cast(dst)++, + *(reinterpret_cast(src)++)); } - size = size % sizeof(__m128i); + size = size % sizeof(simde__m128i); - for (auto i = 0u; i != size / sizeof(long long); ++i) { - _mm_stream_si64(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(int64_t); ++i) { + simde_mm_stream_si64(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(long long); + size = size % sizeof(int64_t); - for (auto i = 0u; i != size / sizeof(int); ++i) { - _mm_stream_si32(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(int32_t); ++i) { + simde_mm_stream_si32(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(int); + size = size % sizeof(int32_t); // Copy remaining bytes for unaligned size std::memcpy(dst, src, size); // Add memory fence - _mm_sfence(); -#else - std::memcpy(dst, src, size); -#endif + simde_mm_sfence(); } void VirtualGPU::HiddenHeapInit() { const_cast(dev()).HiddenHeapInit(*this); } @@ -3555,9 +3546,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, *dev().info().hdpMemFlushCntl = 1u; auto kSentinel = *reinterpret_cast(dev().info().hdpMemFlushCntl); } else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) { - _mm_sfence(); + simde_mm_sfence(); *(argBuffer + argSize - 1) = *(parameters + argSize - 1); - _mm_mfence(); + simde_mm_mfence(); auto kSentinel = *reinterpret_cast(argBuffer + argSize - 1); } } diff --git a/rocclr/os/os.cpp b/rocclr/os/os.cpp index e15800a7f7..85fd6436b6 100644 --- a/rocclr/os/os.cpp +++ b/rocclr/os/os.cpp @@ -31,9 +31,7 @@ #include #endif // !_WIN32 -#if defined(ATI_ARCH_X86) -#include // for _mm_pause -#endif // ATI_ARCH_X86 +#include namespace amd { @@ -120,11 +118,7 @@ size_t Os::pageSize_ = 0; int Os::processorCount_ = 0; void Os::spinPause() { -#if defined(ATI_ARCH_X86) - _mm_pause(); -#elif defined(ATI_ARCH_ARM) - __asm__ __volatile__("yield"); -#endif + simde_mm_pause(); } void Os::sleep(long n) {