Skip to content

Commit

Permalink
Update gpurt from commit b0e4674a
Browse files Browse the repository at this point in the history
Support cpsGlobal in DispatchRaysIndirect
First attempt at shared file validation
Move CopyAS to shadersClean
Use global atomic to get dispatch id
Fix line length limits for GPURT_ENABLE_GPU_DEBUG=ON
Updated stages for CmdWriteImmediate
gpurtDevice: Add public numprims calc function
[Continuations] Introduce helper structs for packed and unpacked Vpc values
Defines an RT IP enumeration inside GpuRT that is more stable (across driver versions) than the PAL equivalent
[Continuations] Remove SuspendWave mode
[Continuations] Revamp scheduling to not use wait masks
Add float3 versions of AmdExtD3DShaderIntrinsics_WaveClusterMax / AmdExtD3DShaderIntrinsics_WaveClusterMin
  • Loading branch information
qiaojbao committed Oct 30, 2024
1 parent 95c27c4 commit f2d96f1
Show file tree
Hide file tree
Showing 57 changed files with 578 additions and 346 deletions.
6 changes: 3 additions & 3 deletions backends/pal/gpurtPalBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,11 @@ void PalBackend::WriteImmediateSingle(
ImmediateDataWidth width
) const
{
// We want to use HwPipePreCs (ME) so that the writes do not occur before UAV barriers are done waiting.
// We want to use StagePostPrefetch (ME) so that the writes do not occur before UAV barriers are done waiting.
// Both internal barriers during the build and application barriers synchronizing access to acceleration
// structure memory wait at HwPipePreCs.
// structure memory wait at StagePostPrefetch.
GetCmdBuffer(cmdBuffer)->CmdWriteImmediate(
Pal::HwPipePoint::HwPipePreCs,
Pal::PipelineStageFlag::PipelineStagePostPrefetch,
value,
GpuRtToPalImmediateDataWidth(width),
destVa);
Expand Down
8 changes: 6 additions & 2 deletions cmake/GpuRtGenerateShaders.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ if (GPURT_ENABLE_GPU_DEBUG)
set(debugShaderDirectory "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/src/shaders/")
set(gpurtShaderSource ${GPURT_SHADER_SOURCE_FILES})
set(gpurtShadersSourceDir ${debugShaderDirectory})
set(gpurtShadersPreprocessInputFile "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/DebugPreprocessShadersInput.txt")
list(TRANSFORM gpurtShaderSource PREPEND "${debugShaderDirectory}")
set(preprocessArgs "")
foreach(originalSourceFile ${GPURT_SHADER_SOURCE_FILES})
Expand All @@ -84,10 +85,13 @@ if (GPURT_ENABLE_GPU_DEBUG)
list(APPEND preprocessArgs "${originalSourcePath}" "${newSourceFilePath}")
endforeach()
set(gpurtDebugPreprocessorScript "${gpurtToolsDir}/DebugPreprocessShaders.py")
configure_file("${gpurtToolsDir}/DebugPreprocessShadersInput.txt.in"
${gpurtShadersPreprocessInputFile}
)
add_custom_command(
OUTPUT ${gpurtShaderSource} ${gpurtDebugInfoFile}
DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript}
COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} ${preprocessArgs} ${gpurtDebugInfoFile}
DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript} ${gpurtShadersPreprocessInputFile}
COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} -i ${gpurtShadersPreprocessInputFile} -o ${gpurtDebugInfoFile}
)
else()
set(gpurtShaderSource "${originalShaderSource}")
Expand Down
14 changes: 12 additions & 2 deletions gpurt/gpurt.h
Original file line number Diff line number Diff line change
Expand Up @@ -1471,13 +1471,21 @@ class IDevice
// @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants
// @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory
// @param cpsMemoryBytes (in) Cps allocated memory size in bytes
//
// @return the required global memory allocation size in bytes
virtual void PatchDispatchRaysConstants(
DispatchRaysConstants* pDispatchRaysConstants,
const gpusize cpsMemoryGpuAddr,
const gpusize cpsMemoryBytes) = 0;

// Populates the GPU addresses in the InitExecuteIndirectConstants structure
//
// @param pInitExecuteIndirectConstants (in/out) Non-null pointer to a InitExecuteIndirectConstants
// @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory
// @param cpsMemoryBytes (in) Cps allocated memory size in bytes
virtual void PatchInitExecuteIndirectConstants(
GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants,
const gpusize cpsMemoryGpuAddr,
const gpusize cpsMemoryBytes) = 0;

//
// @param cpsVideoMem [in] Cps video memory
// @param cpsMemoryBytes [in] Cps allocated memory size in bytes
Expand Down Expand Up @@ -1630,6 +1638,8 @@ class IDevice
// Check if a build is a good candidate for ACE offload (typically barrier-free cases)
virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const = 0;

virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const = 0;

protected:

/// Client must create objects by explicitly calling CreateDevice method
Expand Down
36 changes: 23 additions & 13 deletions gpurt/gpurtDispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct DispatchRaysTopLevelData
uint32 accelStructTrackerSrd[MaxBufferSrdSize]; // Structured buffer SRD pointing to the accel struct tracker
};

#define DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID 48

// Dispatch rays constant buffer data (GPU structure). Note, using unaligned uint64_t in HLSL constant buffers requires
// -no-legacy-cbuf-layout for cpp style structure alignment to work. But currently that support is incomplete in DXC
// and until that is resolved we need to use uint32's explicitly.
Expand All @@ -74,7 +76,8 @@ struct DispatchRaysConstantData
uint32 hitGroupTableBaseAddressLo; // Hit group table base address low 32-bits
uint32 hitGroupTableBaseAddressHi; // Hit group table base address high 32-bits
uint32 hitGroupTableStrideInBytes; // Hit group table record byte stride
uint32 reserved0; // Reserved padding
uint32 cpsDispatchId; // Continuations DispatchId, written in the persistent mode.
// This value should not be read via constant buffer.
uint32 callableTableBaseAddressLo; // Callable shader table base address low 32-bits
uint32 callableTableBaseAddressHi; // Callable shader table base address high 32-bits
uint32 callableTableStrideInBytes; // Callable shader table byte stride
Expand All @@ -96,6 +99,8 @@ struct DispatchRaysConstantData
uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits
uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits
uint32 counterMask; // Mask for filtering ray history token
uint32 cpsDispatchIdAddressLo; // Continuations cpsDispatchId address low 32-bits
uint32 cpsDispatchIdAddressHi; // Continuations cpsDispatchId address high 32-bits
};
#pragma pack(pop)

Expand All @@ -109,6 +114,8 @@ struct DispatchRaysConstants
#if __cplusplus
static_assert((sizeof(DispatchRaysConstants) % sizeof(uint32)) == 0,
"DispatchRaysConstants is not dword-aligned");
static_assert(DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID == offsetof(DispatchRaysConstantData, cpsDispatchId),
"DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID mismatches to cpsDispatchId");

constexpr uint32 DispatchRaysConstantsDw = sizeof(DispatchRaysConstants) / sizeof(uint32);
#endif
Expand All @@ -132,6 +139,17 @@ struct InitExecuteIndirectUserData
// Constants for InitExecuteIndirect shader
struct InitExecuteIndirectConstants
{
#if __cplusplus
// Internal counter buffer SRDs
uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize];

// Internal acceleration structure tracker buffer SRD.
uint32 accelStructTrackerSrd[MaxBufferSrdSize];
#else
uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4];
uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4];
#endif

uint32 inputBytesPerDispatch; // Size of application indirect arguments
uint32 outputBytesPerDispatch; // Size of resulting driver internal arguments
uint32 bindingArgsSize; // Size of binding arguments in the app buffer preceding the dispatch
Expand Down Expand Up @@ -160,18 +178,10 @@ struct InitExecuteIndirectConstants
uint32 counterRayIdRangeBegin; // Counter ray ID range begin
uint32 counterRayIdRangeEnd; // Counter ray ID range end
uint32 cpsBackendStackSize; // Scratch memory used by a compiler backend, start at offset 0
uint32 padding0; // Padding for 16-byte alignment
uint32 cpsFrontendStackSize; // Scratch memory used by IR (Intermediate Representation), for a continuation passing shader

#if __cplusplus
// Internal counter buffer SRDs
uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize];

// Internal acceleration structure tracker buffer SRD.
uint32 accelStructTrackerSrd[MaxBufferSrdSize];
#else
uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4];
uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4];
#endif
uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits
uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits
};

constexpr uint32 InitExecuteIndirectConstantsDw = sizeof(InitExecuteIndirectConstants) / sizeof(uint32);
Expand All @@ -184,7 +194,7 @@ static_assert((MaxBufferSrdSize == 4), "Buffer SRD size changed, affected shader
#endif
static_assert((sizeof(InitExecuteIndirectConstants) % sizeof(uint32)) == 0,
"InitExecuteIndirectConstants is not dword-aligned");
}
} // namespace GpuRt
#endif

#endif
6 changes: 3 additions & 3 deletions src/gpurtBvhBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1629,7 +1629,7 @@ AccelStructHeader BvhBuilder::InitAccelStructHeader() const
header.geometryType = static_cast<uint32>(m_buildConfig.geometryType);
header.uuidLo = Util::LowPart(m_deviceSettings.accelerationStructureUUID);
header.uuidHi = Util::HighPart(m_deviceSettings.accelerationStructureUUID);
header.rtIpLevel = uint32(m_pDevice->GetRtIpLevel());
header.rtIpLevel = static_cast<uint32>(PalToGpuRtIpLevel(m_pDevice->GetRtIpLevel()));

if (m_buildConfig.topLevelBuild)
{
Expand Down Expand Up @@ -2313,8 +2313,8 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo(
// the build when performing the update causing page faults.
scratchDataSize = Util::Max(scratchDataSize, updateDataSize);

// Some applications crash when the driver reports 0 scratch size. Use 1 instead.
scratchDataSize = Util::Max(1u, scratchDataSize);
// Some applications crash when the driver reports 0 scratch size. Use 1 DWORD instead.
scratchDataSize = Util::Max(static_cast<uint32>(sizeof(uint32)), scratchDataSize);

prebuildInfo.scratchDataSizeInBytes = scratchDataSize;
prebuildInfo.updateScratchDataSizeInBytes = updateDataSize;
Expand Down
45 changes: 43 additions & 2 deletions src/gpurtDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,16 +467,36 @@ Pal::Result Device::InitializeCpsMemory(
return result;
}

//=====================================================================================================================
// Stores the CPS global memory base address into a constants structure as separate low/high 32-bit halves.
//
// ConstantsType must provide cpsGlobalMemoryAddressLo and cpsGlobalMemoryAddressHi members.
// cpsMemoryBytes is accepted for interface symmetry with the callers but is not read here.
template<typename ConstantsType>
void Device::PatchConstants(
    ConstantsType* pConstant,
    const gpusize  cpsMemoryGpuAddr,
    const gpusize  cpsMemoryBytes)
{
    // Split the 64-bit GPU virtual address once, then publish both halves.
    const auto addressLo = Util::LowPart(cpsMemoryGpuAddr);
    const auto addressHi = Util::HighPart(cpsMemoryGpuAddr);

    pConstant->cpsGlobalMemoryAddressLo = addressLo;
    pConstant->cpsGlobalMemoryAddressHi = addressHi;
}

//=====================================================================================================================
// Populates the GPU addresses in the DispatchRaysConstants structure
//
// @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants
// @param cpsMemoryGpuAddr       (in)     GPU address pointing to the beginning of cps memory
// @param cpsMemoryBytes         (in)     Cps allocated memory size in bytes
void Device::PatchDispatchRaysConstants(
    DispatchRaysConstants* pDispatchRaysConstants,
    const gpusize          cpsMemoryGpuAddr,
    const gpusize          cpsMemoryBytes)
{
    // PatchConstants writes constData.cpsGlobalMemoryAddressLo/Hi from cpsMemoryGpuAddr, so the fields are not
    // assigned a second time here. This keeps the function in step with PatchInitExecuteIndirectConstants.
    PatchConstants(&pDispatchRaysConstants->constData, cpsMemoryGpuAddr, cpsMemoryBytes);
}

//=====================================================================================================================
// Fills in the GPU address fields of an InitExecuteIndirectConstants structure by forwarding to the shared
// PatchConstants helper.
void Device::PatchInitExecuteIndirectConstants(
    GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants,
    const gpusize                        cpsMemoryGpuAddr,
    const gpusize                        cpsMemoryBytes)
{
    PatchConstants<GpuRt::InitExecuteIndirectConstants>(
        pInitExecuteIndirectConstants,
        cpsMemoryGpuAddr,
        cpsMemoryBytes);
}

//=====================================================================================================================
Expand Down Expand Up @@ -2125,6 +2145,27 @@ bool Device::ShouldUseGangedAceForBuild(
return shouldUseGangedAce;
}

// =====================================================================================================================
// Returns the primitive count for a set of acceleration structure build inputs.
//
// Top-level builds report inputElemCount directly (it is the number of instances). Bottom-level builds report the
// sum of the per-geometry primitive counts. Any other build type yields 0.
uint32 Device::CalculateBvhPrimitiveCount(
    const AccelStructBuildInputs& inputs
    ) const
{
    uint32 totalPrimCount = 0;

    if (inputs.type == AccelStructType::TopLevel)
    {
        // For top-level acceleration structure, inputElemCount represents the number of instances
        totalPrimCount = inputs.inputElemCount;
    }
    else if (inputs.type == AccelStructType::BottomLevel)
    {
        // Ask the client to convert each geometry description, then accumulate its primitive count.
        for (uint32 geometryIdx = 0; geometryIdx < inputs.inputElemCount; ++geometryIdx)
        {
            const Geometry geometry = m_clientCb.pfnConvertAccelStructBuildGeometry(inputs, geometryIdx);
            totalPrimCount += BvhBuilder::GetGeometryPrimCount(geometry);
        }
    }

    return totalPrimCount;
}

// =====================================================================================================================
const AccelStructBuildInputs Device::OverrideBuildInputs(
const AccelStructBuildInputs& inputs
Expand Down
56 changes: 54 additions & 2 deletions src/gpurtInternal.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,42 @@ enum EncodeFlags : uint32
EncodeFlagFusedInstanceNode = 0x00000008,
};

// GpuRT-owned ray tracing IP level enumeration (see PalToGpuRtIpLevel for the conversion from the PAL values).
// Values should remain stable for RRA binary-compatibility (PAL equivalents do not guarantee stability)
enum RtIpLevel : uint32
{
RtIpNone = 0x0, ///< The device does not have a RayTracing IP level
RtIp1_0 = 0x1, ///< First implementation of HW RT
RtIp1_1 = 0x2, ///< Added computation of triangle barycentrics into HW
RtIp2_0 = 0x3, ///< Added more hardware RayTracing features, such as BoxSort, PointerFlag, etc
RtIpReserved = 0x5, ///< Special value, should not be used (note: 0x4 is not assigned in this enum)
};

// =====================================================================================================================
// Convert PAL RtIpLevel values to their GpuRT equivalent
//
// @param palRtIpLevel (in) PAL ray tracing IP level to translate
//
// @return the matching GpuRT RtIpLevel; PAL None and any PAL value without a mapping below yield RtIpNone
//
// Declared inline (not static) because this definition lives in a header: a static function would be duplicated
// in every translation unit that includes it and flagged as unused wherever it is not called.
inline RtIpLevel PalToGpuRtIpLevel(Pal::RayTracingIpLevel palRtIpLevel)
{
    RtIpLevel gpuRtIpLevel = RtIpLevel::RtIpNone;

    switch (palRtIpLevel)
    {
    case Pal::RayTracingIpLevel::RtIp1_0:
        gpuRtIpLevel = RtIpLevel::RtIp1_0;
        break;
    case Pal::RayTracingIpLevel::RtIp1_1:
        gpuRtIpLevel = RtIpLevel::RtIp1_1;
        break;
    case Pal::RayTracingIpLevel::RtIp2_0:
        gpuRtIpLevel = RtIpLevel::RtIp2_0;
        break;
    case Pal::RayTracingIpLevel::None:
    default:
        // Unknown or newer PAL levels deliberately fall back to "none" rather than leaking an unstable PAL value.
        gpuRtIpLevel = RtIpLevel::RtIpNone;
        break;
    }

    return gpuRtIpLevel;
}

struct RadixSortConfig
{
uint32 workGroupSize;
Expand Down Expand Up @@ -336,13 +372,21 @@ class Device : public IDevice
// @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants
// @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory
// @param cpsMemoryBytes (in) Cps allocated memory size in bytes
//
// @return the required global memory allocation size in bytes
virtual void PatchDispatchRaysConstants(
DispatchRaysConstants* pDispatchRaysConstants,
const gpusize cpsMemoryGpuAddr,
const gpusize cpsMemoryBytes) override;

// Populates the GPU addresses in the InitExecuteIndirectConstants structure
//
// @param pInitExecuteIndirectConstants (in/out) Non-null pointer to a InitExecuteIndirectConstants
// @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory
// @param cpsMemoryBytes (in) Cps allocated memory size in bytes
virtual void PatchInitExecuteIndirectConstants(
GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants,
const gpusize cpsMemoryGpuAddr,
const gpusize cpsMemoryBytes) override;

//
// @param cpsVideoMem [in] Cps video memory
// @param cpsMemoryBytes [in] Cps allocated memory size in bytes
Expand Down Expand Up @@ -683,6 +727,8 @@ class Device : public IDevice

virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const override;

virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const override;

// Returns size in DWORDs of a typed buffer view SRD
uint32 GetTypedBufferSrdSizeDw() const { return m_typedBufferSrdSizeDw; };

Expand Down Expand Up @@ -722,6 +768,12 @@ class Device : public IDevice

virtual ~Device() override;

template<typename ConstantsType>
void PatchConstants(
ConstantsType* pConstant,
const gpusize cpsMemoryGpuAddr,
const gpusize cpsMemoryBytes);

DeviceInitInfo m_info;

Util::GenericAllocatorTracked m_allocator;
Expand Down
4 changes: 2 additions & 2 deletions src/gpurtTraceSource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void AccelStructTraceSource::OnTraceBegin(
if (m_pDevice->AccelStructTrackerGpuAddr() != 0)
{
// Before starting the trace set tracking to enabled.
pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 1, Pal::ImmediateDataWidth::ImmediateData32Bit,
pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 1, Pal::ImmediateDataWidth::ImmediateData32Bit,
m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled));
m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite);
}
Expand All @@ -67,7 +67,7 @@ void AccelStructTraceSource::OnTraceEnd(
if (m_pDevice->AccelStructTrackerGpuAddr() != 0)
{
// Disable tracking.
pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 0, Pal::ImmediateDataWidth::ImmediateData32Bit,
pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 0, Pal::ImmediateDataWidth::ImmediateData32Bit,
m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled));
m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite);
}
Expand Down
6 changes: 3 additions & 3 deletions src/options.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ enum CpsCandidatePrimitiveMode:
# Controls how candidate primitives are handled in the continuations (CPS) software Traversal loop.
SuspendLane: # Suspend a lane upon candidate hits and wait for other lanes to end the Traversal loop.
# This is the default. Other modes are experimental and might not be implemented on all RtIps.
SuspendWave: # On each Traversal iteration, check whether any lane has a candidate, and break if so.
# Only implemented for RtIp 2.0, all other cases use SuspendLane.
DeferFirst: # When finding the first candidate, record it and ignore it for the time being. At the end of the
# Traversal loop, process pending candidates. When finding the second candidate, immediately break
# out of the loop to first process the first one.
# Only implemented for triangle primitives on RtIp 2.0, all other cases use SuspendLane.
# Implementation status:
# * RtIp 1.1: Not supported, SuspendLane is always used.
# * RtIp 2.0: DeferFirst is supported, but only for triangle primitives.

# ------------------------------------------------------------------------------------------------------------------
# This is the definition of the single options struct.
Expand Down
2 changes: 1 addition & 1 deletion src/shaders/BuildBVH.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
**********************************************************************************************************************/
#if NO_SHADER_ENTRYPOINT == 0
//=====================================================================================================================
#include "../shared/rayTracingDefs.h"
#include "../shadersClean/common/ShaderDefs.hlsli"

#include "BuildRootSignature.hlsl"

Expand Down
2 changes: 1 addition & 1 deletion src/shaders/BuildBVHTDTR.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ struct StateTDBuild
#define USE_LDS 1

//=====================================================================================================================
#include "../shared/rayTracingDefs.h"
#include "../shadersClean/common/ShaderDefs.hlsli"

#define GC_DSTMETADATA
#define GC_SCRATCHBUFFER
Expand Down
Loading

0 comments on commit f2d96f1

Please sign in to comment.