Skip to content

Commit

Permalink
Remove copy from hsMatrix2SIMD in Metal renderer
Browse files Browse the repository at this point in the history
hsMatrix2SIMD used to transpose the input hsMatrix44, but the shaders and calling code were rewritten to make that unnecessary. hsMatrix2SIMD still performs a vestigial memory copy; this commit removes that copy.

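This changes the meaning of hsMatrix2SIMD: it now returns a pointer obtained by directly casting the source matrix's storage, so functions that need a copy now make one manually (by dereferencing the result); everywhere else the cast pointer is used directly. hsMatrix2SIMD has also been made constexpr.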

This should result in a small (~1%) performance improvement in the Metal renderer main loop.
  • Loading branch information
colincornaby committed Jan 1, 2025
1 parent c4cbb98 commit 14d89ad
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 45 deletions.
22 changes: 5 additions & 17 deletions Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,6 @@ static inline uint8_t* inlStuff(uint8_t* dst, const T* val)
return reinterpret_cast<uint8_t*>(ptr);
}

// Copying form of the hsMatrix44 -> SIMD conversion.
// Fills `dst` with the identity matrix when `src` carries the kIsIdent flag,
// otherwise with a verbatim (untransposed) copy of src.fMap. Returns `dst`.
matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst)
{
    const bool isIdentity = (src.fFlags & hsMatrix44::kIsIdent) != 0;

    // Choose where the bytes come from, then perform one copy.
    const void* from = &src.fMap;
    if (isIdentity)
        from = &matrix_identity_float4x4;

    memcpy(dst, from, sizeof(matrix_float4x4));
    return dst;
}

bool plMetalDevice::InitDevice()
{
// FIXME: Should Metal adopt InitDevice like OGL?
Expand Down Expand Up @@ -972,25 +960,25 @@ void plMetalDevice::MakeCubicTextureRef(plMetalDevice::TextureRef* tRef, plCubic

// Stores `src` as the device's projection matrix (fMatrixProj).
// NOTE(review): this listing is a diff view whose +/- markers were stripped, so two
// forms of the same store appear back to back. Presumably the first line is the
// removed two-argument copy call and the second is its replacement, which
// dereferences the pointer returned by the one-argument hsMatrix2SIMD to copy the
// matrix by value — confirm against the real file before editing.
void plMetalDevice::SetProjectionMatrix(const hsMatrix44& src)
{
hsMatrix2SIMD(src, &fMatrixProj);
fMatrixProj = *hsMatrix2SIMD(src);
}

// Stores `src` as the world-to-camera matrix (fMatrixW2C) and its inverse as the
// camera-to-world matrix (fMatrixC2W).
// NOTE(review): this listing is a diff view whose +/- markers were stripped; the
// two-argument hsMatrix2SIMD calls are presumably the removed lines and the
// dereferencing assignments their replacements — confirm against the real file.
void plMetalDevice::SetWorldToCameraMatrix(const hsMatrix44& src)
{
hsMatrix44 inv;
src.GetInverse(&inv);

hsMatrix2SIMD(src, &fMatrixW2C);
hsMatrix2SIMD(inv, &fMatrixC2W);
// Dereferencing copies the matrix by value, so the member does not alias the
// local `inv` after this function returns.
fMatrixW2C = *hsMatrix2SIMD(src);
fMatrixC2W = *hsMatrix2SIMD(inv);
}

// Stores `src` as the local-to-world matrix (fMatrixL2W) and its inverse as the
// world-to-local matrix (fMatrixW2L).
// NOTE(review): this listing is a diff view whose +/- markers were stripped; the
// two-argument hsMatrix2SIMD calls are presumably the removed lines and the
// dereferencing assignments their replacements — confirm against the real file.
void plMetalDevice::SetLocalToWorldMatrix(const hsMatrix44& src)
{
hsMatrix44 inv;
src.GetInverse(&inv);

hsMatrix2SIMD(src, &fMatrixL2W);
hsMatrix2SIMD(inv, &fMatrixW2L);
// Dereferencing copies the matrix by value, so the member does not alias the
// local `inv` after this function returns.
fMatrixL2W = *hsMatrix2SIMD(src);
fMatrixW2L = *hsMatrix2SIMD(inv);
}

void plMetalDevice::CreateNewCommandBuffer(CA::MetalDrawable* drawable)
Expand Down
7 changes: 5 additions & 2 deletions Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,11 @@ class plCubicEnvironmap;
class plLayerInterface;
class plMetalPipelineState;

// NOTE: Results of this will be row major
// Copying form: writes the converted matrix into `dst` (the identity matrix when
// `src` is flagged kIsIdent, otherwise the contents of src.fMap) and returns `dst`.
matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst);
/**
 * Views the element storage of an hsMatrix44 as a SIMD 4x4 matrix without copying.
 *
 * The returned pointer aliases src.fMap: it is only valid while `src` is alive, and
 * callers that need an independent matrix must copy it by dereferencing the result.
 * No transpose is performed and src.fFlags is not consulted.
 *
 * NOTE(review): matrix_float4x4 may require stricter alignment than hsMatrix44
 * provides — confirm that dereferencing the result is safe on all target platforms.
 *
 * @param src  Matrix whose fMap storage is reinterpreted.
 * @return     Read-only pointer to src.fMap viewed as a matrix_float4x4.
 */
constexpr const matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src)
{
    // Reads through the returned pointer must stay inside fMap.
    static_assert(sizeof(hsMatrix44::fMap) >= sizeof(matrix_float4x4),
                  "hsMatrix44::fMap is too small to be viewed as a matrix_float4x4");

    // reinterpret_cast is not allowed in a constexpr function, hence the C-style
    // cast. The target type is const-qualified so constness is never dropped.
    return (const simd_float4x4*)(src.fMap);
}

class plMetalDevice
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,7 @@ void plMetalMaterialShaderRef::EncodeArguments(MTL::RenderCommandEncoder* encode

// Fills `transform` from `layer`: the layer's transform matrix converted to SIMD
// form, and the layer's UVW source.
// NOTE(review): this listing is a diff view whose +/- markers were stripped; the
// temporary `tXfm` and the two lines that use it are presumably the removed
// version, and the single dereferencing assignment its replacement — confirm
// against the real file.
void plMetalMaterialShaderRef::EncodeTransform(const plLayerInterface* layer, UVOutDescriptor* transform)
{
matrix_float4x4 tXfm;
hsMatrix2SIMD(layer->GetTransform(), &tXfm);
transform->transform = tXfm;
// The dereference copies the matrix within this statement, so the stored value
// does not alias the layer's matrix afterwards.
transform->transform = *hsMatrix2SIMD(layer->GetTransform());
transform->UVWSrc = layer->GetUVWSrc();
}

Expand Down
34 changes: 11 additions & 23 deletions Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1147,9 +1147,8 @@ void plMetalPipeline::ISetupTransforms(plDrawableSpans* drawable, const plSpan&
}

if (span.fNumMatrices == 2) {
matrix_float4x4 mat;
hsMatrix2SIMD(drawable->GetPaletteMatrix(span.fBaseMatrix + 1), &mat);
fDevice.CurrentRenderCommandEncoder()->setVertexBytes(&mat, sizeof(matrix_float4x4), VertexShaderArgumentBlendMatrix1);
const matrix_float4x4 *mat = hsMatrix2SIMD(drawable->GetPaletteMatrix(span.fBaseMatrix + 1));
fDevice.CurrentRenderCommandEncoder()->setVertexBytes(mat, sizeof(matrix_float4x4), VertexShaderArgumentBlendMatrix1);
}

fCurrentRenderPassUniforms->projectionMatrix = fDevice.fMatrixProj;
Expand Down Expand Up @@ -1315,9 +1314,8 @@ void plMetalPipeline::IRenderProjection(const plRenderPrimFunc& render, plLightI
fCurrentRenderPassUniforms->fogColor = {0.f, 0.f, 0.f};
fCurrentRenderPassUniforms->diffuseCol = {1.f, 1.f, 1.f, 1.f};

matrix_float4x4 tXfm;
hsMatrix2SIMD(proj->GetTransform(), &tXfm);
fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
const matrix_float4x4 *tXfm = hsMatrix2SIMD(proj->GetTransform());
fCurrentRenderPassUniforms->uvTransforms[0].transform = *tXfm;
fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = proj->GetUVWSrc();

fCurrNumLayers = 1;
Expand Down Expand Up @@ -3235,10 +3233,7 @@ bool plMetalPipeline::IPushShadowCastState(plShadowSlave* slave)
castLUT = castLUT * c2w;
}

simd_float4x4 tXfm;
hsMatrix2SIMD(castLUT, &tXfm);

fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[0].transform = *hsMatrix2SIMD(castLUT);
fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = plLayerInterface::kUVWPosition;

/*DWORD clearColor = 0xff000000L;
Expand Down Expand Up @@ -3877,9 +3872,7 @@ void plMetalPipeline::ISetupShadowRcvTextureStages(hsGMaterial* mat)
// Normal UVW source.
fCurrentRenderPassUniforms->uvTransforms[2].UVWSrc = uvwSrc;
// MiscFlags to layer's misc flags
matrix_float4x4 tXfm;
hsMatrix2SIMD(layer->GetTransform(), &tXfm);
fCurrentRenderPassUniforms->uvTransforms[2].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[2].transform = *hsMatrix2SIMD(layer->GetTransform());
}

fDevice.CurrentRenderCommandEncoder()->setFragmentBytes(&layerIndex, sizeof(int), FragmentShaderArgumentShadowCastAlphaSrc);
Expand Down Expand Up @@ -3947,19 +3940,16 @@ void plMetalPipeline::ISetupShadowSlaveTextures(plShadowSlave* slave)
fDevice.CurrentRenderCommandEncoder()->setFragmentBytes(&uniforms, sizeof(plMetalShadowCastFragmentShaderArgumentBuffer), FragmentShaderArgumentShadowCastUniforms);

hsMatrix44 cameraToTexture = slave->fWorldToTexture * c2w;
simd_float4x4 tXfm;
hsMatrix2SIMD(cameraToTexture, &tXfm);

fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = plLayerInterface::kUVWPosition;
fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[0].transform = *hsMatrix2SIMD(cameraToTexture);

// Stage 1: the lut
// Set the texture transform to slave's fRcvLUT
hsMatrix44 cameraToLut = slave->fRcvLUT * c2w;
hsMatrix2SIMD(cameraToLut, &tXfm);

fCurrentRenderPassUniforms->uvTransforms[1].UVWSrc = plLayerInterface::kUVWPosition;
fCurrentRenderPassUniforms->uvTransforms[1].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[1].transform = *hsMatrix2SIMD(cameraToLut);
}

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -4198,15 +4188,13 @@ void plMetalPipeline::IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette,
simd_float4 destNorm_buf = (simd_float4){0.f, 0.f, 0.f, 0.f};
simd_float4 destPt_buf = (simd_float4){0.f, 0.f, 0.f, 1.f};

simd_float4x4 simdMatrix;

// Blend
for (uint32_t j = 0; j < numWeights + 1; ++j) {
hsMatrix2SIMD(matrixPalette[indices & 0xFF], &simdMatrix);
const simd_float4x4 *simdMatrix = hsMatrix2SIMD(matrixPalette[indices & 0xFF]);
if (weights[j]) {
// Note: This bit is different than GL/DirectX. It's using acclerate so this is also accelerated on ARM through NEON or maybe even the Neural Engine.
destPt_buf += simd_mul(*(simd_float4*)pt_buf, simdMatrix) * weights[j];
destNorm_buf += simd_mul(*(simd_float4*)vec_buf, simdMatrix) * weights[j];
destPt_buf += simd_mul(*(simd_float4*)pt_buf, *simdMatrix) * weights[j];
destNorm_buf += simd_mul(*(simd_float4*)vec_buf, *simdMatrix) * weights[j];
}
// ISkinVertexSSE41(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf);
indices >>= 8;
Expand Down

0 comments on commit 14d89ad

Please sign in to comment.