Remove copy from hsMatrix2SIMD in Metal renderer

hsMatrix2SIMD used to transpose the input hsMatrix44, but the shaders and calling code were rewritten to make that unnecessary, leaving behind only a vestigial memory copy. This commit removes that copy. This changes the meaning of hsMatrix2SIMD: instead of filling a caller-supplied destination, it now returns a pointer to the source matrix's data via a direct cast. Functions that need their own copy now make one manually by dereferencing the result; otherwise the cast pointer is used directly. hsMatrix2SIMD has also been made constexpr. This should result in a small (~1%) performance improvement in the Metal renderer main loop.
H-uru · Jan 1, 2025 · 14d89ad · 14d89ad
1 parent c4cbb98
commit 14d89ad
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 45 deletions.
diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.cpp b/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.cpp
@@ -124,18 +124,6 @@ static inline uint8_t* inlStuff(uint8_t* dst, const T* val)
     return reinterpret_cast<uint8_t*>(ptr);
 }
 
-matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst)
-{
-    constexpr auto matrixSize = sizeof(matrix_float4x4);
-    if (src.fFlags & hsMatrix44::kIsIdent) {
-        memcpy(dst, &matrix_identity_float4x4, matrixSize);
-    } else {
-        memcpy(dst, &src.fMap, matrixSize);
-    }
-
-    return dst;
-}
-
 bool plMetalDevice::InitDevice()
 {
     // FIXME: Should Metal adopt InitDevice like OGL?
@@ -972,25 +960,25 @@ void plMetalDevice::MakeCubicTextureRef(plMetalDevice::TextureRef* tRef, plCubic
 
 void plMetalDevice::SetProjectionMatrix(const hsMatrix44& src)
 {
-    hsMatrix2SIMD(src, &fMatrixProj);
+    fMatrixProj = *hsMatrix2SIMD(src);
 }
 
 void plMetalDevice::SetWorldToCameraMatrix(const hsMatrix44& src)
 {
     hsMatrix44 inv;
     src.GetInverse(&inv);
 
-    hsMatrix2SIMD(src, &fMatrixW2C);
-    hsMatrix2SIMD(inv, &fMatrixC2W);
+    fMatrixW2C = *hsMatrix2SIMD(src);
+    fMatrixC2W = *hsMatrix2SIMD(inv);
 }
 
 void plMetalDevice::SetLocalToWorldMatrix(const hsMatrix44& src)
 {
     hsMatrix44 inv;
     src.GetInverse(&inv);
 
-    hsMatrix2SIMD(src, &fMatrixL2W);
-    hsMatrix2SIMD(inv, &fMatrixW2L);
+    fMatrixL2W = *hsMatrix2SIMD(src);
+    fMatrixW2L = *hsMatrix2SIMD(inv);
 }
 
 void plMetalDevice::CreateNewCommandBuffer(CA::MetalDrawable* drawable)

diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.h b/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.h
@@ -65,8 +65,11 @@ class plCubicEnvironmap;
 class plLayerInterface;
 class plMetalPipelineState;
 
-// NOTE: Results of this will be row major
-matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst);
+constexpr const matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src)
+{
+    // reinterpret_cast is not allowed in constexpr
+    return (simd_float4x4*)(src.fMap);
+}
 
 class plMetalDevice
 {

diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalMaterialShaderRef.cpp b/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalMaterialShaderRef.cpp
@@ -204,9 +204,7 @@ void plMetalMaterialShaderRef::EncodeArguments(MTL::RenderCommandEncoder* encode
 
 void plMetalMaterialShaderRef::EncodeTransform(const plLayerInterface* layer, UVOutDescriptor* transform)
 {
-    matrix_float4x4 tXfm;
-    hsMatrix2SIMD(layer->GetTransform(), &tXfm);
-    transform->transform = tXfm;
+    transform->transform = *hsMatrix2SIMD(layer->GetTransform());
     transform->UVWSrc = layer->GetUVWSrc();
 }
 

diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalPipeline.cpp b/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalPipeline.cpp
@@ -1147,9 +1147,8 @@ void plMetalPipeline::ISetupTransforms(plDrawableSpans* drawable, const plSpan&
     }
 
     if (span.fNumMatrices == 2) {
-        matrix_float4x4 mat;
-        hsMatrix2SIMD(drawable->GetPaletteMatrix(span.fBaseMatrix + 1), &mat);
-        fDevice.CurrentRenderCommandEncoder()->setVertexBytes(&mat, sizeof(matrix_float4x4), VertexShaderArgumentBlendMatrix1);
+        const matrix_float4x4 *mat = hsMatrix2SIMD(drawable->GetPaletteMatrix(span.fBaseMatrix + 1));
+        fDevice.CurrentRenderCommandEncoder()->setVertexBytes(mat, sizeof(matrix_float4x4), VertexShaderArgumentBlendMatrix1);
     }
 
     fCurrentRenderPassUniforms->projectionMatrix = fDevice.fMatrixProj;
@@ -1315,9 +1314,8 @@ void plMetalPipeline::IRenderProjection(const plRenderPrimFunc& render, plLightI
     fCurrentRenderPassUniforms->fogColor = {0.f, 0.f, 0.f};
     fCurrentRenderPassUniforms->diffuseCol = {1.f, 1.f, 1.f, 1.f};
 
-    matrix_float4x4 tXfm;
-    hsMatrix2SIMD(proj->GetTransform(), &tXfm);
-    fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
+    const matrix_float4x4 *tXfm = hsMatrix2SIMD(proj->GetTransform());
+    fCurrentRenderPassUniforms->uvTransforms[0].transform = *tXfm;
     fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = proj->GetUVWSrc();
 
     fCurrNumLayers = 1;
@@ -3235,10 +3233,7 @@ bool plMetalPipeline::IPushShadowCastState(plShadowSlave* slave)
         castLUT = castLUT * c2w;
     }
 
-    simd_float4x4 tXfm;
-    hsMatrix2SIMD(castLUT, &tXfm);
-
-    fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
+    fCurrentRenderPassUniforms->uvTransforms[0].transform = *hsMatrix2SIMD(castLUT);
     fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = plLayerInterface::kUVWPosition;
 
     /*DWORD clearColor = 0xff000000L;
@@ -3877,9 +3872,7 @@ void plMetalPipeline::ISetupShadowRcvTextureStages(hsGMaterial* mat)
         // Normal UVW source.
         fCurrentRenderPassUniforms->uvTransforms[2].UVWSrc = uvwSrc;
         // MiscFlags to layer's misc flags
-        matrix_float4x4 tXfm;
-        hsMatrix2SIMD(layer->GetTransform(), &tXfm);
-        fCurrentRenderPassUniforms->uvTransforms[2].transform = tXfm;
+        fCurrentRenderPassUniforms->uvTransforms[2].transform = *hsMatrix2SIMD(layer->GetTransform());
     }
 
     fDevice.CurrentRenderCommandEncoder()->setFragmentBytes(&layerIndex, sizeof(int), FragmentShaderArgumentShadowCastAlphaSrc);
@@ -3947,19 +3940,16 @@ void plMetalPipeline::ISetupShadowSlaveTextures(plShadowSlave* slave)
     fDevice.CurrentRenderCommandEncoder()->setFragmentBytes(&uniforms, sizeof(plMetalShadowCastFragmentShaderArgumentBuffer), FragmentShaderArgumentShadowCastUniforms);
 
     hsMatrix44    cameraToTexture = slave->fWorldToTexture * c2w;
-    simd_float4x4 tXfm;
-    hsMatrix2SIMD(cameraToTexture, &tXfm);
 
     fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = plLayerInterface::kUVWPosition;
-    fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
+    fCurrentRenderPassUniforms->uvTransforms[0].transform = *hsMatrix2SIMD(cameraToTexture);
 
     // Stage 1: the lut
     // Set the texture transform to slave's fRcvLUT
     hsMatrix44 cameraToLut = slave->fRcvLUT * c2w;
-    hsMatrix2SIMD(cameraToLut, &tXfm);
 
     fCurrentRenderPassUniforms->uvTransforms[1].UVWSrc = plLayerInterface::kUVWPosition;
-    fCurrentRenderPassUniforms->uvTransforms[1].transform = tXfm;
+    fCurrentRenderPassUniforms->uvTransforms[1].transform = *hsMatrix2SIMD(cameraToLut);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4198,15 +4188,13 @@ void plMetalPipeline::IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette,
         simd_float4 destNorm_buf = (simd_float4){0.f, 0.f, 0.f, 0.f};
         simd_float4 destPt_buf = (simd_float4){0.f, 0.f, 0.f, 1.f};
 
-        simd_float4x4 simdMatrix;
-
         // Blend
         for (uint32_t j = 0; j < numWeights + 1; ++j) {
-            hsMatrix2SIMD(matrixPalette[indices & 0xFF], &simdMatrix);
+            const simd_float4x4 *simdMatrix = hsMatrix2SIMD(matrixPalette[indices & 0xFF]);
             if (weights[j]) {
                 // Note: This bit is different than GL/DirectX. It's using acclerate so this is also accelerated on ARM through NEON or maybe even the Neural Engine.
-                destPt_buf += simd_mul(*(simd_float4*)pt_buf, simdMatrix) * weights[j];
-                destNorm_buf += simd_mul(*(simd_float4*)vec_buf, simdMatrix) * weights[j];
+                destPt_buf += simd_mul(*(simd_float4*)pt_buf, *simdMatrix) * weights[j];
+                destNorm_buf += simd_mul(*(simd_float4*)vec_buf, *simdMatrix) * weights[j];
             }
             // ISkinVertexSSE41(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf);
             indices >>= 8;