Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove copy from hsMatrix2SIMD in Metal renderer #1647

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 5 additions & 17 deletions Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,6 @@ static inline uint8_t* inlStuff(uint8_t* dst, const T* val)
return reinterpret_cast<uint8_t*>(ptr);
}

// Copies src into *dst as a matrix_float4x4 and returns dst.
// A source flagged hsMatrix44::kIsIdent yields matrix_identity_float4x4
// rather than the contents of src.fMap.
matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst)
{
    const bool flaggedIdentity = (src.fFlags & hsMatrix44::kIsIdent) != 0;

    // Choose which bytes to copy, then perform a single fixed-size copy.
    const void* from = flaggedIdentity
        ? static_cast<const void*>(&matrix_identity_float4x4)
        : static_cast<const void*>(&src.fMap);
    memcpy(dst, from, sizeof(matrix_float4x4));

    return dst;
}

bool plMetalDevice::InitDevice()
{
// FIXME: Should Metal adopt InitDevice like OGL?
Expand Down Expand Up @@ -972,25 +960,25 @@ void plMetalDevice::MakeCubicTextureRef(plMetalDevice::TextureRef* tRef, plCubic

// Stores src in fMatrixProj after converting it with hsMatrix2SIMD.
void plMetalDevice::SetProjectionMatrix(const hsMatrix44& src)
{
// NOTE(review): both the out-parameter form and the pointer-returning form of
// hsMatrix2SIMD appear below; this looks like the before/after lines of a diff
// shown together -- confirm which one is meant to remain.
hsMatrix2SIMD(src, &fMatrixProj);
fMatrixProj = *hsMatrix2SIMD(src);
}

// Stores src in fMatrixW2C and its inverse (from src.GetInverse) in
// fMatrixC2W, each converted with hsMatrix2SIMD.
void plMetalDevice::SetWorldToCameraMatrix(const hsMatrix44& src)
{
hsMatrix44 inv;
src.GetInverse(&inv);

// NOTE(review): each destination is written twice below, once per form of
// hsMatrix2SIMD; this looks like the before/after lines of a diff shown
// together -- confirm which pair is meant to remain.
hsMatrix2SIMD(src, &fMatrixW2C);
hsMatrix2SIMD(inv, &fMatrixC2W);
fMatrixW2C = *hsMatrix2SIMD(src);
fMatrixC2W = *hsMatrix2SIMD(inv);
}

// Stores src in fMatrixL2W and its inverse (from src.GetInverse) in
// fMatrixW2L, each converted with hsMatrix2SIMD.
void plMetalDevice::SetLocalToWorldMatrix(const hsMatrix44& src)
{
hsMatrix44 inv;
src.GetInverse(&inv);

// NOTE(review): each destination is written twice below, once per form of
// hsMatrix2SIMD; this looks like the before/after lines of a diff shown
// together -- confirm which pair is meant to remain.
hsMatrix2SIMD(src, &fMatrixL2W);
hsMatrix2SIMD(inv, &fMatrixW2L);
fMatrixL2W = *hsMatrix2SIMD(src);
fMatrixW2L = *hsMatrix2SIMD(inv);
}

void plMetalDevice::CreateNewCommandBuffer(CA::MetalDrawable* drawable)
Expand Down
7 changes: 5 additions & 2 deletions Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,11 @@ class plCubicEnvironmap;
class plLayerInterface;
class plMetalPipelineState;

// NOTE: Results of this will be row major
matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst);
// Views an hsMatrix44 as a SIMD 4x4 matrix without copying.
//
// Returns a pointer to matrix_identity_float4x4 when src is flagged
// hsMatrix44::kIsIdent, otherwise a pointer aliasing src.fMap. In the latter
// case the result must not be used after src is destroyed.
//
// Deliberately not constexpr: a reinterpret_cast can never appear in a
// constant expression, so nothing here can be evaluated at compile time.
// NOTE(review): assumes src.fMap has the same size and layout as
// matrix_float4x4 and satisfies its alignment requirement -- confirm against
// the hsMatrix44 declaration before dereferencing the result.
inline const matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src)
{
    if (src.fFlags & hsMatrix44::kIsIdent) {
        return &matrix_identity_float4x4;
    }
    return reinterpret_cast<const matrix_float4x4*>(src.fMap);
}
Comment on lines +68 to +72
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably shouldn't be constexpr because there's not really anything that can be done at compile time. Instead, it should be either static inline or just a macro and use reinterpret_cast.


class plMetalDevice
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,7 @@ void plMetalMaterialShaderRef::EncodeArguments(MTL::RenderCommandEncoder* encode

// Fills *transform from a layer: transform->transform receives
// layer->GetTransform() converted with hsMatrix2SIMD, and transform->UVWSrc
// receives layer->GetUVWSrc().
void plMetalMaterialShaderRef::EncodeTransform(const plLayerInterface* layer, UVOutDescriptor* transform)
{
// NOTE(review): transform->transform is assigned twice below, once via a local
// copy and once directly; this looks like the before/after lines of a diff
// shown together -- confirm which one is meant to remain.
matrix_float4x4 tXfm;
hsMatrix2SIMD(layer->GetTransform(), &tXfm);
transform->transform = tXfm;
transform->transform = *hsMatrix2SIMD(layer->GetTransform());
transform->UVWSrc = layer->GetUVWSrc();
}

Expand Down
34 changes: 11 additions & 23 deletions Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1147,9 +1147,8 @@ void plMetalPipeline::ISetupTransforms(plDrawableSpans* drawable, const plSpan&
}

if (span.fNumMatrices == 2) {
matrix_float4x4 mat;
hsMatrix2SIMD(drawable->GetPaletteMatrix(span.fBaseMatrix + 1), &mat);
fDevice.CurrentRenderCommandEncoder()->setVertexBytes(&mat, sizeof(matrix_float4x4), VertexShaderArgumentBlendMatrix1);
const matrix_float4x4 *mat = hsMatrix2SIMD(drawable->GetPaletteMatrix(span.fBaseMatrix + 1));
fDevice.CurrentRenderCommandEncoder()->setVertexBytes(mat, sizeof(matrix_float4x4), VertexShaderArgumentBlendMatrix1);
}

fCurrentRenderPassUniforms->projectionMatrix = fDevice.fMatrixProj;
Expand Down Expand Up @@ -1315,9 +1314,8 @@ void plMetalPipeline::IRenderProjection(const plRenderPrimFunc& render, plLightI
fCurrentRenderPassUniforms->fogColor = {0.f, 0.f, 0.f};
fCurrentRenderPassUniforms->diffuseCol = {1.f, 1.f, 1.f, 1.f};

matrix_float4x4 tXfm;
hsMatrix2SIMD(proj->GetTransform(), &tXfm);
fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
const matrix_float4x4 *tXfm = hsMatrix2SIMD(proj->GetTransform());
fCurrentRenderPassUniforms->uvTransforms[0].transform = *tXfm;
fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = proj->GetUVWSrc();

fCurrNumLayers = 1;
Expand Down Expand Up @@ -3235,10 +3233,7 @@ bool plMetalPipeline::IPushShadowCastState(plShadowSlave* slave)
castLUT = castLUT * c2w;
}

simd_float4x4 tXfm;
hsMatrix2SIMD(castLUT, &tXfm);

fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[0].transform = *hsMatrix2SIMD(castLUT);
fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = plLayerInterface::kUVWPosition;

/*DWORD clearColor = 0xff000000L;
Expand Down Expand Up @@ -3877,9 +3872,7 @@ void plMetalPipeline::ISetupShadowRcvTextureStages(hsGMaterial* mat)
// Normal UVW source.
fCurrentRenderPassUniforms->uvTransforms[2].UVWSrc = uvwSrc;
// MiscFlags to layer's misc flags
matrix_float4x4 tXfm;
hsMatrix2SIMD(layer->GetTransform(), &tXfm);
fCurrentRenderPassUniforms->uvTransforms[2].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[2].transform = *hsMatrix2SIMD(layer->GetTransform());
}

fDevice.CurrentRenderCommandEncoder()->setFragmentBytes(&layerIndex, sizeof(int), FragmentShaderArgumentShadowCastAlphaSrc);
Expand Down Expand Up @@ -3947,19 +3940,16 @@ void plMetalPipeline::ISetupShadowSlaveTextures(plShadowSlave* slave)
fDevice.CurrentRenderCommandEncoder()->setFragmentBytes(&uniforms, sizeof(plMetalShadowCastFragmentShaderArgumentBuffer), FragmentShaderArgumentShadowCastUniforms);

hsMatrix44 cameraToTexture = slave->fWorldToTexture * c2w;
simd_float4x4 tXfm;
hsMatrix2SIMD(cameraToTexture, &tXfm);

fCurrentRenderPassUniforms->uvTransforms[0].UVWSrc = plLayerInterface::kUVWPosition;
fCurrentRenderPassUniforms->uvTransforms[0].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[0].transform = *hsMatrix2SIMD(cameraToTexture);

// Stage 1: the lut
// Set the texture transform to slave's fRcvLUT
hsMatrix44 cameraToLut = slave->fRcvLUT * c2w;
hsMatrix2SIMD(cameraToLut, &tXfm);

fCurrentRenderPassUniforms->uvTransforms[1].UVWSrc = plLayerInterface::kUVWPosition;
fCurrentRenderPassUniforms->uvTransforms[1].transform = tXfm;
fCurrentRenderPassUniforms->uvTransforms[1].transform = *hsMatrix2SIMD(cameraToLut);
}

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -4198,15 +4188,13 @@ void plMetalPipeline::IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette,
simd_float4 destNorm_buf = (simd_float4){0.f, 0.f, 0.f, 0.f};
simd_float4 destPt_buf = (simd_float4){0.f, 0.f, 0.f, 1.f};

simd_float4x4 simdMatrix;

// Blend
for (uint32_t j = 0; j < numWeights + 1; ++j) {
hsMatrix2SIMD(matrixPalette[indices & 0xFF], &simdMatrix);
const simd_float4x4 *simdMatrix = hsMatrix2SIMD(matrixPalette[indices & 0xFF]);
if (weights[j]) {
// Note: This bit is different than GL/DirectX. It's using Accelerate so this is also accelerated on ARM through NEON or maybe even the Neural Engine.
destPt_buf += simd_mul(*(simd_float4*)pt_buf, simdMatrix) * weights[j];
destNorm_buf += simd_mul(*(simd_float4*)vec_buf, simdMatrix) * weights[j];
destPt_buf += simd_mul(*(simd_float4*)pt_buf, *simdMatrix) * weights[j];
destNorm_buf += simd_mul(*(simd_float4*)vec_buf, *simdMatrix) * weights[j];
}
// ISkinVertexSSE41(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf);
indices >>= 8;
Expand Down
Loading