Devsh-Graphics-Programming · devshgraphicsprogramming · Sep 18, 2025 · Oct 4, 2025 · Oct 4, 2025 · Oct 4, 2025
diff --git a/include/nbl/asset/utils/CPolygonGeometryManipulator.h b/include/nbl/asset/utils/CPolygonGeometryManipulator.h
@@ -9,6 +9,7 @@
 
 #include "nbl/asset/ICPUPolygonGeometry.h"
 #include "nbl/asset/utils/CGeometryManipulator.h"
+#include "nbl/asset/utils/CSmoothNormalGenerator.h"
 #include "nbl/asset/utils/CVertexHashGrid.h"
 
 namespace nbl::asset
@@ -19,33 +20,6 @@ class NBL_API2 CPolygonGeometryManipulator
 {
 	public:
 
-    struct SSNGVertexData
-    {
-      uint64_t index;									     //offset of the vertex into index buffer
-			uint32_t hash;
-      hlsl::float32_t3 weightedNormal;
-			// TODO(kevinyu): Should we separate this from SSNGVertexData, and store it in its own vector in VertexHashGrid? Similar like how hashmap work. Or keep it intrusive?
-      hlsl::float32_t3 position;							   //position of the vertex in 3D space
-
-			hlsl::float32_t3 getPosition() const
-			{
-				return position;
-			}
-
-			void setHash(uint32_t hash)
-			{
-				this->hash = hash;
-			}
-
-			uint32_t getHash() const
-			{
-				return hash;
-			};
-
-    };
-
-		using VxCmpFunction = std::function<bool(const SSNGVertexData&, const SSNGVertexData&, const ICPUPolygonGeometry*)>;
-
 		static inline void recomputeContentHashes(ICPUPolygonGeometry* geo)
 		{
 			if (!geo)
@@ -260,8 +234,9 @@ class NBL_API2 CPolygonGeometryManipulator
 
 		static core::smart_refctd_ptr<ICPUPolygonGeometry> createUnweldedList(const ICPUPolygonGeometry* inGeo);
 
+		using SSNGVxCmpFunction = CSmoothNormalGenerator::VxCmpFunction;
 		static core::smart_refctd_ptr<ICPUPolygonGeometry> createSmoothVertexNormal(const ICPUPolygonGeometry* inbuffer, bool enableWelding = false, float epsilon = 1.525e-5f,
-				VxCmpFunction vxcmp = [](const CPolygonGeometryManipulator::SSNGVertexData& v0, const CPolygonGeometryManipulator::SSNGVertexData& v1, const ICPUPolygonGeometry* buffer) 
+				SSNGVxCmpFunction vxcmp = [](const CSmoothNormalGenerator::SSNGVertexData& v0, const CSmoothNormalGenerator::SSNGVertexData& v1, const ICPUPolygonGeometry* buffer) 
 				{ 
 					static constexpr float cosOf45Deg = 0.70710678118f;
 					return dot(normalize(v0.weightedNormal),normalize(v1.weightedNormal)) > cosOf45Deg;

diff --git a/include/nbl/asset/utils/CVertexHashGrid.h b/include/nbl/asset/utils/CVertexHashGrid.h
@@ -13,6 +13,13 @@ concept HashGridVertexData = requires(T obj, T const cobj, uint32_t hash) {
 		{ cobj.getPosition() } -> std::same_as<hlsl::float32_t3>;
 };
 
+template <typename Fn, typename T>
+concept HashGridIteratorFn = HashGridVertexData<T> && requires(Fn && fn, T const cobj)
+{
+	// return whether hash grid should continue the iteration
+	{ std::invoke(std::forward<Fn>(fn), cobj) } -> std::same_as<bool>;
+};
+
 template <HashGridVertexData VertexData>
 class CVertexHashGrid
 {
@@ -26,43 +33,40 @@ class CVertexHashGrid
 		collection_t::const_iterator end;
 	};
 
-	CVertexHashGrid(size_t _vertexCount, uint32_t _hashTableMaxSize, float _cellSize) :
-		m_sorter(createSorter(_vertexCount)),
-		m_hashTableMaxSize(_hashTableMaxSize),
-		m_cellSize(_cellSize)
+	CVertexHashGrid(size_t cellSize, uint32_t hashTableMaxSizeLog2, float vertexCountReserve) :
+		m_cellSize(cellSize),
+		m_hashTableMaxSize(1llu << hashTableMaxSizeLog2),
+		m_sorter(createSorter(vertexCountReserve))
 	{
-		assert((core::isPoT(m_hashTableMaxSize)));
-
-		m_vertices.reserve(_vertexCount);
+		m_vertices.reserve(vertexCountReserve);
 	}
 
 	//inserts vertex into hash table
 	void add(VertexData&& vertex)
 	{
 		vertex.setHash(hash(vertex));
-		m_vertices.push_back(vertex);
+		m_vertices.push_back(std::move(vertex));
 	}
 
-	void validate()
+	void bake()
 	{
-		const auto oldSize = m_vertices.size();
-		m_vertices.resize(oldSize*2u);
-		auto finalSortedOutput = std::visit( [&](auto& sorter) { return sorter(m_vertices.data(), m_vertices.data() + oldSize, oldSize, KeyAccessor()); },m_sorter );
+		auto scratchBuffer = collection_t(m_vertices.size());
+
+		auto finalSortedOutput = std::visit( [&](auto& sorter)
+		{
+		  return sorter(m_vertices.data(), scratchBuffer.data(), m_vertices.size(), KeyAccessor());
+		}, m_sorter );
 
 		if (finalSortedOutput != m_vertices.data())
-			m_vertices.erase(m_vertices.begin(), m_vertices.begin() + oldSize);
-		else
-			m_vertices.resize(oldSize);
+			m_vertices = std::move(scratchBuffer);
 	}
 
 	const collection_t& vertices() const { return m_vertices; }
 
-	collection_t& vertices(){ return m_vertices; }
-
 	inline uint32_t getVertexCount() const { return m_vertices.size(); }
 
-	template <typename Fn>
-	void iterateBroadphaseCandidates(const VertexData& vertex, Fn fn) const
+	template <HashGridIteratorFn<VertexData> Fn>
+	void forEachBroadphaseNeighborCandidates(const VertexData& vertex, Fn&& fn) const
 	{
 		std::array<uint32_t, 8> neighboringCells;
 		const auto cellCount = getNeighboringCellHashes(neighboringCells.data(), vertex);
@@ -76,7 +80,7 @@ class CVertexHashGrid
 			{
 				const vertex_data_t& neighborVertex = *bounds.begin;
 				if (&vertex != &neighborVertex)
-					if (!fn(neighborVertex)) break;
+					if (!std::invoke(std::forward<Fn>(fn), neighborVertex)) break;
 			}
 		}
 
@@ -85,7 +89,7 @@ class CVertexHashGrid
 private:
 	struct KeyAccessor
 	{
-		_NBL_STATIC_INLINE_CONSTEXPR size_t key_bit_count = 32ull;
+		constexpr static size_t key_bit_count = 32ull;
 
 		template<auto bit_offset, auto radix_mask>
 		inline decltype(radix_mask) operator()(const VertexData& item) const
@@ -98,21 +102,19 @@ class CVertexHashGrid
 	static constexpr uint32_t primeNumber2 = 19349663;
 	static constexpr uint32_t primeNumber3 = 83492791;
 
-	static constexpr uint32_t invalidHash = 0xFFFFFFFF;
-
 	using sorter_t = std::variant<
-		core::LSBSorter<KeyAccessor::key_bit_count, uint16_t>,
-		core::LSBSorter<KeyAccessor::key_bit_count, uint32_t>,
-		core::LSBSorter<KeyAccessor::key_bit_count, size_t>>;
+		core::RadixLsbSorter<KeyAccessor::key_bit_count, uint16_t>,
+		core::RadixLsbSorter<KeyAccessor::key_bit_count, uint32_t>,
+		core::RadixLsbSorter<KeyAccessor::key_bit_count, size_t>>;
 	sorter_t m_sorter;
 
 	static sorter_t createSorter(size_t vertexCount)
 	{
 		if (vertexCount < (0x1ull << 16ull))
-			return core::LSBSorter<KeyAccessor::key_bit_count, uint16_t>();
+			return core::RadixLsbSorter<KeyAccessor::key_bit_count, uint16_t>();
 		if (vertexCount < (0x1ull << 32ull))
-			return core::LSBSorter<KeyAccessor::key_bit_count, uint32_t>();
-		return core::LSBSorter<KeyAccessor::key_bit_count, size_t>();
+			return core::RadixLsbSorter<KeyAccessor::key_bit_count, uint32_t>();
+		return core::RadixLsbSorter<KeyAccessor::key_bit_count, size_t>();
 	}
 
 	collection_t m_vertices;
@@ -122,10 +124,8 @@ class CVertexHashGrid
 	uint32_t hash(const VertexData& vertex) const
 	{
 		const hlsl::float32_t3 position = floor(vertex.getPosition() / m_cellSize);
-
-		return	((static_cast<uint32_t>(position.x) * primeNumber1) ^
-			(static_cast<uint32_t>(position.y) * primeNumber2) ^
-			(static_cast<uint32_t>(position.z) * primeNumber3))& (m_hashTableMaxSize - 1);
+		const auto position_uint32 = hlsl::uint32_t3(position.x, position.y, position.z);
+		return hash(position_uint32);
 	}
 
 	uint32_t hash(const hlsl::uint32_t3& position) const
@@ -137,6 +137,7 @@ class CVertexHashGrid
 
 	uint8_t getNeighboringCellHashes(uint32_t* outNeighbors, const VertexData& vertex) const
 	{
+		// both 0.x and -0.x would be converted to 0 if we directly casting the position to unsigned integer. Causing the 0 to be crowded then the rest of the cells. So we use floor here to spread the vertex more uniformly.
 		hlsl::float32_t3 cellfloatcoord = floor(vertex.getPosition() / m_cellSize - hlsl::float32_t3(0.5));
 		hlsl::uint32_t3 baseCoord = hlsl::uint32_t3(static_cast<uint32_t>(cellfloatcoord.x), static_cast<uint32_t>(cellfloatcoord.y), static_cast<uint32_t>(cellfloatcoord.z));
 
@@ -167,12 +168,9 @@ class CVertexHashGrid
 
 	BucketBounds getBucketBoundsByHash(uint32_t hash) const
 	{
-		if (hash == invalidHash)
-			return { m_vertices.end(), m_vertices.end() };
-
 		const auto skipListBound = std::visit([&](auto& sorter)
 		{
-			auto hashBound = sorter.getHashBound(hash);
+			auto hashBound = sorter.getMostSignificantRadixBound(hash);
 			return std::pair<collection_t::const_iterator, collection_t::const_iterator>(m_vertices.begin() + hashBound.first, m_vertices.begin() + hashBound.second);
 		}, m_sorter);
 
@@ -182,7 +180,7 @@ class CVertexHashGrid
 			hash,
 			[](const VertexData& vertex, uint32_t hash)
 			{
-				return vertex.hash < hash;
+				return vertex.getHash() < hash;
 			});
 
 		auto end = std::upper_bound(
@@ -191,7 +189,7 @@ class CVertexHashGrid
 			hash, 
 			[](uint32_t hash, const VertexData& vertex)
 			{
-				return hash < vertex.hash;
+				return hash < vertex.getHash();
 			});
 
 		const auto beginIx = begin - m_vertices.begin();

diff --git a/include/nbl/asset/utils/CVertexWelder.h b/include/nbl/asset/utils/CVertexWelder.h
@@ -11,33 +11,137 @@ namespace nbl::asset {
 class CVertexWelder {
 
 public:
-  using WeldPredicateFn = std::function<bool(const ICPUPolygonGeometry* geom, uint64_t idx1, uint64_t idx2)>;
+  using WeldPredicateFn = std::function<bool(const ICPUPolygonGeometry* geom, uint32_t idx1, uint32_t idx2)>;
+
+  class DefaultWeldPredicate
+  {
+    private:
+      static bool isAttributeValEqual(const ICPUPolygonGeometry::SDataView& view, uint32_t index1, uint32_t index2, float epsilon)
+      {
+        if (!view) return true;
+        const auto channelCount = getFormatChannelCount(view.composed.format);
+        switch (view.composed.rangeFormat)
+        {
+          case IGeometryBase::EAABBFormat::U64:
+          case IGeometryBase::EAABBFormat::U32:
+          {
+            hlsl::uint64_t4 val1, val2;
+            view.decodeElement<hlsl::uint64_t4>(index1, val1);
+            view.decodeElement<hlsl::uint64_t4>(index2, val2);
+            for (auto channel_i = 0u; channel_i < channelCount; channel_i++)
+              if (val1[channel_i] != val2[channel_i]) return false;
+            break;
+          }
+          case IGeometryBase::EAABBFormat::S64:
+          case IGeometryBase::EAABBFormat::S32:
+          {
+            hlsl::int64_t4 val1, val2;
+            view.decodeElement<hlsl::int64_t4>(index1, val1);
+            view.decodeElement<hlsl::int64_t4>(index2, val2);
+            for (auto channel_i = 0u; channel_i < channelCount; channel_i++)
+              if (val1[channel_i] != val2[channel_i]) return false;
+            break;
+          }
+          default:
+          {
+            // TODO: Handle 16,32,64 bit float vectors once the pixel encode/decode functions get reimplemented in HLSL and decodeElement can actually benefit from that.
+            hlsl::float64_t4 val1, val2;
+            view.decodeElement<hlsl::float64_t4>(index1, val1);
+            view.decodeElement<hlsl::float64_t4>(index2, val2);
+            for (auto channel_i = 0u; channel_i < channelCount; channel_i++)
+            {
+              const auto diff = abs(val1[channel_i] - val2[channel_i]);
+              if (diff > epsilon) return false;
+            }
+            break;
+          }
+        }
+        return true;
+      }
+
+      static bool isAttributeDirEqual(const ICPUPolygonGeometry::SDataView& view, uint32_t index1, uint32_t index2, float epsilon)
+      {
+        if (!view) return true;
+        const auto channelCount = getFormatChannelCount(view.composed.format);
+        switch (view.composed.rangeFormat)
+        {
+          case IGeometryBase::EAABBFormat::U64:
+          case IGeometryBase::EAABBFormat::U32:
+          {
+            hlsl::uint64_t4 val1, val2;
+            view.decodeElement<hlsl::uint64_t4>(index1, val1);
+            view.decodeElement<hlsl::uint64_t4>(index2, val2);
+            return (1.0 - hlsl::dot(val1, val2)) < epsilon;
+          }
+          case IGeometryBase::EAABBFormat::S64:
+          case IGeometryBase::EAABBFormat::S32:
+          {
+            hlsl::int64_t4 val1, val2;
+            view.decodeElement<hlsl::int64_t4>(index1, val1);
+            view.decodeElement<hlsl::int64_t4>(index2, val2);
+            return (1.0 - hlsl::dot(val1, val2)) < epsilon;
+          }
+          default:
+          {
+            hlsl::float64_t4 val1, val2;
+            view.decodeElement<hlsl::float64_t4>(index1, val1);
+            view.decodeElement<hlsl::float64_t4>(index2, val2);
+            return (1.0 - hlsl::dot(val1, val2)) < epsilon;
+          }
+        }
+        return true;
+      }
+
+      float m_epsilon;
+
+    public:
+
+      DefaultWeldPredicate(float epsilon) : m_epsilon(epsilon) {}
+
+      bool operator()(const ICPUPolygonGeometry* polygon, uint32_t index1, uint32_t index2)
+      {
+        if (!isAttributeValEqual(polygon->getPositionView(), index1, index2, m_epsilon))
+          return false;
+        if (!isAttributeDirEqual(polygon->getNormalView(), index1, index2, m_epsilon))
+          return false;
+        for (const auto& jointWeightView : polygon->getJointWeightViews())
+        {
+          if (!isAttributeValEqual(jointWeightView.indices, index1, index2, m_epsilon)) return false;
+          if (!isAttributeValEqual(jointWeightView.weights, index1, index2, m_epsilon)) return false;
+        }
+        for (const auto& auxAttributeView : polygon->getAuxAttributeViews())
+          if (!isAttributeValEqual(auxAttributeView, index1, index2, m_epsilon)) return false;
+
+        return true;
+      }
+
+  };
 
   template <typename AccelStructureT>
   static core::smart_refctd_ptr<ICPUPolygonGeometry> weldVertices(const ICPUPolygonGeometry* polygon, const AccelStructureT& as, WeldPredicateFn shouldWeldFn) {
     auto outPolygon = core::move_and_static_cast<ICPUPolygonGeometry>(polygon->clone(0u));
     outPolygon->setIndexing(IPolygonGeometryBase::TriangleList());
 
-    core::vector<uint64_t> vertexIndexToAsIndex(as.getVertexCount());
+    core::vector<uint32_t> vertexIndexToAsIndex(as.getVertexCount());
 
-    for (uint64_t vertexData_i = 0u; vertexData_i < as.getVertexCount(); vertexData_i++)
+    for (uint32_t vertexData_i = 0u; vertexData_i < as.getVertexCount(); vertexData_i++)
     {
       const auto& vertexData = as.vertices()[vertexData_i];
       vertexIndexToAsIndex[vertexData.index] = vertexData.index;
     }
 
-    static constexpr auto INVALID_INDEX = std::numeric_limits<uint64_t>::max();
-    core::vector<uint64_t> remappedVertexIndexes(as.getVertexCount());
+    static constexpr auto INVALID_INDEX = std::numeric_limits<uint32_t>::max();
+    core::vector<uint32_t> remappedVertexIndexes(as.getVertexCount());
     std::fill(remappedVertexIndexes.begin(), remappedVertexIndexes.end(), INVALID_INDEX);
 
-    uint64_t maxRemappedIndex = 0;
+    uint32_t maxRemappedIndex = 0;
     // iterate by index, so that we always use the smallest index when multiple vertexes can be welded together
-    for (uint64_t index = 0; index < as.getVertexCount(); index++)
+    for (uint32_t index = 0; index < as.getVertexCount(); index++)
     {
       const auto asIndex = vertexIndexToAsIndex[index];
       const auto& vertexData = as.vertices()[asIndex];
       auto& remappedVertexIndex = remappedVertexIndexes[index];
-      as.iterateBroadphaseCandidates(vertexData, [&, polygon, index](const typename AccelStructureT::vertex_data_t& neighbor) {
+      as.forEachBroadphaseNeighborCandidates(vertexData, [&, polygon, index](const typename AccelStructureT::vertex_data_t& neighbor) {
         const auto neighborRemappedIndex = remappedVertexIndexes[neighbor.index];
         if (shouldWeldFn(polygon, index, neighbor.index) && neighborRemappedIndex != INVALID_INDEX) {
           remappedVertexIndex = neighborRemappedIndex;