GPU buffer suballocator for meshes to reduce index buffer switching (#1094)

2025-04-28 09:10:53 +02:00
parent 5ba77946d1
commit 30917c9e1f
20 changed files with 993 additions and 56 deletions
@@ -1036,6 +1036,17 @@ void MeshWindow::SetEntity(Entity entity, int subset)
 		if (mesh->so_nor.IsValid()) ss += "\tstreamout_normals;\n";
 		if (mesh->so_tan.IsValid()) ss += "\tstreamout_tangents;\n";
 		if (mesh->so_pre.IsValid()) ss += "\tprevious_position;\n";
+
+		ss += "\nSuballocation offset: ";
+		if (mesh->generalBufferOffsetAllocation.IsValid())
+		{
+			ss += wi::helper::GetMemorySizeText(mesh->generalBufferOffsetAllocation.byte_offset);
+		}
+		else
+		{
+			ss += "suballocation is not used for this mesh";
+		}
+
 		meshInfoLabel.SetText(ss);

 		subsetComboBox.ClearItems();
@@ -16,6 +16,12 @@

 // Simple common math helpers:

+// Rounds an integer value up to the next multiple of alignment
+//	(a value that is already a multiple is returned unchanged).
+//	alignment must not be zero, because it is used as a divisor
+template<typename T>
+constexpr T align(T value, T alignment)
+{
+	const T padded = value + alignment - T(1);
+	return (padded / alignment) * alignment;
+}
+
 template <typename T>
 constexpr T sqr(T x) { return x * x; }

@@ -0,0 +1,475 @@
+// (C) Sebastian Aaltonen 2023
+// MIT License (see file: LICENSE)
+
+#include "offsetAllocator.hpp"
+
+#ifdef DEBUG
+#include <assert.h>
+#define ASSERT(x) assert(x)
+//#define DEBUG_VERBOSE
+#else
+#define ASSERT(x)
+#endif
+
+#ifdef DEBUG_VERBOSE
+#include <stdio.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstring>
+
+namespace OffsetAllocator
+{
+    // Number of leading zero bits in a 32 bit value.
+    //  v must not be zero (see the name): the compiler intrinsics used here give no defined result for 0
+    inline uint32 lzcnt_nonzero(uint32 v)
+    {
+#ifdef _MSC_VER
+        // _BitScanReverse writes the index of the highest set bit
+        unsigned long highestSetBit;
+        _BitScanReverse(&highestSetBit, v);
+        return 31 - highestSetBit;
+#else
+        return __builtin_clz(v);
+#endif
+    }
+
+    // Number of trailing zero bits in a 32 bit value, i.e. the index of the lowest set bit.
+    //  v must not be zero (see the name): the compiler intrinsics used here give no defined result for 0
+    inline uint32 tzcnt_nonzero(uint32 v)
+    {
+#ifdef _MSC_VER
+        // _BitScanForward writes the index of the lowest set bit
+        unsigned long lowestSetBit;
+        _BitScanForward(&lowestSetBit, v);
+        return lowestSetBit;
+#else
+        return __builtin_ctz(v);
+#endif
+    }
+
+    namespace SmallFloat
+    {
+        static constexpr uint32 MANTISSA_BITS = 3;
+        static constexpr uint32 MANTISSA_VALUE = 1 << MANTISSA_BITS;
+        static constexpr uint32 MANTISSA_MASK = MANTISSA_VALUE - 1;
+    
+        // Bin sizes follow floating point (exponent + mantissa) distribution (piecewise linear log approx)
+        // This ensures that for each size class, the average overhead percentage stays the same
+        // Converts a size to its small-float bin index (exponent in the high bits, MANTISSA_BITS of mantissa below),
+        //  rounding up so that the decoded bin size is never smaller than the input size
+        uint32 uintToFloatRoundUp(uint32 size)
+        {
+            // Denorm range 0..(MANTISSA_VALUE-1): exponent is zero, so the encoding is the value itself
+            if (size < MANTISSA_VALUE)
+                return size;
+
+            // Normalized: the highest set bit is implicit and not stored, just like in a float
+            const uint32 topBit = 31 - lzcnt_nonzero(size);
+            const uint32 shift = topBit - MANTISSA_BITS;
+            const uint32 exponent = shift + 1;
+            uint32 fraction = (size >> shift) & MANTISSA_MASK;
+
+            // Anything in the bits that were shifted out forces a round up
+            const uint32 droppedBitsMask = (1 << shift) - 1;
+            if ((size & droppedBitsMask) != 0)
+                ++fraction;
+
+            // Addition instead of OR: a mantissa that overflowed by the round up carries into the exponent
+            return (exponent << MANTISSA_BITS) + fraction;
+        }
+
+        // Converts a size to its small-float bin index (exponent in the high bits, MANTISSA_BITS of mantissa below),
+        //  truncating the low bits so that the decoded bin size is never larger than the input size
+        uint32 uintToFloatRoundDown(uint32 size)
+        {
+            // Denorm range 0..(MANTISSA_VALUE-1): exponent is zero, so the encoding is the value itself
+            if (size < MANTISSA_VALUE)
+                return size;
+
+            // Normalized: the highest set bit is implicit and not stored, just like in a float
+            const uint32 topBit = 31 - lzcnt_nonzero(size);
+            const uint32 shift = topBit - MANTISSA_BITS;
+            const uint32 exponent = shift + 1;
+            const uint32 fraction = (size >> shift) & MANTISSA_MASK;
+
+            return (exponent << MANTISSA_BITS) | fraction;
+        }
+    
+        // Decodes a small-float bin index back to the size it stands for
+        uint32 floatToUint(uint32 floatValue)
+        {
+            const uint32 exponent = floatValue >> MANTISSA_BITS;
+            const uint32 mantissa = floatValue & MANTISSA_MASK;
+
+            // Denorms store the value directly in the mantissa
+            if (exponent == 0)
+                return mantissa;
+
+            // Normalized: put the implicit high bit back, then scale by the exponent
+            const uint32 withHiddenBit = mantissa | MANTISSA_VALUE;
+            return withHiddenBit << (exponent - 1);
+        }
+    }
+
+    // Utility functions
+    // Returns the index of the lowest set bit of bitMask located at bit startBitIndex or higher,
+    //  or Allocation::NO_SPACE when there is no such bit
+    uint32 findLowestSetBitAfter(uint32 bitMask, uint32 startBitIndex)
+    {
+        // There is no bit 32 or above to find, and shifting by the full width would be undefined
+        if (startBitIndex >= 32) return Allocation::NO_SPACE;
+
+        // The mask is built from an unsigned 1: with a signed int, startBitIndex == 31
+        //  would compute (1 << 31) - 1, which overflows the signed range
+        uint32 maskBeforeStartIndex = (uint32(1) << startBitIndex) - 1;
+        uint32 maskAfterStartIndex = ~maskBeforeStartIndex;
+        uint32 bitsAfter = bitMask & maskAfterStartIndex;
+        if (bitsAfter == 0) return Allocation::NO_SPACE;
+        return tzcnt_nonzero(bitsAfter);
+    }
+
+    // Allocator...
+    // Sets up the allocator to manage `size` units with at most `maxAllocs` nodes, then resets it to the empty state
+    void Allocator::init(uint32 size, uint32 maxAllocs)
+    {
+        m_size = size;
+        m_maxAllocs = maxAllocs;
+
+        // Capacity is requested up front; reset() below resizes both arrays to m_maxAllocs
+        m_nodes.reserve(maxAllocs);
+        m_freeNodes.reserve(maxAllocs);
+
+        // A 2 byte NodeIndex cannot address more nodes than this
+        if (sizeof(NodeIndex) == 2)
+        {
+            ASSERT(maxAllocs <= 65536);
+        }
+
+        reset();
+    }
+
+    // Drops every allocation and returns to the start state: the whole storage is one free node
+    void Allocator::reset()
+    {
+        m_freeStorage = 0;
+        m_usedBinsTop = 0;
+        m_freeOffset = m_maxAllocs - 1;
+
+        // No bin holds a node
+        for (auto& leafMask : m_usedBins)
+            leafMask = 0;
+        for (auto& binHead : m_binIndices)
+            binHead = Node::unused;
+
+        // Rebuild both arrays so that every node is default constructed again
+        m_nodes.clear();
+        m_freeNodes.clear();
+        m_nodes.resize(m_maxAllocs);
+        m_freeNodes.resize(m_maxAllocs);
+
+        // Freelist is a stack. Indices are stored in descending order so that node 0 sits on top and pops first.
+        uint32 nextIndex = m_maxAllocs;
+        for (auto& slot : m_freeNodes)
+        {
+            slot = --nextIndex;
+        }
+
+        // Start state: whole storage as one big node
+        // Allocations split it and push the remainders back as smaller nodes
+        insertNodeIntoBin(m_size, 0);
+    }
+    
+    // Allocates `size` units from the smallest free node that is guaranteed to fit them.
+    //  Returns an Allocation with offset == metadata == Allocation::NO_SPACE on failure.
+    Allocation Allocator::allocate(uint32 size)
+    {
+        // Value returned from every failure path
+        Allocation failed;
+        failed.offset = Allocation::NO_SPACE;
+        failed.metadata = Allocation::NO_SPACE;
+
+        // Out of allocations?
+        if (m_freeOffset == 0)
+            return failed;
+
+        // Round up to bin index to ensure that alloc >= bin
+        // Gives us min bin index that fits the size
+        const uint32 minBinIndex = SmallFloat::uintToFloatRoundUp(size);
+        const uint32 minTopBinIndex = minBinIndex >> TOP_BINS_INDEX_SHIFT;
+        const uint32 minLeafBinIndex = minBinIndex & LEAF_BINS_INDEX_MASK;
+
+        uint32 topBinIndex = minTopBinIndex;
+        uint32 leafBinIndex = Allocation::NO_SPACE;
+
+        // If the minimum top bin has nodes, scan its leaf bins from the minimum leaf. This can fail (NO_SPACE).
+        const bool minTopBinInUse = (m_usedBinsTop & (1 << topBinIndex)) != 0;
+        if (minTopBinInUse)
+        {
+            leafBinIndex = findLowestSetBitAfter(m_usedBins[topBinIndex], minLeafBinIndex);
+        }
+
+        // Nothing usable in the minimum top bin: continue the search from the next top bin
+        if (leafBinIndex == Allocation::NO_SPACE)
+        {
+            topBinIndex = findLowestSetBitAfter(m_usedBinsTop, minTopBinIndex + 1);
+
+            // Out of space?
+            if (topBinIndex == Allocation::NO_SPACE)
+                return failed;
+
+            // All leaf bins here fit the alloc, since the top bin was rounded up. Start leaf search from bit 0.
+            // NOTE: This search can't fail since at least one leaf bit was set because the top bit was set.
+            leafBinIndex = tzcnt_nonzero(m_usedBins[topBinIndex]);
+        }
+
+        const uint32 binIndex = (topBinIndex << TOP_BINS_INDEX_SHIFT) | leafBinIndex;
+
+        // Pop the top node of the bin. Bin top = node.next.
+        const uint32 nodeIndex = m_binIndices[binIndex];
+        Node& node = m_nodes[nodeIndex];
+        const uint32 nodeTotalSize = node.dataSize;
+        node.dataSize = size;
+        node.used = true;
+        m_binIndices[binIndex] = node.binListNext;
+        if (node.binListNext != Node::unused)
+        {
+            m_nodes[node.binListNext].binListPrev = Node::unused;
+        }
+        m_freeStorage -= nodeTotalSize;
+#ifdef DEBUG_VERBOSE
+        printf("Free storage: %u (-%u) (allocate)\n", m_freeStorage, nodeTotalSize);
+#endif
+
+        // Bin empty?
+        if (m_binIndices[binIndex] == Node::unused)
+        {
+            // Remove a leaf bin mask bit
+            m_usedBins[topBinIndex] &= ~(1 << leafBinIndex);
+
+            // All leaf bins empty? Then remove the top bin mask bit as well
+            if (m_usedBins[topBinIndex] == 0)
+            {
+                m_usedBinsTop &= ~(1 << topBinIndex);
+            }
+        }
+
+        // Push the unused tail of the node back as a new free node in a lower bin
+        const uint32 remainderSize = nodeTotalSize - size;
+        if (remainderSize > 0)
+        {
+            const uint32 remainderIndex = insertNodeIntoBin(remainderSize, node.dataOffset + size);
+            Node& remainder = m_nodes[remainderIndex];
+
+            // Link nodes next to each other so that we can merge them later if both are free
+            // And update the old next neighbor to point to the new node (in middle)
+            if (node.neighborNext != Node::unused)
+            {
+                m_nodes[node.neighborNext].neighborPrev = remainderIndex;
+            }
+            remainder.neighborPrev = nodeIndex;
+            remainder.neighborNext = node.neighborNext;
+            node.neighborNext = remainderIndex;
+        }
+
+        Allocation result;
+        result.offset = node.dataOffset;
+        result.metadata = nodeIndex;
+        return result;
+    }
+    
+    // Returns an allocation to the allocator, merging it with free neighbors on either side.
+    //  An allocation that does not refer to a node (failed allocate(), default constructed Allocation,
+    //  allocator not initialized) is ignored instead of indexing outside of the node array.
+    void Allocator::free(Allocation allocation)
+    {
+        // Still reported in debug builds, but builds without ASSERT must not read m_nodes[NO_SPACE]
+        ASSERT(allocation.metadata != Allocation::NO_SPACE);
+        if (allocation.metadata == Allocation::NO_SPACE) return;
+        // Covers the uninitialized allocator (empty node array) as well as any other out of range index
+        if (allocation.metadata >= m_nodes.size()) return;
+        
+        uint32 nodeIndex = allocation.metadata;
+        Node& node = m_nodes[nodeIndex];
+        
+        // Double delete check
+        ASSERT(node.used == true);
+        
+        // Merge with neighbors...
+        uint32 offset = node.dataOffset;
+        uint32 size = node.dataSize;
+        
+        if ((node.neighborPrev != Node::unused) && (m_nodes[node.neighborPrev].used == false))
+        {
+            // Previous (contiguous) free node: Change offset to previous node offset. Sum sizes
+            Node& prevNode = m_nodes[node.neighborPrev];
+            offset = prevNode.dataOffset;
+            size += prevNode.dataSize;
+            
+            // Remove node from the bin linked list and put it in the freelist
+            removeNodeFromBin(node.neighborPrev);
+            
+            ASSERT(prevNode.neighborNext == nodeIndex);
+            node.neighborPrev = prevNode.neighborPrev;
+        }
+        
+        if ((node.neighborNext != Node::unused) && (m_nodes[node.neighborNext].used == false))
+        {
+            // Next (contiguous) free node: Offset remains the same. Sum sizes.
+            Node& nextNode = m_nodes[node.neighborNext];
+            size += nextNode.dataSize;
+            
+            // Remove node from the bin linked list and put it in the freelist
+            removeNodeFromBin(node.neighborNext);
+            
+            ASSERT(nextNode.neighborPrev == nodeIndex);
+            node.neighborNext = nextNode.neighborNext;
+        }
+
+        // Remember the outer neighbors before the node is recycled below
+        uint32 neighborNext = node.neighborNext;
+        uint32 neighborPrev = node.neighborPrev;
+        
+        // Insert the removed node to freelist
+#ifdef DEBUG_VERBOSE
+        printf("Putting node %u into freelist[%u] (free)\n", nodeIndex, m_freeOffset + 1);
+#endif
+        m_freeNodes[++m_freeOffset] = nodeIndex;
+
+        // Insert the (combined) free node to bin
+        uint32 combinedNodeIndex = insertNodeIntoBin(size, offset);
+
+        // Connect neighbors with the new combined node
+        if (neighborNext != Node::unused)
+        {
+            m_nodes[combinedNodeIndex].neighborNext = neighborNext;
+            m_nodes[neighborNext].neighborPrev = combinedNodeIndex;
+        }
+        if (neighborPrev != Node::unused)
+        {
+            m_nodes[combinedNodeIndex].neighborPrev = neighborPrev;
+            m_nodes[neighborPrev].neighborNext = combinedNodeIndex;
+        }
+    }
+
+	// Takes a node from the freelist, describes the free region [dataOffset, dataOffset + size) with it
+	//	and puts it on top of the bin matching its size. Returns the index of that node.
+	//	The node comes back with used == false and no neighbor links; callers connect neighbors afterwards.
+	uint32 Allocator::insertNodeIntoBin(uint32 size, uint32 dataOffset)
+	{
+		// Round down to bin index to ensure that bin >= alloc
+		const uint32 binIndex = SmallFloat::uintToFloatRoundDown(size);
+
+		const uint32 topBinIndex = binIndex >> TOP_BINS_INDEX_SHIFT;
+		const uint32 leafBinIndex = binIndex & LEAF_BINS_INDEX_MASK;
+
+		// Bin was empty before?
+		if (m_binIndices[binIndex] == Node::unused)
+		{
+			// Set bin mask bits
+			m_usedBins[topBinIndex] |= 1 << leafBinIndex;
+			m_usedBinsTop |= 1 << topBinIndex;
+		}
+
+		// Take a freelist node and insert on top of the bin linked list (next = old top)
+		const uint32 topNodeIndex = m_binIndices[binIndex];
+		const uint32 nodeIndex = m_freeNodes[m_freeOffset--];
+#ifdef DEBUG_VERBOSE
+		printf("Getting node %u from freelist[%u]\n", nodeIndex, m_freeOffset + 1);
+#endif
+		// Recycled nodes still carry the state of their previous life. Start from a default node so that
+		//	- used is cleared (otherwise a freed region stays marked as used and never merges with its neighbors)
+		//	- binListPrev and the neighbor links don't point at stale nodes
+		Node& node = m_nodes[nodeIndex];
+		node = Node();
+		node.dataOffset = dataOffset;
+		node.dataSize = size;
+		node.binListNext = topNodeIndex;
+		if (topNodeIndex != Node::unused) m_nodes[topNodeIndex].binListPrev = nodeIndex;
+		m_binIndices[binIndex] = nodeIndex;
+
+		m_freeStorage += size;
+#ifdef DEBUG_VERBOSE
+		printf("Free storage: %u (+%u) (insertNodeIntoBin)\n", m_freeStorage, size);
+#endif
+
+		return nodeIndex;
+	}
+    
+    // Unlinks a free node from the linked list of its bin, pushes it to the freelist
+    //  and subtracts its size from the free storage counter
+    void Allocator::removeNodeFromBin(uint32 nodeIndex)
+    {
+        Node& node = m_nodes[nodeIndex];
+        const uint32 prevInBin = node.binListPrev;
+        const uint32 nextInBin = node.binListNext;
+
+        if (prevInBin == Node::unused)
+        {
+            // Hard case: We are the first node in a bin. Find the bin.
+
+            // Round down to bin index to ensure that bin >= alloc
+            const uint32 binIndex = SmallFloat::uintToFloatRoundDown(node.dataSize);
+            const uint32 topBinIndex = binIndex >> TOP_BINS_INDEX_SHIFT;
+            const uint32 leafBinIndex = binIndex & LEAF_BINS_INDEX_MASK;
+
+            // The next node (if any) becomes the new top of the bin
+            m_binIndices[binIndex] = nextInBin;
+            if (nextInBin != Node::unused)
+            {
+                m_nodes[nextInBin].binListPrev = Node::unused;
+            }
+
+            // Bin empty?
+            if (m_binIndices[binIndex] == Node::unused)
+            {
+                // Remove a leaf bin mask bit
+                m_usedBins[topBinIndex] &= ~(1 << leafBinIndex);
+
+                // All leaf bins empty? Then remove the top bin mask bit as well
+                if (m_usedBins[topBinIndex] == 0)
+                {
+                    m_usedBinsTop &= ~(1 << topBinIndex);
+                }
+            }
+        }
+        else
+        {
+            // Easy case: We have previous node. Just remove this node from the middle of the list.
+            m_nodes[prevInBin].binListNext = nextInBin;
+            if (nextInBin != Node::unused)
+            {
+                m_nodes[nextInBin].binListPrev = prevInBin;
+            }
+        }
+
+        // Insert the node to freelist
+#ifdef DEBUG_VERBOSE
+        printf("Putting node %u into freelist[%u] (removeNodeFromBin)\n", nodeIndex, m_freeOffset + 1);
+#endif
+        m_freeNodes[++m_freeOffset] = nodeIndex;
+
+        m_freeStorage -= node.dataSize;
+#ifdef DEBUG_VERBOSE
+        printf("Free storage: %u (-%u) (removeNodeFromBin)\n", m_freeStorage, node.dataSize);
+#endif
+    }
+
+    // Returns the size stored in the node of the allocation,
+    //  or 0 for a NO_SPACE allocation or when the allocator has no nodes
+    uint32 Allocator::allocationSize(Allocation allocation) const
+    {
+        const bool refersToNode = (allocation.metadata != Allocation::NO_SPACE) && !m_nodes.empty();
+        if (refersToNode)
+        {
+            return m_nodes[allocation.metadata].dataSize;
+        }
+        return 0;
+    }
+
+    // Reports the total free storage and the size of the highest occupied bin.
+    //  Both values are zero when the allocator is out of allocations (m_freeOffset == 0).
+    StorageReport Allocator::storageReport() const
+    {
+        StorageReport report;
+        report.totalFreeSpace = 0;
+        report.largestFreeRegion = 0;
+
+        // Out of allocations? -> Zero free space
+        if (m_freeOffset > 0)
+        {
+            report.totalFreeSpace = m_freeStorage;
+
+            if (m_usedBinsTop != 0)
+            {
+                // Highest occupied top bin, then the highest occupied leaf bin inside of it
+                const uint32 topBinIndex = 31 - lzcnt_nonzero(m_usedBinsTop);
+                const uint32 leafBinIndex = 31 - lzcnt_nonzero(m_usedBins[topBinIndex]);
+                const uint32 binIndex = (topBinIndex << TOP_BINS_INDEX_SHIFT) | leafBinIndex;
+                report.largestFreeRegion = SmallFloat::floatToUint(binIndex);
+                ASSERT(report.totalFreeSpace >= report.largestFreeRegion);
+            }
+        }
+
+        return report;
+    }
+
+    // Reports, for every bin, the size that the bin index decodes to and the number of free nodes linked in it
+    StorageReportFull Allocator::storageReportFull() const
+    {
+        StorageReportFull report;
+        for (uint32 bin = 0; bin < NUM_LEAF_BINS; ++bin)
+        {
+            // Walk the linked list of the bin to count its nodes
+            uint32 nodesInBin = 0;
+            for (uint32 cursor = m_binIndices[bin]; cursor != Node::unused; cursor = m_nodes[cursor].binListNext)
+            {
+                ++nodesInBin;
+            }
+
+            StorageReportFull::Region& region = report.freeRegions[bin];
+            region.size = SmallFloat::floatToUint(bin);
+            region.count = nodesInBin;
+        }
+        return report;
+    }
+}
@@ -0,0 +1,115 @@
+#pragma once
+// (C) Sebastian Aaltonen 2023
+// MIT License (see file: LICENSE)
+
+// Modified for Wicked Engine
+//  - removed cpp20 features
+//	- removed constructors
+//	- changed node storage to std::vector
+//	- reduced size of Node structure
+
+//#define USE_16_BIT_NODE_INDICES
+
+#include <vector>
+
+namespace OffsetAllocator
+{
+    typedef unsigned char uint8;
+    typedef unsigned short uint16;
+    typedef unsigned int uint32;
+
+    // 16 bit offsets mode will halve the metadata storage cost
+    // But it only supports up to 65536 maximum allocation count
+#ifdef USE_16_BIT_NODE_INDICES
+    typedef uint16 NodeIndex;
+	static constexpr uint32 default_maxallocations = 64 * 1024;
+#else
+    typedef uint32 NodeIndex;
+	static constexpr uint32 default_maxallocations = 128 * 1024;
+#endif
+
+    static constexpr uint32 NUM_TOP_BINS = 32;
+    static constexpr uint32 BINS_PER_LEAF = 8;
+    static constexpr uint32 TOP_BINS_INDEX_SHIFT = 3;
+    static constexpr uint32 LEAF_BINS_INDEX_MASK = 0x7;
+    static constexpr uint32 NUM_LEAF_BINS = NUM_TOP_BINS * BINS_PER_LEAF;
+
+    // Handle returned by Allocator::allocate() and consumed by Allocator::free()
+    struct Allocation
+    {
+        // Value of both fields when the handle doesn't refer to an allocation
+        static constexpr uint32 NO_SPACE = 0xffffffff;
+
+        // Offset of the allocated range inside the managed storage
+        uint32 offset = NO_SPACE;
+
+        // internal: node index
+        NodeIndex metadata = NO_SPACE;
+    };
+
+    // Summary returned by Allocator::storageReport()
+    struct StorageReport
+    {
+        uint32 totalFreeSpace{ 0 };
+        uint32 largestFreeRegion{ 0 };
+    };
+
+    // Per-bin statistics returned by Allocator::storageReportFull()
+    struct StorageReportFull
+    {
+        // One entry per bin: the size of the bin and how many free nodes it holds
+        struct Region
+        {
+            uint32 size{ 0 };
+            uint32 count{ 0 };
+        };
+
+        Region freeRegions[NUM_LEAF_BINS];
+    };
+
+    // Offset allocator: hands out ranges of an abstract storage of `size` units.
+    //  Free ranges are kept as nodes in size-class bins (two level bit masks locate a non-empty bin),
+    //  and nodes are linked to their neighbors so that freed ranges can be merged.
+    //  init() must be called before any other function.
+    class Allocator
+    {
+    public:
+        // size: number of units managed, maxAllocs: number of nodes available for allocations and free ranges
+        void init(uint32 size, uint32 maxAllocs = default_maxallocations);
+        // Frees everything: the whole storage becomes one free node again
+        void reset();
+
+        // Returns an Allocation with offset == Allocation::NO_SPACE on failure
+        Allocation allocate(uint32 size);
+        void free(Allocation allocation);
+
+        uint32 allocationSize(Allocation allocation) const;
+        StorageReport storageReport() const;
+        StorageReportFull storageReportFull() const;
+
+    private:
+        uint32 insertNodeIntoBin(uint32 size, uint32 dataOffset);
+        void removeNodeFromBin(uint32 nodeIndex);
+
+        // A range of the storage, either allocated (used) or free (linked into a bin)
+        struct Node
+        {
+            // Marks a missing link
+            static constexpr NodeIndex unused = 0xffffffff;
+
+            uint32 dataOffset : 32;
+            uint32 dataSize : 31;
+            uint32 used : 1;
+            // Links inside the bin linked list
+            NodeIndex binListPrev : 32;
+            NodeIndex binListNext : 32;
+            // Links to the nodes that are adjacent in the storage
+            NodeIndex neighborPrev : 32;
+            NodeIndex neighborNext : 32;
+
+            // Bit-fields are initialized here in declaration order
+            Node() :
+                dataOffset(0),
+                dataSize(0),
+                used(0),
+                binListPrev(unused),
+                binListNext(unused),
+                neighborPrev(unused),
+                neighborNext(unused)
+            {
+            }
+        };
+
+        uint32 m_size = 0;
+        uint32 m_maxAllocs = 0;
+        uint32 m_freeStorage = 0;
+
+        // Bit i is set while top bin i has at least one non-empty leaf bin
+        uint32 m_usedBinsTop = 0;
+        // Per top bin: one bit for each non-empty leaf bin
+        uint8 m_usedBins[NUM_TOP_BINS] = {};
+        // Per bin: index of the first node of the bin list
+        NodeIndex m_binIndices[NUM_LEAF_BINS] = {};
+
+        std::vector<Node> m_nodes;
+        // Stack of node indices that are currently not in use, m_freeOffset is the index of its top
+        std::vector<NodeIndex> m_freeNodes;
+        uint32 m_freeOffset = 0;
+    };
+}
@@ -314,6 +314,7 @@
    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\lodepng.h" />
    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\meshoptimizer.h" />
    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\minimp4.h" />
+    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.hpp" />
    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\pugiconfig.hpp" />
    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\pugixml.hpp" />
    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\WinAdapter.h" />
@@ -610,6 +611,7 @@
    <ClCompile Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\vertexfilter.cpp" />
    <ClCompile Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\vfetchanalyzer.cpp" />
    <ClCompile Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\vfetchoptimizer.cpp" />
+    <ClCompile Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.cpp" />
    <ClCompile Include="$(MSBuildThisFileDirectory)Utility\pugixml.cpp" />
    <ClCompile Include="$(MSBuildThisFileDirectory)wiAsync_BindLua.cpp" />
    <ClCompile Include="$(MSBuildThisFileDirectory)wiConfig.cpp" />
@@ -1389,6 +1389,9 @@
    <ClInclude Include="$(MSBuildThisFileDirectory)Jolt\RegisterTypes.h">
      <Filter>JOLT</Filter>
    </ClInclude>
+    <ClInclude Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.hpp">
+      <Filter>UTILITY</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="$(MSBuildThisFileDirectory)LUA\lapi.c">
@@ -2189,6 +2192,9 @@
    <ClCompile Include="$(MSBuildThisFileDirectory)Jolt\RegisterTypes.cpp">
      <Filter>JOLT</Filter>
    </ClCompile>
+    <ClCompile Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.cpp">
+      <Filter>UTILITY</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <None Include="$(MSBuildThisFileDirectory)Utility\DirectXCollision.inl">
@@ -2,11 +2,18 @@
 #include "CommonInclude.h"
 #include "wiVector.h"

+#include "Utility/offsetAllocator.hpp"
+
+#include <mutex>
+#include <atomic>
+#include <memory>
 #include <cassert>
 #include <algorithm>
+#include <deque>

 namespace wi::allocator
 {
+	// Allocation of consecutive bytes, but no freeing, instead the whole allocator can be reset
 	struct LinearAllocator
 	{
 		uint8_t* data = nullptr;
@@ -38,6 +45,7 @@ namespace wi::allocator
 		}
 	};

+	// Allocation and freeing of single elements of the same size
 	template<typename T, size_t block_size = 256>
 	struct BlockAllocator
 	{
@@ -71,5 +79,174 @@ namespace wi::allocator
 			ptr->~T();
 			free_list.push_back(ptr);
 		}
+
+		// Returns true when the free list holds as many entries as all blocks provide together
+		inline bool is_empty() const
+		{
+			const auto total_slots = blocks.size() * block_size;
+			return total_slots == free_list.size();
+		}
+	};
+
+	// Allocation and freeing of an arbitrary number of bytes, managed in pages of the same size
+	//	- this is a wrapper around OffsetAllocator that adds thread safety and refcounting
+	//	- also supports deferred release for suballocated GPU resources
+	struct PageAllocator
+	{
+		uint32_t page_count = 0;
+		uint32_t page_size = 0;
+		// Shared state of one allocation, all copies of an Allocation point to the same instance
+		struct AllocationInternal
+		{
+			std::atomic<int> refcount{ 0 };
+			OffsetAllocator::Allocation allocation;
+		};
+		// State of the allocator itself, everything in here is guarded by locker
+		struct AllocatorInternal
+		{
+			std::mutex locker;
+			OffsetAllocator::Allocator allocator;
+			BlockAllocator<AllocationInternal> internal_blocks;
+			bool deferred_release_enabled = false;
+			uint64_t deferred_release_frame = 0;
+			// pairs of (allocation, frame in which it was released), oldest first
+			std::deque<std::pair<OffsetAllocator::Allocation, uint64_t>> deferred_release_queue;
+		};
+		std::shared_ptr<AllocatorInternal> allocator; // shared ptr is used to let any allocations extend the lifetime of the allocator
+
+		// Returns the total size that the allocator manages:
+		constexpr uint64_t total_size_in_bytes() const { return uint64_t(page_count) * uint64_t(page_size); }
+
+		// Calculates the page count that will accommodate an allocation size request
+		//	page_size must be non-zero (set by init), it is used as a divisor
+		constexpr uint32_t page_count_from_bytes(uint64_t sizeInBytes) const { return uint32_t(align((uint64_t)sizeInBytes, (uint64_t)page_size) / (uint64_t)page_size); }
+
+		// Initializes the allocator, only after which it can be used
+		//	total_size_in_bytes	:	the allocator will manage this number of bytes
+		//	page_size			:	the allocation granularity in bytes, each allocation will be aligned to this
+		//	deferred_release	:	if false, allocations are freed immediately (suitable for CPU only allocations), otherwise they are freed after a number of frames passed (which should be used for GPU allocations)
+		void init(uint64_t total_size_in_bytes, uint32_t page_size = 64u * 1024u, bool deferred_release = false)
+		{
+			this->page_size = page_size;
+			this->page_count = page_count_from_bytes(total_size_in_bytes);
+			allocator = std::make_shared<AllocatorInternal>();
+			allocator->allocator.init(page_count, std::min(page_count, OffsetAllocator::default_maxallocations));
+			allocator->deferred_release_enabled = deferred_release;
+			allocator->deferred_release_frame = 0;
+			allocator->deferred_release_queue.clear();
+		}
+		// This needs to be called every frame if deferred release is enabled:
+		//	framecount	:	the current frame, releases from now on are stamped with it
+		//	buffercount	:	a queued release is reclaimed once more than this many frames passed since it was queued
+		void update_deferred_release(uint64_t framecount, uint32_t buffercount)
+		{
+			if (allocator == nullptr)
+				return;
+			std::scoped_lock lck(allocator->locker);
+			allocator->deferred_release_frame = framecount;
+			while (!allocator->deferred_release_queue.empty() && allocator->deferred_release_queue.front().second + buffercount < framecount)
+			{
+				allocator->allocator.free(allocator->deferred_release_queue.front().first);
+				allocator->deferred_release_queue.pop_front();
+			}
+		}
+
+		// Reference counted handle of an allocation, the pages are released when the last copy is reset or destroyed
+		struct Allocation
+		{
+			std::shared_ptr<AllocatorInternal> allocator; // the allocator is retained so that allocation can deallocate itself
+			AllocationInternal* internal_state = nullptr; // this is pointing within the allocator which is retained by shared_ptr
+			uint64_t byte_offset = ~0ull;
+
+			Allocation()
+			{
+				Reset();
+			}
+			Allocation(const Allocation& other)
+			{
+				Reset();
+				allocator = other.allocator;
+				internal_state = other.internal_state;
+				byte_offset = other.byte_offset;
+				if (internal_state != nullptr)
+				{
+					internal_state->refcount.fetch_add(1);
+				}
+			}
+			Allocation(Allocation&& other) noexcept
+			{
+				Reset();
+				allocator = std::move(other.allocator);
+				internal_state = other.internal_state;
+				byte_offset = other.byte_offset;
+				other.allocator = nullptr;
+				other.internal_state = nullptr;
+				other.byte_offset = ~0ull;
+			}
+			~Allocation()
+			{
+				Reset();
+			}
+			void operator=(const Allocation& other)
+			{
+				// Self assignment must be a no-op: Reset() below would otherwise drop the reference
+				//	(and possibly free the pages) before it could be copied back
+				if (this == &other)
+					return;
+				Reset();
+				allocator = other.allocator;
+				internal_state = other.internal_state;
+				byte_offset = other.byte_offset;
+				if (internal_state != nullptr)
+				{
+					internal_state->refcount.fetch_add(1);
+				}
+			}
+			void operator=(Allocation&& other) noexcept
+			{
+				// Self move must be a no-op for the same reason as self assignment
+				if (this == &other)
+					return;
+				Reset();
+				allocator = std::move(other.allocator);
+				internal_state = other.internal_state;
+				byte_offset = other.byte_offset;
+				other.allocator = nullptr;
+				other.internal_state = nullptr;
+				other.byte_offset = ~0ull;
+			}
+			// Drops this reference and makes the handle invalid; the last reference releases the pages
+			void Reset()
+			{
+				if (IsValid() && (internal_state->refcount.fetch_sub(1) <= 1))
+				{
+					std::scoped_lock lck(allocator->locker);
+					if (allocator->deferred_release_enabled)
+					{
+						// can only be reclaimed after buffering amount of frames passed, this is usually used for GPU resources:
+						allocator->deferred_release_queue.push_back(std::make_pair(internal_state->allocation, allocator->deferred_release_frame));
+					}
+					else
+					{
+						// reclaimed immediately:
+						allocator->allocator.free(internal_state->allocation);
+					}
+					allocator->internal_blocks.free(internal_state);
+				}
+				allocator = {};
+				internal_state = nullptr;
+				byte_offset = ~0ull;
+			}
+
+			constexpr bool IsValid() const { return internal_state != nullptr; }
+		};
+
+		// Allocates a reference counted allocation, viewing at least the requested amount of bytes
+		//	To check if the allocation succeeded, call IsValid() on the returned object
+		//	An invalid allocation is returned if init() was not called yet
+		inline Allocation allocate(size_t sizeInBytes)
+		{
+			Allocation alloc;
+			if (allocator == nullptr)
+				return alloc;
+			const uint32_t pages = page_count_from_bytes(sizeInBytes);
+			std::scoped_lock lck(allocator->locker);
+			OffsetAllocator::Allocation offsetallocation = allocator->allocator.allocate(pages);
+			if (offsetallocation.offset != OffsetAllocator::Allocation::NO_SPACE)
+			{
+				alloc.allocator = allocator;
+				alloc.internal_state = allocator->internal_blocks.allocate();
+				alloc.internal_state->refcount.store(1);
+				alloc.internal_state->allocation = offsetallocation;
+				// widened before multiplying: page index * page size doesn't fit 32 bits once the managed range exceeds 4 GB
+				alloc.byte_offset = uint64_t(offsetallocation.offset) * uint64_t(page_size);
+			}
+			return alloc;
+		}
+
+		// returns true if no pages are allocated (also when init() was not called yet)
+		inline bool is_empty()
+		{
+			if (allocator == nullptr)
+				return true;
+			// the allocator state is modified under this lock by allocate, Reset and update_deferred_release
+			std::scoped_lock lck(allocator->locker);
+			return allocator->allocator.storageReport().totalFreeSpace == page_count;
+		}
 	};
 }
@@ -318,6 +318,7 @@ namespace wi
 		wi::input::ClearForNextFrame();
 		wi::profiler::EndFrame(cmd);
 		graphicsDevice->SubmitCommandLists();
+		wi::renderer::UpdateGPUSuballocator();
 	}

 	void Application::Update(float dt)
@@ -250,14 +250,14 @@ namespace wi::graphics
 			return CreateBuffer2(desc, [&](void* dest) { std::memcpy(dest, initial_data, desc->size); }, buffer, alias, alias_offset);
 		}

-		bool CreateBufferCleared(const GPUBufferDesc* desc, uint8_t value, GPUBuffer* buffer) const
+		bool CreateBufferCleared(const GPUBufferDesc* desc, uint8_t value, GPUBuffer* buffer, const GPUResource* alias = nullptr, uint64_t alias_offset = 0ull) const
 		{
-			return CreateBuffer2(desc, [&](void* dest) { std::memset(dest, value, desc->size); }, buffer);
+			return CreateBuffer2(desc, [&](void* dest) { std::memset(dest, value, desc->size); }, buffer, alias, alias_offset);
 		}

-		bool CreateBufferZeroed(const GPUBufferDesc* desc, GPUBuffer* buffer) const
+		bool CreateBufferZeroed(const GPUBufferDesc* desc, GPUBuffer* buffer, const GPUResource* alias = nullptr, uint64_t alias_offset = 0ull) const
 		{
-			return CreateBufferCleared(desc, 0, buffer);
+			return CreateBufferCleared(desc, 0, buffer, alias, alias_offset);
 		}

 		void Barrier(const GPUBarrier& barrier, CommandList cmd)
@@ -2422,6 +2422,7 @@ std::mutex queue_locker;

 				disabledMessages.push_back(D3D12_MESSAGE_ID_DRAW_EMPTY_SCISSOR_RECTANGLE);
 				disabledMessages.push_back(D3D12_MESSAGE_ID_SETPRIVATEDATA_CHANGINGPARAMS);
+				disabledMessages.push_back(D3D12_MESSAGE_ID_HEAP_ADDRESS_RANGE_INTERSECTS_MULTIPLE_BUFFERS);

 				D3D12_INFO_QUEUE_FILTER filter = {};
 				filter.AllowList.NumSeverities = static_cast<UINT>(enabledSeverities.size());
@@ -349,6 +349,18 @@ namespace wi::graphics
 			{
 				alignment = std::max(alignment, 16ull);
 			}
+			if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_BUFFER))
+			{
+				alignment = std::max(alignment, (uint64_t)D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
+			}
+			if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_NON_RT_DS))
+			{
+				alignment = std::max(alignment, (uint64_t)D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
+			}
+			if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_RT_DS))
+			{
+				alignment = std::max(alignment, (uint64_t)D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
+			}
 			return alignment;
 		}

@@ -3750,12 +3750,6 @@ using namespace vulkan_internal;
 	}
 	bool GraphicsDevice_Vulkan::CreateBuffer2(const GPUBufferDesc* desc, const std::function<void(void*)>& init_callback, GPUBuffer* buffer, const GPUResource* alias, uint64_t alias_offset) const
 	{
-#ifdef PLATFORM_LINUX
-		// Resource aliasing on Linux sometimes fails with VK_ERROR_UNKOWN so I disable it:
-		alias = nullptr;
-		alias_offset = 0;
-#endif // PLATFORM_LINUX
-
 		auto internal_state = std::make_shared<Buffer_Vulkan>();
 		internal_state->allocationhandler = allocationhandler;
 		buffer->internal_state = internal_state;
@@ -3854,6 +3848,10 @@ using namespace vulkan_internal;
 		{
 			VkMemoryRequirements memory_requirements = {};
 			memory_requirements.alignment = desc->alignment;
+			if (memory_requirements.alignment == 0)
+			{
+				memory_requirements.alignment = GetMinOffsetAlignment(desc);
+			}
 			memory_requirements.size = AlignTo(desc->size, memory_requirements.alignment);
 			memory_requirements.memoryTypeBits = ~0u;

@@ -473,6 +473,10 @@ namespace wi::graphics
 			{
 				alignment = std::max(alignment, properties2.properties.limits.minTexelBufferOffsetAlignment);
 			}
+			if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_BUFFER) || has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_NON_RT_DS) || has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_RT_DS))
+			{
+				alignment = std::max(alignment, uint64_t(64 * 1024)); // 64KB safety to match DX12, because cannot use vkGetBufferMemoryRequirements here
+			}
 			return alignment;
 		}

@@ -920,12 +920,6 @@ namespace wi
 				wi::renderer::UpdateRaytracingAccelerationStructures(*scene, cmd);
 			}

-			if (scene->weather.IsRealisticSky())
-			{
-				wi::renderer::ComputeSkyAtmosphereTextures(cmd);
-				wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
-			}
-
 			if (wi::renderer::GetSurfelGIEnabled())
 			{
 				wi::renderer::SurfelGI(
@@ -1164,16 +1158,6 @@ namespace wi
 				);
 			}

-			if (scene->weather.IsRealisticSky())
-			{
-				wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
-
-				if (scene->weather.IsRealisticSkyAerialPerspective())
-				{
-					wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
-				}
-			}
-
 			if (scene->weather.IsVolumetricClouds() && !scene->weather.IsVolumetricCloudsReceiveShadow())
 			{
 				// When volumetric cloud DOESN'T receive shadow it can be done async to shadow maps!
@@ -1305,17 +1289,6 @@ namespace wi
 					cmd
 				);

-				// Render SkyAtmosphere assets from planar reflections point of view
-				if (scene->weather.IsRealisticSky())
-				{
-					wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
-
-					if (scene->weather.IsRealisticSkyAerialPerspective())
-					{
-						wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
-					}
-				}
-
 				device->EventBegin("Planar reflections Z-Prepass", cmd);
 				auto range = wi::profiler::BeginRangeGPU("Planar Reflections Z-Prepass", cmd);

@@ -2623,6 +2623,73 @@ const GPUBuffer& GetIndexBufferForQuads(uint32_t max_quad_count)
 	return indexBufferForQuads32;
 }

+// Manages the big GPUBuffer blocks, each of which is used as backing storage for suballocations:
+struct GPUSubAllocator
+{
+	// Size in bytes of every block buffer that gets created (256 MB)
+	static constexpr uint64_t blocksize = 256ull << 20;
+
+	// One large buffer together with the allocator that hands out ranges of it
+	struct Block
+	{
+		wi::allocator::PageAllocator allocator;
+		GPUBuffer buffer;
+	};
+
+	wi::vector<Block> blocks; // all currently existing blocks
+	std::mutex locker; // locked by the suballocator functions before touching blocks
+};
+static GPUSubAllocator suballocator;
+// Suballocates `size` bytes from one of the global GPU buffer blocks (thread-safe).
+//	On success the returned allocation is valid and `alias` refers to the block buffer it lives in.
+//	An invalid (default) BufferSuballocation is returned when:
+//	- size is larger than half of the block size (such requests are never suballocated)
+//	- a new block buffer could not be created
+//	- the freshly created block could not fulfill the request
+//	In those cases the caller is expected to create a standalone buffer instead.
+BufferSuballocation SuballocateGPUBuffer(uint64_t size)
+{
+	if (size > GPUSubAllocator::blocksize / 2)
+		return {}; // invalid, larger allocations than half block size will not be suballocated
+
+	// The lock is held for the whole operation, so a newly created (still empty) block
+	//	can not be removed by UpdateGPUSuballocator() before it is allocated from
+	std::scoped_lock lock(suballocator.locker);
+
+	BufferSuballocation allocation;
+
+	// See if any of the large blocks can fulfill the allocation request:
+	for (auto& block : suballocator.blocks)
+	{
+		allocation.allocation = block.allocator.allocate(size);
+		if (allocation.allocation.IsValid())
+		{
+			allocation.alias = block.buffer;
+			//wilog("SuballocateGPUBuffer allocated size: %s, pages: %d, free space remaining: %s", wi::helper::GetMemorySizeText(size).c_str(), block.allocator.page_count_from_bytes(size), wi::helper::GetMemorySizeText(allocation.allocation.allocator->allocator.storageReport().totalFreeSpace * block.allocator.page_size).c_str());
+			return allocation;
+		}
+	}
+
+	// Allocation couldn't be fulfilled, create new block:
+	GPUBufferDesc desc;
+	desc.size = GPUSubAllocator::blocksize;
+	desc.usage = Usage::DEFAULT;
+	desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::VERTEX_BUFFER | BindFlag::INDEX_BUFFER;
+	desc.misc_flags = ResourceMiscFlag::ALIASING_BUFFER | ResourceMiscFlag::NO_DEFAULT_DESCRIPTORS;
+	desc.alignment = device->GetMinOffsetAlignment(&desc);
+	auto& block = suballocator.blocks.emplace_back();
+	const bool success = device->CreateBuffer(&desc, nullptr, &block.buffer);
+	assert(success);
+	if (!success)
+	{
+		// Never keep a block without a backing buffer registered (the assert is compiled out in release builds):
+		suballocator.blocks.pop_back();
+		return {};
+	}
+	device->SetName(&block.buffer, "GPUSubAllocator");
+	block.allocator.init(desc.size, (uint32_t)desc.alignment, true);
+	wilog("SuballocateGPUBuffer created buffer block with size: %s, with page size: %s, page count: %d", wi::helper::GetMemorySizeText(block.allocator.total_size_in_bytes()).c_str(), wi::helper::GetMemorySizeText(block.allocator.page_size).c_str(), (int)block.allocator.page_count);
+
+	// Allocate from the new block directly instead of retrying recursively:
+	//	if even the empty block can not fulfill the request, an invalid allocation is returned
+	//	rather than creating more and more blocks without end
+	allocation.allocation = block.allocator.allocate(size);
+	if (allocation.allocation.IsValid())
+	{
+		allocation.alias = block.buffer;
+	}
+	return allocation;
+}
+// Per-frame maintenance of the global suballocator:
+//	first lets every block process its deferred releases, then removes the first block
+//	that has become empty (at most one block is removed per call)
+void UpdateGPUSuballocator()
+{
+	std::scoped_lock guard(suballocator.locker);
+
+	for (auto& entry : suballocator.blocks)
+	{
+		entry.allocator.update_deferred_release(device->GetFrameCount(), device->GetBufferCount());
+	}
+
+	for (auto it = suballocator.blocks.begin(); it != suballocator.blocks.end(); ++it)
+	{
+		if (!it->allocator.is_empty())
+			continue;
+
+		// Stop right after erasing, the iterator must not be used any more:
+		suballocator.blocks.erase(it);
+		break;
+	}
+}
+
 void ModifyObjectSampler(const SamplerDesc& desc)
 {
 	if (initialized.load())
@@ -2961,7 +3028,8 @@ void RenderMeshes(
 	uint32_t prev_stencilref = STENCILREF_DEFAULT;
 	device->BindStencilRef(prev_stencilref, cmd);

-	const GPUBuffer* prev_ib = nullptr;
+	IndexBufferFormat prev_ibformat = IndexBufferFormat::UINT16;
+	const void* prev_ib_internal = nullptr;

 	// This will be called every time we start a new draw call:
 	auto batch_flush = [&]()
@@ -3092,10 +3160,16 @@ void RenderMeshes(
 				device->BindStencilRef(stencilRef, cmd);
 			}

-			if (!meshShaderPSO && prev_ib != &mesh.generalBuffer)
+			// Note: the mesh.generalBuffer can be either a standalone allocated buffer, or a suballocated one (to reduce index buffer switching)
+			const GPUBuffer* ib = mesh.generalBufferOffsetAllocation.IsValid() ? &mesh.generalBufferOffsetAllocationAlias : &mesh.generalBuffer;
+			const IndexBufferFormat ibformat = mesh.GetIndexFormat();
+			const void* ibinternal = ib->internal_state.get();
+
+			if (!meshShaderPSO && (prev_ib_internal != ibinternal || prev_ibformat != ibformat))
 			{
-				device->BindIndexBuffer(&mesh.generalBuffer, mesh.GetIndexFormat(), mesh.ib.offset, cmd);
-				prev_ib = &mesh.generalBuffer;
+				prev_ib_internal = ibinternal;
+				prev_ibformat = ibformat;
+				device->BindIndexBuffer(ib, ibformat, 0, cmd);
 			}

 			if (
@@ -3114,6 +3188,18 @@ void RenderMeshes(
 			push.instances = instanceBufferDescriptorIndex;
 			push.instance_offset = (uint)instancedBatch.dataOffset;

+			uint32_t indexOffset = 0;
+			if (mesh.generalBufferOffsetAllocation.IsValid())
+			{
+				// In case the mesh general buffer is suballocated, the indexOffset is calculated relative to the beginning of the aliased buffer block:
+				indexOffset = uint32_t(((uint64_t)mesh.generalBufferOffsetAllocation.byte_offset + mesh.ib.offset) / mesh.GetIndexStride()) + subset.indexOffset;
+			}
+			else
+			{
+				// In case the mesh general buffer is not suballocated, it is a standalone buffer and index offset is relative to itself
+				indexOffset = uint32_t(mesh.ib.offset / mesh.GetIndexStride()) + subset.indexOffset;
+			}
+
 			if (pso_backside != nullptr)
 			{
 				device->BindPipelineState(pso_backside, cmd);
@@ -3124,7 +3210,7 @@ void RenderMeshes(
 				}
 				else
 				{
-					device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, subset.indexOffset, 0, 0, cmd);
+					device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, indexOffset, 0, 0, cmd);
 				}
 			}

@@ -3136,7 +3222,7 @@ void RenderMeshes(
 			}
 			else
 			{
-				device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, subset.indexOffset, 0, 0, cmd);
+				device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, indexOffset, 0, 0, cmd);
 			}

 		}
@@ -5196,6 +5282,16 @@ void UpdateRenderDataAsync(
 		ComputeVolumetricCloudShadows(cmd, weatherMapFirst, weatherMapSecond);
 	}

+	if (vis.scene->weather.IsRealisticSky())
+	{
+		wi::renderer::ComputeSkyAtmosphereTextures(cmd);
+		wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
+		if (vis.scene->weather.IsRealisticSkyAerialPerspective())
+		{
+			wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
+		}
+	}
+
 	// GPU Particle systems simulation/sorting/culling:
 	if (!vis.visibleEmitters.empty() || vis.scene->weather.rain_amount > 0)
 	{
@@ -12,6 +12,7 @@
 #include "shaders/ShaderInterop_SurfelGI.h"
 #include "wiVector.h"
 #include "wiSpinLock.h"
+#include "wiAllocator.h"

 #include <memory>
 #include <limits>
@@ -66,8 +67,21 @@ namespace wi::renderer

 	// Returns a buffer preinitialized for quad index buffer laid out as:
 	//	vertexID * 4 + [0, 1, 2, 2, 1, 3]
+	//	Note: it will return 16-bit or 32-bit index buffer depending on max_quad_count
 	const wi::graphics::GPUBuffer& GetIndexBufferForQuads(uint32_t max_quad_count);

+	struct BufferSuballocation // result of SuballocateGPUBuffer()
+	{
+		wi::graphics::GPUBuffer alias; // the large block buffer that the suballocation was made from
+		wi::allocator::PageAllocator::Allocation allocation; // range inside the block; check IsValid() before use, byte_offset is the offset within alias
+	};
+	// Sub-allocate (thread-safe) from a global GPU buffer for memory aliasing purposes:
+	//	The buffer will be DEFAULT usage, usable as vertex buffer, index buffer and shader resource
+	//	The purpose is to suballocate smaller GPUBuffers inside a larger GPUBuffer and bind the large GPUBuffer once as index buffer,
+	//	while the small buffers can be allocated/deallocated from it with memory aliasing and also used regularly by themselves
+	BufferSuballocation SuballocateGPUBuffer(uint64_t size);
+	void UpdateGPUSuballocator(); // called every frame for deferred release of GPU suballocations
+
 	void ModifyObjectSampler(const wi::graphics::SamplerDesc& desc);

 	// Initializes the renderer
@@ -586,6 +586,7 @@ namespace wi::scene

 	void MeshComponent::DeleteRenderData()
 	{
+		generalBufferOffsetAllocation = {};
 		generalBuffer = {};
 		streamoutBuffer = {};
 		ib = {};
@@ -1291,9 +1292,25 @@ namespace wi::scene
 			}
 		};

-		bool success = device->CreateBuffer2(&bd, init_callback, &generalBuffer);
-		assert(success);
-		device->SetName(&generalBuffer, "MeshComponent::generalBuffer");
+		// The suballocation strategy is used to have all mesh buffers reside in a global buffer
+		//	With this we can avoid rebinding the index buffer for every mesh and can work purely with offsets
+		//	Though the index buffer will still need to be rebound if the index format changes, but that happens less frequently
+		wi::renderer::BufferSuballocation suballoc = wi::renderer::SuballocateGPUBuffer(bd.size);
+		if (suballoc.allocation.IsValid())
+		{
+			bool success = device->CreateBuffer2(&bd, init_callback, &generalBuffer, &suballoc.alias, suballoc.allocation.byte_offset);
+			assert(success);
+			device->SetName(&generalBuffer, "MeshComponent::generalBuffer (suballocated)");
+			generalBufferOffsetAllocation = std::move(suballoc.allocation);
+			generalBufferOffsetAllocationAlias = std::move(suballoc.alias);
+		}
+		else
+		{
+			// If suballocation was not successful, a standalone buffer can be created instead:
+			bool success = device->CreateBuffer2(&bd, init_callback, &generalBuffer);
+			assert(success);
+			device->SetName(&generalBuffer, "MeshComponent::generalBuffer");
+		}

 		assert(ib.IsValid());
 		const Format ib_format = GetIndexFormat() == IndexBufferFormat::UINT32 ? Format::R32_UINT : Format::R16_UINT;
@@ -15,6 +15,7 @@
 #include "wiUnorderedSet.h"
 #include "wiBVH.h"
 #include "wiPathQuery.h"
+#include "wiAllocator.h"

 namespace wi::scene
 {
@@ -182,7 +183,7 @@ namespace wi::scene
 		XMFLOAT4 emissiveColor = XMFLOAT4(1, 1, 1, 0);
 		XMFLOAT4 subsurfaceScattering = XMFLOAT4(1, 1, 1, 0);
 		XMFLOAT4 extinctionColor = XMFLOAT4(0, 0.9f, 1, 1);
-		XMFLOAT4 texMulAdd = XMFLOAT4(1, 1, 0, 0);
+		XMFLOAT4 texMulAdd = XMFLOAT4(1, 1, 0, 0); // dynamic multiplier (.xy) and addition (.zw) for UV coordinates
 		float roughness = 0.2f;
 		float reflectance = 0.02f;
 		float metalness = 0.0f;
@@ -655,7 +656,7 @@ namespace wi::scene
 			BVH_ENABLED = 1 << 8,
 			QUANTIZED_POSITIONS_DISABLED = 1 << 9,
 		};
-		uint32_t _flags = RENDERABLE;
+		// *uint32_t _flags is moved down for better struct padding...

 		wi::vector<XMFLOAT3> vertex_positions;
 		wi::vector<XMFLOAT3> vertex_normals;
@@ -714,6 +715,8 @@ namespace wi::scene
 		wi::primitive::AABB aabb;
 		wi::graphics::GPUBuffer generalBuffer; // index buffer + all static vertex buffers
 		wi::graphics::GPUBuffer streamoutBuffer; // all dynamic vertex buffers
+		wi::allocator::PageAllocator::Allocation generalBufferOffsetAllocation;
+		wi::graphics::GPUBuffer generalBufferOffsetAllocationAlias;
 		struct BufferView
 		{
 			uint64_t offset = ~0ull;
@@ -751,13 +754,6 @@ namespace wi::scene
 		XMFLOAT2 uv_range_max = XMFLOAT2(1, 1);

 		wi::vector<wi::graphics::RaytracingAccelerationStructure> BLASes; // one BLAS per LOD
-		enum BLAS_STATE
-		{
-			BLAS_STATE_NEEDS_REBUILD,
-			BLAS_STATE_NEEDS_REFIT,
-			BLAS_STATE_COMPLETE,
-		};
-		mutable BLAS_STATE BLAS_state = BLAS_STATE_NEEDS_REBUILD;

 		wi::vector<wi::primitive::AABB> bvh_leaf_aabbs;
 		wi::BVH bvh;
@@ -771,6 +767,16 @@ namespace wi::scene

 		RigidBodyPhysicsComponent precomputed_rigidbody_physics_shape; // you can precompute a physics shape here if you need without using a real rigid body component yet

+		uint32_t _flags = RENDERABLE; // *this is serialized but put here for better struct padding
+
+		enum BLAS_STATE // values stored in BLAS_state below; NOTE(review): by their names they describe pending work for the BLASes, confirm against the code that reads them
+		{
+			BLAS_STATE_NEEDS_REBUILD,
+			BLAS_STATE_NEEDS_REFIT,
+			BLAS_STATE_COMPLETE,
+		};
+		mutable BLAS_STATE BLAS_state = BLAS_STATE_NEEDS_REBUILD; // mutable, so it can be changed through a const MeshComponent; starts out as needing a rebuild
+
 		constexpr void SetRenderable(bool value) { if (value) { _flags |= RENDERABLE; } else { _flags &= ~RENDERABLE; } }
 		constexpr void SetDoubleSided(bool value) { if (value) { _flags |= DOUBLE_SIDED; } else { _flags &= ~DOUBLE_SIDED; } }
 		constexpr void SetDoubleSidedShadow(bool value) { if (value) { _flags |= DOUBLE_SIDED_SHADOW; } else { _flags &= ~DOUBLE_SIDED_SHADOW; } }
@@ -9,7 +9,7 @@ namespace wi::version
 	// minor features, major updates, breaking compatibility changes
 	const int minor = 71;
 	// minor bug fixes, alterations, refactors, updates
-	const int revision = 750;
+	const int revision = 751;

 	const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);

@@ -934,5 +934,28 @@ SOFTWARE.

 ###############################################################################################################################

+OffsetAllocator: https://github.com/sebbbi/OffsetAllocator

+MIT License

+Copyright (c) 2023 Sebastian Aaltonen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+###############################################################################################################################