GPU buffer suballocator for meshes to reduce index buffer switching (#1094)
This commit is contained in:
@@ -1036,6 +1036,17 @@ void MeshWindow::SetEntity(Entity entity, int subset)
|
||||
if (mesh->so_nor.IsValid()) ss += "\tstreamout_normals;\n";
|
||||
if (mesh->so_tan.IsValid()) ss += "\tstreamout_tangents;\n";
|
||||
if (mesh->so_pre.IsValid()) ss += "\tprevious_position;\n";
|
||||
|
||||
ss += "\nSuballocation offset: ";
|
||||
if (mesh->generalBufferOffsetAllocation.IsValid())
|
||||
{
|
||||
ss += wi::helper::GetMemorySizeText(mesh->generalBufferOffsetAllocation.byte_offset);
|
||||
}
|
||||
else
|
||||
{
|
||||
ss += "suballocation is not used for this mesh";
|
||||
}
|
||||
|
||||
meshInfoLabel.SetText(ss);
|
||||
|
||||
subsetComboBox.ClearItems();
|
||||
|
||||
@@ -16,6 +16,12 @@
|
||||
|
||||
// Simple common math helpers:
|
||||
|
||||
// Rounds value up to a whole multiple of alignment (for non-negative integers).
// A value that is already a multiple is returned unchanged.
// alignment must not be zero, because it is used as a divisor.
template<typename T>
constexpr T align(T value, T alignment)
{
	// number of alignment-sized chunks needed to cover value
	const auto chunks = (value + alignment - T(1)) / alignment;
	return chunks * alignment;
}
|
||||
|
||||
// Returns x multiplied by itself
template <typename T>
constexpr T sqr(T x)
{
	return x * x;
}
|
||||
|
||||
|
||||
@@ -0,0 +1,475 @@
|
||||
// (C) Sebastian Aaltonen 2023
|
||||
// MIT License (see file: LICENSE)
|
||||
|
||||
#include "offsetAllocator.hpp"
|
||||
|
||||
#ifdef DEBUG
|
||||
#include <assert.h>
|
||||
#define ASSERT(x) assert(x)
|
||||
//#define DEBUG_VERBOSE
|
||||
#else
|
||||
#define ASSERT(x)
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_VERBOSE
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace OffsetAllocator
|
||||
{
|
||||
// Counts the zero bits above the highest set bit of v.
// As the name says, v is expected to be non-zero.
inline uint32 lzcnt_nonzero(uint32 v)
{
#ifndef _MSC_VER
	const int leadingZeroBits = __builtin_clz(v);
	return leadingZeroBits;
#else
	// _BitScanReverse writes the index of the highest set bit
	unsigned long highestBitIndex;
	_BitScanReverse(&highestBitIndex, v);
	return 31 - highestBitIndex;
#endif
}
|
||||
|
||||
// Counts the zero bits below the lowest set bit of v, i.e. the index of that bit.
// As the name says, v is expected to be non-zero.
inline uint32 tzcnt_nonzero(uint32 v)
{
#ifndef _MSC_VER
	const int trailingZeroBits = __builtin_ctz(v);
	return trailingZeroBits;
#else
	// _BitScanForward writes the index of the lowest set bit
	unsigned long lowestBitIndex;
	_BitScanForward(&lowestBitIndex, v);
	return lowestBitIndex;
#endif
}
|
||||
|
||||
namespace SmallFloat
|
||||
{
|
||||
static constexpr uint32 MANTISSA_BITS = 3;
|
||||
static constexpr uint32 MANTISSA_VALUE = 1 << MANTISSA_BITS;
|
||||
static constexpr uint32 MANTISSA_MASK = MANTISSA_VALUE - 1;
|
||||
|
||||
// Bin sizes follow floating point (exponent + mantissa) distribution (piecewise linear log approx)
|
||||
// This ensures that for each size class, the average overhead percentage stays the same
|
||||
uint32 uintToFloatRoundUp(uint32 size)
|
||||
{
|
||||
uint32 exp = 0;
|
||||
uint32 mantissa = 0;
|
||||
|
||||
if (size < MANTISSA_VALUE)
|
||||
{
|
||||
// Denorm: 0..(MANTISSA_VALUE-1)
|
||||
mantissa = size;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Normalized: Hidden high bit always 1. Not stored. Just like float.
|
||||
uint32 leadingZeros = lzcnt_nonzero(size);
|
||||
uint32 highestSetBit = 31 - leadingZeros;
|
||||
|
||||
uint32 mantissaStartBit = highestSetBit - MANTISSA_BITS;
|
||||
exp = mantissaStartBit + 1;
|
||||
mantissa = (size >> mantissaStartBit) & MANTISSA_MASK;
|
||||
|
||||
uint32 lowBitsMask = (1 << mantissaStartBit) - 1;
|
||||
|
||||
// Round up!
|
||||
if ((size & lowBitsMask) != 0)
|
||||
mantissa++;
|
||||
}
|
||||
|
||||
return (exp << MANTISSA_BITS) + mantissa; // + allows mantissa->exp overflow for round up
|
||||
}
|
||||
|
||||
uint32 uintToFloatRoundDown(uint32 size)
|
||||
{
|
||||
uint32 exp = 0;
|
||||
uint32 mantissa = 0;
|
||||
|
||||
if (size < MANTISSA_VALUE)
|
||||
{
|
||||
// Denorm: 0..(MANTISSA_VALUE-1)
|
||||
mantissa = size;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Normalized: Hidden high bit always 1. Not stored. Just like float.
|
||||
uint32 leadingZeros = lzcnt_nonzero(size);
|
||||
uint32 highestSetBit = 31 - leadingZeros;
|
||||
|
||||
uint32 mantissaStartBit = highestSetBit - MANTISSA_BITS;
|
||||
exp = mantissaStartBit + 1;
|
||||
mantissa = (size >> mantissaStartBit) & MANTISSA_MASK;
|
||||
}
|
||||
|
||||
return (exp << MANTISSA_BITS) | mantissa;
|
||||
}
|
||||
|
||||
uint32 floatToUint(uint32 floatValue)
|
||||
{
|
||||
uint32 exponent = floatValue >> MANTISSA_BITS;
|
||||
uint32 mantissa = floatValue & MANTISSA_MASK;
|
||||
if (exponent == 0)
|
||||
{
|
||||
// Denorms
|
||||
return mantissa;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (mantissa | MANTISSA_VALUE) << (exponent - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Utility functions
|
||||
// Returns the index of the lowest set bit of bitMask whose index is >= startBitIndex,
// or Allocation::NO_SPACE when no such bit is set.
uint32 findLowestSetBitAfter(uint32 bitMask, uint32 startBitIndex)
{
	// A 32 bit mask has no bits at index 32 or above, and shifting by that much is undefined
	if (startBitIndex >= 32) return Allocation::NO_SPACE;

	// Unsigned arithmetic: the signed form (1 << 31) - 1 overflows int
	const uint32 maskBeforeStartIndex = (uint32(1) << startBitIndex) - 1;
	const uint32 maskAfterStartIndex = ~maskBeforeStartIndex;
	const uint32 bitsAfter = bitMask & maskAfterStartIndex;
	if (bitsAfter == 0) return Allocation::NO_SPACE;
	return tzcnt_nonzero(bitsAfter);
}
|
||||
|
||||
// Allocator...
|
||||
// Configures the allocator to manage `size` units with at most `maxAllocs` bookkeeping nodes,
// then puts it into its initial state through reset().
void Allocator::init(uint32 size, uint32 maxAllocs)
{
	m_size = size;
	m_maxAllocs = maxAllocs;

	// reset() resizes both containers to maxAllocs, the capacity is requested up front
	m_nodes.reserve(maxAllocs);
	m_freeNodes.reserve(maxAllocs);

	// 16 bit node indices cannot address more than 65536 nodes
	if (sizeof(NodeIndex) == 2)
	{
		ASSERT(maxAllocs <= 65536);
	}

	reset();
}
|
||||
|
||||
void Allocator::reset()
|
||||
{
|
||||
m_freeStorage = 0;
|
||||
m_usedBinsTop = 0;
|
||||
m_freeOffset = m_maxAllocs - 1;
|
||||
|
||||
for (uint32 i = 0 ; i < NUM_TOP_BINS; i++)
|
||||
m_usedBins[i] = 0;
|
||||
|
||||
for (uint32 i = 0 ; i < NUM_LEAF_BINS; i++)
|
||||
m_binIndices[i] = Node::unused;
|
||||
|
||||
m_nodes.clear();
|
||||
m_freeNodes.clear();
|
||||
|
||||
m_nodes.resize(m_maxAllocs);
|
||||
m_freeNodes.resize(m_maxAllocs);
|
||||
|
||||
// Freelist is a stack. Nodes in inverse order so that [0] pops first.
|
||||
for (uint32 i = 0; i < m_maxAllocs; i++)
|
||||
{
|
||||
m_freeNodes[i] = m_maxAllocs - i - 1;
|
||||
}
|
||||
|
||||
// Start state: Whole storage as one big node
|
||||
// Algorithm will split remainders and push them back as smaller nodes
|
||||
insertNodeIntoBin(m_size, 0);
|
||||
}
|
||||
|
||||
// Reserves `size` units from the smallest free region that is guaranteed to fit them.
// Returns an Allocation whose offset and metadata are both Allocation::NO_SPACE on failure.
Allocation Allocator::allocate(uint32 size)
{
	// Out of bookkeeping nodes? A default constructed Allocation has both fields set to NO_SPACE
	if (m_freeOffset == 0)
	{
		return Allocation();
	}

	// Rounding up gives the lowest bin in which every node is large enough for the request
	const uint32 lowestFittingBin = SmallFloat::uintToFloatRoundUp(size);
	const uint32 lowestFittingTopBin = lowestFittingBin >> TOP_BINS_INDEX_SHIFT;
	const uint32 lowestFittingLeafBin = lowestFittingBin & LEAF_BINS_INDEX_MASK;

	uint32 topBinIndex = lowestFittingTopBin;
	uint32 leafBinIndex = Allocation::NO_SPACE;

	// First choice: a leaf bin inside the same top bin. This search can fail (NO_SPACE).
	const bool topBinOccupied = (m_usedBinsTop & (1 << topBinIndex)) != 0;
	if (topBinOccupied)
	{
		leafBinIndex = findLowestSetBitAfter(m_usedBins[topBinIndex], lowestFittingLeafBin);
	}

	// Second choice: the first occupied top bin above it
	if (leafBinIndex == Allocation::NO_SPACE)
	{
		topBinIndex = findLowestSetBitAfter(m_usedBinsTop, lowestFittingTopBin + 1);

		// Nothing large enough is free
		if (topBinIndex == Allocation::NO_SPACE)
		{
			return Allocation();
		}

		// Every leaf of a higher top bin fits, so take the lowest one.
		// A set top bit implies at least one set leaf bit, so the mask is non-zero here.
		leafBinIndex = tzcnt_nonzero(m_usedBins[topBinIndex]);
	}

	const uint32 binIndex = (topBinIndex << TOP_BINS_INDEX_SHIFT) | leafBinIndex;

	// Pop the head node of the bin; its successor becomes the new head
	const uint32 nodeIndex = m_binIndices[binIndex];
	Node& node = m_nodes[nodeIndex];
	const uint32 nodeTotalSize = node.dataSize;
	node.dataSize = size;
	node.used = true;
	const uint32 nextInBin = node.binListNext;
	m_binIndices[binIndex] = nextInBin;
	if (nextInBin != Node::unused) m_nodes[nextInBin].binListPrev = Node::unused;
	m_freeStorage -= nodeTotalSize;
#ifdef DEBUG_VERBOSE
	printf("Free storage: %u (-%u) (allocate)\n", m_freeStorage, nodeTotalSize);
#endif

	// The bin ran empty: clear its leaf bit, and the top bit too when no leaf is left
	if (m_binIndices[binIndex] == Node::unused)
	{
		m_usedBins[topBinIndex] &= ~(1 << leafBinIndex);

		if (m_usedBins[topBinIndex] == 0)
		{
			m_usedBinsTop &= ~(1 << topBinIndex);
		}
	}

	// Whatever the request did not consume goes back into a (lower) bin as a new free node
	const uint32 remainderSize = nodeTotalSize - size;
	if (remainderSize > 0)
	{
		const uint32 remainderNodeIndex = insertNodeIntoBin(remainderSize, node.dataOffset + size);

		// Splice the remainder between this node and its old next neighbor,
		// so that contiguous free nodes can be merged again in free()
		const uint32 oldNeighborNext = node.neighborNext;
		if (oldNeighborNext != Node::unused) m_nodes[oldNeighborNext].neighborPrev = remainderNodeIndex;
		m_nodes[remainderNodeIndex].neighborPrev = nodeIndex;
		m_nodes[remainderNodeIndex].neighborNext = oldNeighborNext;
		node.neighborNext = remainderNodeIndex;
	}

	Allocation result;
	result.offset = node.dataOffset;
	result.metadata = nodeIndex;
	return result;
}
|
||||
|
||||
// Returns an allocation to the allocator and merges it with contiguous free neighbors.
// Handles that do not refer to a live allocation (NO_SPACE, out of range index, or a node
// that is not marked used) trigger ASSERT in DEBUG builds and are ignored otherwise.
void Allocator::free(Allocation allocation)
{
	ASSERT(allocation.metadata != Allocation::NO_SPACE);
	if (m_nodes.empty()) return;

	// ASSERT compiles to nothing outside DEBUG builds, so reject handles that cannot be
	// indexed instead of reading past the node storage (allocationSize() is just as lenient)
	if (allocation.metadata == Allocation::NO_SPACE) return;
	uint32 nodeIndex = allocation.metadata;
	if (nodeIndex >= m_nodes.size()) return;

	Node& node = m_nodes[nodeIndex];

	// Double delete check
	ASSERT(node.used == true);
	if (!node.used) return;

	// Merge with neighbors...
	uint32 offset = node.dataOffset;
	uint32 size = node.dataSize;

	if ((node.neighborPrev != Node::unused) && (m_nodes[node.neighborPrev].used == false))
	{
		// Previous (contiguous) free node: Change offset to previous node offset. Sum sizes
		Node& prevNode = m_nodes[node.neighborPrev];
		offset = prevNode.dataOffset;
		size += prevNode.dataSize;

		// Remove node from the bin linked list and put it in the freelist
		removeNodeFromBin(node.neighborPrev);

		ASSERT(prevNode.neighborNext == nodeIndex);
		node.neighborPrev = prevNode.neighborPrev;
	}

	if ((node.neighborNext != Node::unused) && (m_nodes[node.neighborNext].used == false))
	{
		// Next (contiguous) free node: Offset remains the same. Sum sizes.
		Node& nextNode = m_nodes[node.neighborNext];
		size += nextNode.dataSize;

		// Remove node from the bin linked list and put it in the freelist
		removeNodeFromBin(node.neighborNext);

		ASSERT(nextNode.neighborPrev == nodeIndex);
		node.neighborNext = nextNode.neighborNext;
	}

	// Remember the outer neighbors before the node is handed back to the freelist
	uint32 neighborNext = node.neighborNext;
	uint32 neighborPrev = node.neighborPrev;

	// Insert the removed node to freelist
#ifdef DEBUG_VERBOSE
	printf("Putting node %u into freelist[%u] (free)\n", nodeIndex, m_freeOffset + 1);
#endif
	m_freeNodes[++m_freeOffset] = nodeIndex;

	// Insert the (combined) free node to bin
	uint32 combinedNodeIndex = insertNodeIntoBin(size, offset);

	// Connect neighbors with the new combined node
	if (neighborNext != Node::unused)
	{
		m_nodes[combinedNodeIndex].neighborNext = neighborNext;
		m_nodes[neighborNext].neighborPrev = combinedNodeIndex;
	}
	if (neighborPrev != Node::unused)
	{
		m_nodes[combinedNodeIndex].neighborPrev = neighborPrev;
		m_nodes[neighborPrev].neighborNext = combinedNodeIndex;
	}
}
|
||||
|
||||
// Takes a node from the freelist, makes it describe the free region [dataOffset, dataOffset + size)
// and pushes it on top of the bin that matches its size.
// Returns the index of that node. Neighbor links are left unset; the caller connects them.
uint32 Allocator::insertNodeIntoBin(uint32 size, uint32 dataOffset)
{
	// Round down to bin index to ensure that bin >= alloc
	uint32 binIndex = SmallFloat::uintToFloatRoundDown(size);

	uint32 topBinIndex = binIndex >> TOP_BINS_INDEX_SHIFT;
	uint32 leafBinIndex = binIndex & LEAF_BINS_INDEX_MASK;

	// Bin was empty before?
	if (m_binIndices[binIndex] == Node::unused)
	{
		// Set bin mask bits
		m_usedBins[topBinIndex] |= 1 << leafBinIndex;
		m_usedBinsTop |= 1 << topBinIndex;
	}

	// Take a freelist node and insert on top of the bin linked list (next = old top)
	uint32 topNodeIndex = m_binIndices[binIndex];
	uint32 nodeIndex = m_freeNodes[m_freeOffset--];
#ifdef DEBUG_VERBOSE
	printf("Getting node %u from freelist[%u]\n", nodeIndex, m_freeOffset + 1);
#endif

	// Freelist nodes are recycled, so wipe whatever the previous user left behind.
	// A stale "used" flag would stop free() from ever merging neighbors with this region,
	// and a stale binListPrev would make removeNodeFromBin() treat this bin head as a
	// mid-list node and corrupt the bin list.
	Node& node = m_nodes[nodeIndex];
	node = Node();
	node.dataOffset = dataOffset;
	node.dataSize = size;
	node.binListNext = topNodeIndex;
	if (topNodeIndex != Node::unused) m_nodes[topNodeIndex].binListPrev = nodeIndex;
	m_binIndices[binIndex] = nodeIndex;

	m_freeStorage += size;
#ifdef DEBUG_VERBOSE
	printf("Free storage: %u (+%u) (insertNodeIntoBin)\n", m_freeStorage, size);
#endif

	return nodeIndex;
}
|
||||
|
||||
// Unlinks a free node from the bin list it is stored in, hands the node back to the freelist
// and subtracts its size from the free storage counter.
void Allocator::removeNodeFromBin(uint32 nodeIndex)
{
	Node& removed = m_nodes[nodeIndex];
	const bool isBinHead = (removed.binListPrev == Node::unused);

	if (!isBinHead)
	{
		// Somewhere behind the head: link the predecessor and the successor to each other
		m_nodes[removed.binListPrev].binListNext = removed.binListNext;
		if (removed.binListNext != Node::unused)
		{
			m_nodes[removed.binListNext].binListPrev = removed.binListPrev;
		}
	}
	else
	{
		// Head of a bin: the bin is not stored in the node, so derive it from the size.
		// Rounding down is the same mapping insertNodeIntoBin() used to pick the bin.
		const uint32 binIndex = SmallFloat::uintToFloatRoundDown(removed.dataSize);
		const uint32 topBinIndex = binIndex >> TOP_BINS_INDEX_SHIFT;
		const uint32 leafBinIndex = binIndex & LEAF_BINS_INDEX_MASK;

		// The successor (if any) becomes the new head
		m_binIndices[binIndex] = removed.binListNext;
		if (removed.binListNext != Node::unused)
		{
			m_nodes[removed.binListNext].binListPrev = Node::unused;
		}

		// The bin ran empty: clear its leaf bit, and the top bit too when no leaf is left
		if (m_binIndices[binIndex] == Node::unused)
		{
			m_usedBins[topBinIndex] &= ~(1 << leafBinIndex);

			if (m_usedBins[topBinIndex] == 0)
			{
				m_usedBinsTop &= ~(1 << topBinIndex);
			}
		}
	}

	// Push the node back on the freelist stack
#ifdef DEBUG_VERBOSE
	printf("Putting node %u into freelist[%u] (removeNodeFromBin)\n", nodeIndex, m_freeOffset + 1);
#endif
	m_freeNodes[++m_freeOffset] = nodeIndex;

	m_freeStorage -= removed.dataSize;
#ifdef DEBUG_VERBOSE
	printf("Free storage: %u (-%u) (removeNodeFromBin)\n", m_freeStorage, removed.dataSize);
#endif
}
|
||||
|
||||
// Returns the size stored in the node of the allocation,
// or 0 for a NO_SPACE allocation or when the allocator has no nodes.
uint32 Allocator::allocationSize(Allocation allocation) const
{
	const bool noNode = (allocation.metadata == Allocation::NO_SPACE) || m_nodes.empty();
	if (noNode)
	{
		return 0;
	}

	return m_nodes[allocation.metadata].dataSize;
}
|
||||
|
||||
// Summarizes the free storage: the total amount, and the size class of the largest free region.
// Both values are reported as 0 when the allocator is out of bookkeeping nodes.
StorageReport Allocator::storageReport() const
{
	StorageReport report; // both fields start at 0

	// Out of allocations? -> Zero free space
	if (m_freeOffset == 0)
	{
		return report;
	}

	report.totalFreeSpace = m_freeStorage;

	if (m_usedBinsTop != 0)
	{
		// The highest occupied leaf of the highest occupied top bin holds the largest region
		const uint32 topBinIndex = 31 - lzcnt_nonzero(m_usedBinsTop);
		const uint32 leafBinIndex = 31 - lzcnt_nonzero(m_usedBins[topBinIndex]);
		report.largestFreeRegion = SmallFloat::floatToUint((topBinIndex << TOP_BINS_INDEX_SHIFT) | leafBinIndex);
		ASSERT(report.totalFreeSpace >= report.largestFreeRegion);
	}

	return report;
}
|
||||
|
||||
// Reports, for every bin, the size that the bin index stands for and the number of free nodes in it.
StorageReportFull Allocator::storageReportFull() const
{
	StorageReportFull report;

	for (uint32 bin = 0; bin < NUM_LEAF_BINS; ++bin)
	{
		// Walk the linked list of the bin to count its nodes
		uint32 nodesInBin = 0;
		for (uint32 nodeIndex = m_binIndices[bin]; nodeIndex != Node::unused; nodeIndex = m_nodes[nodeIndex].binListNext)
		{
			++nodesInBin;
		}

		StorageReportFull::Region& region = report.freeRegions[bin];
		region.size = SmallFloat::floatToUint(bin);
		region.count = nodesInBin;
	}

	return report;
}
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
#pragma once
|
||||
// (C) Sebastian Aaltonen 2023
|
||||
// MIT License (see file: LICENSE)
|
||||
|
||||
// Modified for Wicked Engine
|
||||
// - removed cpp20 features
|
||||
// - removed constructors
|
||||
// - changed node storage to std::vector
|
||||
// - reduced size of Node structure
|
||||
|
||||
//#define USE_16_BIT_NODE_INDICES
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace OffsetAllocator
{
	// Short names for the unsigned integer types used by the allocator
	using uint8 = unsigned char;
	using uint16 = unsigned short;
	using uint32 = unsigned int;

	// 16 bit node index mode halves the metadata storage cost,
	// but it only supports up to 65536 allocations
#ifdef USE_16_BIT_NODE_INDICES
	using NodeIndex = uint16;
	static constexpr uint32 default_maxallocations = 64 * 1024;
#else
	using NodeIndex = uint32;
	static constexpr uint32 default_maxallocations = 128 * 1024;
#endif

	// Two level bin layout: a bin index is (top bin << TOP_BINS_INDEX_SHIFT) | leaf bin
	static constexpr uint32 NUM_TOP_BINS = 32;
	static constexpr uint32 BINS_PER_LEAF = 8;
	static constexpr uint32 TOP_BINS_INDEX_SHIFT = 3;
	static constexpr uint32 LEAF_BINS_INDEX_MASK = 0x7;
	static constexpr uint32 NUM_LEAF_BINS = NUM_TOP_BINS * BINS_PER_LEAF;

	// Handle returned by Allocator::allocate()
	struct Allocation
	{
		// Marks a failed allocation, in both fields
		static constexpr uint32 NO_SPACE = 0xffffffff;

		uint32 offset = NO_SPACE;		// start of the allocated range
		NodeIndex metadata = NO_SPACE;	// internal: node index
	};

	// Result of Allocator::storageReport()
	struct StorageReport
	{
		uint32 totalFreeSpace = 0;
		uint32 largestFreeRegion = 0;
	};

	// Result of Allocator::storageReportFull(): one entry per bin
	struct StorageReportFull
	{
		struct Region
		{
			uint32 size = 0;
			uint32 count = 0;
		};

		Region freeRegions[NUM_LEAF_BINS];
	};

	class Allocator
	{
	public:
		void init(uint32 size, uint32 maxAllocs = default_maxallocations);
		void reset();

		Allocation allocate(uint32 size);
		void free(Allocation allocation);

		uint32 allocationSize(Allocation allocation) const;
		StorageReport storageReport() const;
		StorageReportFull storageReportFull() const;

	private:
		uint32 insertNodeIntoBin(uint32 size, uint32 dataOffset);
		void removeNodeFromBin(uint32 nodeIndex);

		// Describes one contiguous range, either allocated or free
		struct Node
		{
			// Marks a missing link
			static constexpr NodeIndex unused = 0xffffffff;

			uint32 dataOffset : 32;
			uint32 dataSize : 31;
			uint32 used : 1;			// shares a 32 bit word with dataSize
			NodeIndex binListPrev : 32;	// links inside the bin that stores a free node
			NodeIndex binListNext : 32;
			NodeIndex neighborPrev : 32;	// links to the ranges right before and after this one
			NodeIndex neighborNext : 32;

			// Starts out as an empty, unlinked, unused node
			Node() :
				dataOffset(0),
				dataSize(0),
				used(0),
				binListPrev(unused),
				binListNext(unused),
				neighborPrev(unused),
				neighborNext(unused)
			{
			}
		};

		uint32 m_size = 0;
		uint32 m_maxAllocs = 0;
		uint32 m_freeStorage = 0;

		uint32 m_usedBinsTop = 0;					// one bit per top bin that has any occupied leaf
		uint8 m_usedBins[NUM_TOP_BINS] = {};		// one bit per occupied leaf bin of a top bin
		NodeIndex m_binIndices[NUM_LEAF_BINS] = {};	// first node of every bin

		std::vector<Node> m_nodes;
		std::vector<NodeIndex> m_freeNodes;			// stack of node indices that are not in use
		uint32 m_freeOffset = 0;					// index of the top of the m_freeNodes stack
	};
}
|
||||
@@ -314,6 +314,7 @@
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\lodepng.h" />
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\meshoptimizer.h" />
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\minimp4.h" />
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.hpp" />
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\pugiconfig.hpp" />
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\pugixml.hpp" />
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\WinAdapter.h" />
|
||||
@@ -610,6 +611,7 @@
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\vertexfilter.cpp" />
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\vfetchanalyzer.cpp" />
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)Utility\meshoptimizer\vfetchoptimizer.cpp" />
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.cpp" />
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)Utility\pugixml.cpp" />
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)wiAsync_BindLua.cpp" />
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)wiConfig.cpp" />
|
||||
|
||||
@@ -1389,6 +1389,9 @@
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Jolt\RegisterTypes.h">
|
||||
<Filter>JOLT</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.hpp">
|
||||
<Filter>UTILITY</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)LUA\lapi.c">
|
||||
@@ -2189,6 +2192,9 @@
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)Jolt\RegisterTypes.cpp">
|
||||
<Filter>JOLT</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="$(MSBuildThisFileDirectory)Utility\offsetAllocator.cpp">
|
||||
<Filter>UTILITY</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="$(MSBuildThisFileDirectory)Utility\DirectXCollision.inl">
|
||||
|
||||
@@ -2,11 +2,18 @@
|
||||
#include "CommonInclude.h"
|
||||
#include "wiVector.h"
|
||||
|
||||
#include "Utility/offsetAllocator.hpp"
|
||||
|
||||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
#include <deque>
|
||||
|
||||
namespace wi::allocator
|
||||
{
|
||||
// Allocation of consecutive bytes, but no freeing, instead the whole allocator can be reset
|
||||
struct LinearAllocator
|
||||
{
|
||||
uint8_t* data = nullptr;
|
||||
@@ -38,6 +45,7 @@ namespace wi::allocator
|
||||
}
|
||||
};
|
||||
|
||||
// Allocation and freeing of single elements of the same size
|
||||
template<typename T, size_t block_size = 256>
|
||||
struct BlockAllocator
|
||||
{
|
||||
@@ -71,5 +79,174 @@ namespace wi::allocator
|
||||
ptr->~T();
|
||||
free_list.push_back(ptr);
|
||||
}
|
||||
|
||||
inline bool is_empty() const
|
||||
{
|
||||
return (blocks.size() * block_size) == free_list.size();
|
||||
}
|
||||
};
|
||||
|
||||
// Allocation and freeing of an arbitrary number of bytes, managed in pages of the same size
|
||||
// - this is a wrapper around OffsetAllocator that adds thread safety and refcounting
|
||||
// - also supports deferred release for suballocated GPU resources
|
||||
struct PageAllocator
|
||||
{
|
||||
uint32_t page_count = 0;
|
||||
uint32_t page_size = 0;
|
||||
struct AllocationInternal
|
||||
{
|
||||
std::atomic<int> refcount{ 0 };
|
||||
OffsetAllocator::Allocation allocation;
|
||||
};
|
||||
struct AllocatorInternal
|
||||
{
|
||||
std::mutex locker;
|
||||
OffsetAllocator::Allocator allocator;
|
||||
BlockAllocator<AllocationInternal> internal_blocks;
|
||||
bool deferred_release_enabled = false;
|
||||
uint64_t deferred_release_frame = 0;
|
||||
std::deque<std::pair<OffsetAllocator::Allocation, uint64_t>> deferred_release_queue;
|
||||
};
|
||||
std::shared_ptr<AllocatorInternal> allocator; // shared ptr is used to let any allocations extend the lifeftime of the allocator
|
||||
|
||||
// Returns the total size that the allocator manages:
|
||||
constexpr uint64_t total_size_in_bytes() const { return uint64_t(page_count) * uint64_t(page_size); }
|
||||
|
||||
// Calculates the page count that will accomodate an allocation size request
|
||||
constexpr uint32_t page_count_from_bytes(uint64_t sizeInBytes) const { return uint32_t(align((uint64_t)sizeInBytes, (uint64_t)page_size) / (uint64_t)page_size); }
|
||||
|
||||
// Initializes the allocator, only after which it can be used
|
||||
// total_size_in_bytes : the allocator will manage this number of bytes
|
||||
// page_size : the allocation granularity in bytes, each allocation will be aligned to this
|
||||
// deferred_release : if false, allocations are freed immediately (suitable for CPU only allocations), otherwise they are freed after a number of frames passed (which should be used for GPU allocations)
|
||||
void init(uint64_t total_size_in_bytes, uint32_t page_size = 64u * 1024u, bool deferred_release = false)
|
||||
{
|
||||
this->page_size = page_size;
|
||||
this->page_count = page_count_from_bytes(total_size_in_bytes);
|
||||
allocator = std::make_shared<AllocatorInternal>();
|
||||
allocator->allocator.init(page_count, std::min(page_count, OffsetAllocator::default_maxallocations));
|
||||
allocator->deferred_release_enabled = deferred_release;
|
||||
allocator->deferred_release_frame = 0;
|
||||
allocator->deferred_release_queue.clear();
|
||||
}
|
||||
// This needs to be called every frame if deferred release is enabled:
|
||||
void update_deferred_release(uint64_t framecount, uint32_t buffercount)
|
||||
{
|
||||
if (allocator == nullptr)
|
||||
return;
|
||||
std::scoped_lock lck(allocator->locker);
|
||||
allocator->deferred_release_frame = framecount;
|
||||
while (!allocator->deferred_release_queue.empty() && allocator->deferred_release_queue.front().second + buffercount < framecount)
|
||||
{
|
||||
allocator->allocator.free(allocator->deferred_release_queue.front().first);
|
||||
allocator->deferred_release_queue.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
struct Allocation
|
||||
{
|
||||
std::shared_ptr<AllocatorInternal> allocator; // the allocator is retained so that allocation can deallocate itself
|
||||
AllocationInternal* internal_state = nullptr; // this is pointing within the allocator which is retained by shared_ptr
|
||||
uint64_t byte_offset = ~0ull;
|
||||
|
||||
Allocation()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
Allocation(const Allocation& other)
|
||||
{
|
||||
Reset();
|
||||
allocator = other.allocator;
|
||||
internal_state = other.internal_state;
|
||||
byte_offset = other.byte_offset;
|
||||
if (internal_state != nullptr)
|
||||
{
|
||||
internal_state->refcount.fetch_add(1);
|
||||
}
|
||||
}
|
||||
Allocation(Allocation&& other) noexcept
|
||||
{
|
||||
Reset();
|
||||
allocator = std::move(other.allocator);
|
||||
internal_state = other.internal_state;
|
||||
byte_offset = other.byte_offset;
|
||||
other.allocator = nullptr;
|
||||
other.internal_state = nullptr;
|
||||
other.byte_offset = ~0ull;
|
||||
}
|
||||
~Allocation()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
void operator=(const Allocation& other)
|
||||
{
|
||||
Reset();
|
||||
allocator = other.allocator;
|
||||
internal_state = other.internal_state;
|
||||
byte_offset = other.byte_offset;
|
||||
if (internal_state != nullptr)
|
||||
{
|
||||
internal_state->refcount.fetch_add(1);
|
||||
}
|
||||
}
|
||||
void operator=(Allocation&& other) noexcept
|
||||
{
|
||||
Reset();
|
||||
allocator = std::move(other.allocator);
|
||||
internal_state = other.internal_state;
|
||||
byte_offset = other.byte_offset;
|
||||
other.allocator = nullptr;
|
||||
other.internal_state = nullptr;
|
||||
other.byte_offset = ~0ull;
|
||||
}
|
||||
void Reset()
|
||||
{
|
||||
if (IsValid() && (internal_state->refcount.fetch_sub(1) <= 1))
|
||||
{
|
||||
std::scoped_lock lck(allocator->locker);
|
||||
if (allocator->deferred_release_enabled)
|
||||
{
|
||||
// can only be reclaimed after buffering amount of frames passed, this is usually used for GPU resources:
|
||||
allocator->deferred_release_queue.push_back(std::make_pair(internal_state->allocation, allocator->deferred_release_frame));
|
||||
}
|
||||
else
|
||||
{
|
||||
// reclaimed immediately:
|
||||
allocator->allocator.free(internal_state->allocation);
|
||||
}
|
||||
allocator->internal_blocks.free(internal_state);
|
||||
}
|
||||
allocator = {};
|
||||
internal_state = nullptr;
|
||||
byte_offset = ~0ull;
|
||||
}
|
||||
|
||||
constexpr bool IsValid() const { return internal_state != nullptr; }
|
||||
};
|
||||
|
||||
// Allocates a reference counted allocation, viewing at least the requested amount of bytes
|
||||
// To check if the allocation succeeded, call IsValid() on the returned object
|
||||
inline Allocation allocate(size_t sizeInBytes)
|
||||
{
|
||||
const uint32_t pages = page_count_from_bytes(sizeInBytes);
|
||||
std::scoped_lock lck(allocator->locker);
|
||||
OffsetAllocator::Allocation offsetallocation = allocator->allocator.allocate(pages);
|
||||
Allocation alloc;
|
||||
if (offsetallocation.offset != OffsetAllocator::Allocation::NO_SPACE)
|
||||
{
|
||||
alloc.allocator = allocator;
|
||||
alloc.internal_state = allocator->internal_blocks.allocate();
|
||||
alloc.internal_state->refcount.store(1);
|
||||
alloc.internal_state->allocation = offsetallocation;
|
||||
alloc.byte_offset = offsetallocation.offset * page_size;
|
||||
}
|
||||
return alloc;
|
||||
}
|
||||
|
||||
// returns true if no pages are allocated
|
||||
inline bool is_empty()
|
||||
{
|
||||
return allocator->allocator.storageReport().totalFreeSpace == page_count;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@@ -318,6 +318,7 @@ namespace wi
|
||||
wi::input::ClearForNextFrame();
|
||||
wi::profiler::EndFrame(cmd);
|
||||
graphicsDevice->SubmitCommandLists();
|
||||
wi::renderer::UpdateGPUSuballocator();
|
||||
}
|
||||
|
||||
void Application::Update(float dt)
|
||||
|
||||
@@ -250,14 +250,14 @@ namespace wi::graphics
|
||||
return CreateBuffer2(desc, [&](void* dest) { std::memcpy(dest, initial_data, desc->size); }, buffer, alias, alias_offset);
|
||||
}
|
||||
|
||||
bool CreateBufferCleared(const GPUBufferDesc* desc, uint8_t value, GPUBuffer* buffer) const
|
||||
bool CreateBufferCleared(const GPUBufferDesc* desc, uint8_t value, GPUBuffer* buffer, const GPUResource* alias = nullptr, uint64_t alias_offset = 0ull) const
|
||||
{
|
||||
return CreateBuffer2(desc, [&](void* dest) { std::memset(dest, value, desc->size); }, buffer);
|
||||
return CreateBuffer2(desc, [&](void* dest) { std::memset(dest, value, desc->size); }, buffer, alias, alias_offset);
|
||||
}
|
||||
|
||||
bool CreateBufferZeroed(const GPUBufferDesc* desc, GPUBuffer* buffer) const
|
||||
bool CreateBufferZeroed(const GPUBufferDesc* desc, GPUBuffer* buffer, const GPUResource* alias = nullptr, uint64_t alias_offset = 0ull) const
|
||||
{
|
||||
return CreateBufferCleared(desc, 0, buffer);
|
||||
return CreateBufferCleared(desc, 0, buffer, alias, alias_offset);
|
||||
}
|
||||
|
||||
void Barrier(const GPUBarrier& barrier, CommandList cmd)
|
||||
|
||||
@@ -2422,6 +2422,7 @@ std::mutex queue_locker;
|
||||
|
||||
disabledMessages.push_back(D3D12_MESSAGE_ID_DRAW_EMPTY_SCISSOR_RECTANGLE);
|
||||
disabledMessages.push_back(D3D12_MESSAGE_ID_SETPRIVATEDATA_CHANGINGPARAMS);
|
||||
disabledMessages.push_back(D3D12_MESSAGE_ID_HEAP_ADDRESS_RANGE_INTERSECTS_MULTIPLE_BUFFERS);
|
||||
|
||||
D3D12_INFO_QUEUE_FILTER filter = {};
|
||||
filter.AllowList.NumSeverities = static_cast<UINT>(enabledSeverities.size());
|
||||
|
||||
@@ -349,6 +349,18 @@ namespace wi::graphics
|
||||
{
|
||||
alignment = std::max(alignment, 16ull);
|
||||
}
|
||||
if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_BUFFER))
|
||||
{
|
||||
alignment = std::max(alignment, (uint64_t)D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
|
||||
}
|
||||
if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_NON_RT_DS))
|
||||
{
|
||||
alignment = std::max(alignment, (uint64_t)D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
|
||||
}
|
||||
if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_RT_DS))
|
||||
{
|
||||
alignment = std::max(alignment, (uint64_t)D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
|
||||
}
|
||||
return alignment;
|
||||
}
|
||||
|
||||
|
||||
@@ -3750,12 +3750,6 @@ using namespace vulkan_internal;
|
||||
}
|
||||
bool GraphicsDevice_Vulkan::CreateBuffer2(const GPUBufferDesc* desc, const std::function<void(void*)>& init_callback, GPUBuffer* buffer, const GPUResource* alias, uint64_t alias_offset) const
|
||||
{
|
||||
#ifdef PLATFORM_LINUX
|
||||
// Resource aliasing on Linux sometimes fails with VK_ERROR_UNKOWN so I disable it:
|
||||
alias = nullptr;
|
||||
alias_offset = 0;
|
||||
#endif // PLATFORM_LINUX
|
||||
|
||||
auto internal_state = std::make_shared<Buffer_Vulkan>();
|
||||
internal_state->allocationhandler = allocationhandler;
|
||||
buffer->internal_state = internal_state;
|
||||
@@ -3854,6 +3848,10 @@ using namespace vulkan_internal;
|
||||
{
|
||||
VkMemoryRequirements memory_requirements = {};
|
||||
memory_requirements.alignment = desc->alignment;
|
||||
if (memory_requirements.alignment == 0)
|
||||
{
|
||||
memory_requirements.alignment = GetMinOffsetAlignment(desc);
|
||||
}
|
||||
memory_requirements.size = AlignTo(desc->size, memory_requirements.alignment);
|
||||
memory_requirements.memoryTypeBits = ~0u;
|
||||
|
||||
|
||||
@@ -473,6 +473,10 @@ namespace wi::graphics
|
||||
{
|
||||
alignment = std::max(alignment, properties2.properties.limits.minTexelBufferOffsetAlignment);
|
||||
}
|
||||
if (has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_BUFFER) || has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_NON_RT_DS) || has_flag(desc->misc_flags, ResourceMiscFlag::ALIASING_TEXTURE_RT_DS))
|
||||
{
|
||||
alignment = std::max(alignment, uint64_t(64 * 1024)); // 64KB safety to match DX12, because cannot use vkGetBufferMemoryRequirements here
|
||||
}
|
||||
return alignment;
|
||||
}
|
||||
|
||||
|
||||
@@ -920,12 +920,6 @@ namespace wi
|
||||
wi::renderer::UpdateRaytracingAccelerationStructures(*scene, cmd);
|
||||
}
|
||||
|
||||
if (scene->weather.IsRealisticSky())
|
||||
{
|
||||
wi::renderer::ComputeSkyAtmosphereTextures(cmd);
|
||||
wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
|
||||
}
|
||||
|
||||
if (wi::renderer::GetSurfelGIEnabled())
|
||||
{
|
||||
wi::renderer::SurfelGI(
|
||||
@@ -1164,16 +1158,6 @@ namespace wi
|
||||
);
|
||||
}
|
||||
|
||||
if (scene->weather.IsRealisticSky())
|
||||
{
|
||||
wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
|
||||
|
||||
if (scene->weather.IsRealisticSkyAerialPerspective())
|
||||
{
|
||||
wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
|
||||
}
|
||||
}
|
||||
|
||||
if (scene->weather.IsVolumetricClouds() && !scene->weather.IsVolumetricCloudsReceiveShadow())
|
||||
{
|
||||
// When volumetric cloud DOESN'T receive shadow it can be done async to shadow maps!
|
||||
@@ -1305,17 +1289,6 @@ namespace wi
|
||||
cmd
|
||||
);
|
||||
|
||||
// Render SkyAtmosphere assets from planar reflections point of view
|
||||
if (scene->weather.IsRealisticSky())
|
||||
{
|
||||
wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
|
||||
|
||||
if (scene->weather.IsRealisticSkyAerialPerspective())
|
||||
{
|
||||
wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
|
||||
}
|
||||
}
|
||||
|
||||
device->EventBegin("Planar reflections Z-Prepass", cmd);
|
||||
auto range = wi::profiler::BeginRangeGPU("Planar Reflections Z-Prepass", cmd);
|
||||
|
||||
|
||||
+102
-6
@@ -2623,6 +2623,73 @@ const GPUBuffer& GetIndexBufferForQuads(uint32_t max_quad_count)
|
||||
return indexBufferForQuads32;
|
||||
}
|
||||
|
||||
// This is responsible to manage big chunks of GPUBuffer, each of which will be used for suballocations:
|
||||
struct GPUSubAllocator
|
||||
{
|
||||
static constexpr uint64_t blocksize = 256ull * 1024ull * 1024ull; // 256 MB
|
||||
struct Block
|
||||
{
|
||||
wi::allocator::PageAllocator allocator;
|
||||
GPUBuffer buffer;
|
||||
};
|
||||
wi::vector<Block> blocks;
|
||||
std::mutex locker;
|
||||
} static suballocator;
|
||||
// Suballocates `size` bytes from one of the global GPUSubAllocator blocks (thread-safe).
//	Returns a BufferSuballocation whose allocation is valid on success; the alias is then the
//	block buffer that contains the allocated range.
//	Returns an invalid (default) suballocation when:
//	- size is larger than half of the block size (such requests are never suballocated)
//	- a new block buffer could not be created
//	- the allocation could not be fulfilled even from a freshly created block
//	In all of these cases the caller is expected to fall back to a standalone buffer.
BufferSuballocation SuballocateGPUBuffer(uint64_t size)
{
	if (size > GPUSubAllocator::blocksize / 2)
		return {}; // invalid, larger allocations than half block size will not be suballocated

	// The lock is held for the whole function, so a newly created (still empty) block
	//	cannot be removed by UpdateGPUSuballocator before the allocation is made from it:
	std::scoped_lock lock(suballocator.locker);

	// See if any of the existing large blocks can fulfill the allocation request:
	BufferSuballocation allocation;
	for (auto& block : suballocator.blocks)
	{
		allocation.allocation = block.allocator.allocate(size);
		if (allocation.allocation.IsValid())
		{
			allocation.alias = block.buffer;
			//wilog("SuballocateGPUBuffer allocated size: %s, pages: %d, free space remaining: %s", wi::helper::GetMemorySizeText(size).c_str(), block.allocator.page_count_from_bytes(size), wi::helper::GetMemorySizeText(allocation.allocation.allocator->allocator.storageReport().totalFreeSpace * block.allocator.page_size).c_str());
			return allocation;
		}
	}

	// Allocation couldn't be fulfilled, create new block:
	GPUBufferDesc desc;
	desc.size = GPUSubAllocator::blocksize;
	desc.usage = Usage::DEFAULT;
	desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::VERTEX_BUFFER | BindFlag::INDEX_BUFFER;
	desc.misc_flags = ResourceMiscFlag::ALIASING_BUFFER | ResourceMiscFlag::NO_DEFAULT_DESCRIPTORS;
	desc.alignment = device->GetMinOffsetAlignment(&desc);

	auto& block = suballocator.blocks.emplace_back();
	bool success = device->CreateBuffer(&desc, nullptr, &block.buffer);
	assert(success);
	if (!success)
	{
		// Don't keep a block around that has no usable buffer behind it (for example out of memory):
		suballocator.blocks.pop_back();
		return {};
	}
	device->SetName(&block.buffer, "GPUSubAllocator");
	block.allocator.init(desc.size, (uint32_t)desc.alignment, true);
	wilog("SuballocateGPUBuffer created buffer block with size: %s, with page size: %s, page count: %d", wi::helper::GetMemorySizeText(block.allocator.total_size_in_bytes()).c_str(), wi::helper::GetMemorySizeText(block.allocator.page_size).c_str(), (int)block.allocator.page_count);

	// Allocate from the new block directly instead of retrying recursively:
	allocation.allocation = block.allocator.allocate(size);
	if (allocation.allocation.IsValid())
	{
		allocation.alias = block.buffer;
	}
	return allocation; // if this is still invalid, the caller falls back to a standalone buffer
}
|
||||
void UpdateGPUSuballocator()
|
||||
{
|
||||
std::scoped_lock lock(suballocator.locker);
|
||||
for (auto& block : suballocator.blocks)
|
||||
{
|
||||
block.allocator.update_deferred_release(device->GetFrameCount(), device->GetBufferCount());
|
||||
}
|
||||
for (size_t i = 0; i < suballocator.blocks.size(); ++i)
|
||||
{
|
||||
if (suballocator.blocks[i].allocator.is_empty())
|
||||
{
|
||||
suballocator.blocks.erase(suballocator.blocks.begin() + i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ModifyObjectSampler(const SamplerDesc& desc)
|
||||
{
|
||||
if (initialized.load())
|
||||
@@ -2961,7 +3028,8 @@ void RenderMeshes(
|
||||
uint32_t prev_stencilref = STENCILREF_DEFAULT;
|
||||
device->BindStencilRef(prev_stencilref, cmd);
|
||||
|
||||
const GPUBuffer* prev_ib = nullptr;
|
||||
IndexBufferFormat prev_ibformat = IndexBufferFormat::UINT16;
|
||||
const void* prev_ib_internal = nullptr;
|
||||
|
||||
// This will be called every time we start a new draw call:
|
||||
auto batch_flush = [&]()
|
||||
@@ -3092,10 +3160,16 @@ void RenderMeshes(
|
||||
device->BindStencilRef(stencilRef, cmd);
|
||||
}
|
||||
|
||||
if (!meshShaderPSO && prev_ib != &mesh.generalBuffer)
|
||||
// Note: the mesh.generalBuffer can be either a standalone allocated buffer, or a suballocated one (to reduce index buffer switching)
|
||||
const GPUBuffer* ib = mesh.generalBufferOffsetAllocation.IsValid() ? &mesh.generalBufferOffsetAllocationAlias : &mesh.generalBuffer;
|
||||
const IndexBufferFormat ibformat = mesh.GetIndexFormat();
|
||||
const void* ibinternal = ib->internal_state.get();
|
||||
|
||||
if (!meshShaderPSO && (prev_ib_internal != ibinternal || prev_ibformat != ibformat))
|
||||
{
|
||||
device->BindIndexBuffer(&mesh.generalBuffer, mesh.GetIndexFormat(), mesh.ib.offset, cmd);
|
||||
prev_ib = &mesh.generalBuffer;
|
||||
prev_ib_internal = ibinternal;
|
||||
prev_ibformat = ibformat;
|
||||
device->BindIndexBuffer(ib, ibformat, 0, cmd);
|
||||
}
|
||||
|
||||
if (
|
||||
@@ -3114,6 +3188,18 @@ void RenderMeshes(
|
||||
push.instances = instanceBufferDescriptorIndex;
|
||||
push.instance_offset = (uint)instancedBatch.dataOffset;
|
||||
|
||||
uint32_t indexOffset = 0;
|
||||
if (mesh.generalBufferOffsetAllocation.IsValid())
|
||||
{
|
||||
// In case the mesh general buffer is suballocated, the indexOffset is calculated relative to the beginning of the aliased buffer block:
|
||||
indexOffset = uint32_t(((uint64_t)mesh.generalBufferOffsetAllocation.byte_offset + mesh.ib.offset) / mesh.GetIndexStride()) + subset.indexOffset;
|
||||
}
|
||||
else
|
||||
{
|
||||
// In case the mesh general buffer is not suballocated, it is a standalone buffer and index offset is relative to itself
|
||||
indexOffset = uint32_t(mesh.ib.offset / mesh.GetIndexStride()) + subset.indexOffset;
|
||||
}
|
||||
|
||||
if (pso_backside != nullptr)
|
||||
{
|
||||
device->BindPipelineState(pso_backside, cmd);
|
||||
@@ -3124,7 +3210,7 @@ void RenderMeshes(
|
||||
}
|
||||
else
|
||||
{
|
||||
device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, subset.indexOffset, 0, 0, cmd);
|
||||
device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, indexOffset, 0, 0, cmd);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3136,7 +3222,7 @@ void RenderMeshes(
|
||||
}
|
||||
else
|
||||
{
|
||||
device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, subset.indexOffset, 0, 0, cmd);
|
||||
device->DrawIndexedInstanced(subset.indexCount, instancedBatch.instanceCount, indexOffset, 0, 0, cmd);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -5196,6 +5282,16 @@ void UpdateRenderDataAsync(
|
||||
ComputeVolumetricCloudShadows(cmd, weatherMapFirst, weatherMapSecond);
|
||||
}
|
||||
|
||||
if (vis.scene->weather.IsRealisticSky())
|
||||
{
|
||||
wi::renderer::ComputeSkyAtmosphereTextures(cmd);
|
||||
wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
|
||||
if (vis.scene->weather.IsRealisticSkyAerialPerspective())
|
||||
{
|
||||
wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
|
||||
}
|
||||
}
|
||||
|
||||
// GPU Particle systems simulation/sorting/culling:
|
||||
if (!vis.visibleEmitters.empty() || vis.scene->weather.rain_amount > 0)
|
||||
{
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "shaders/ShaderInterop_SurfelGI.h"
|
||||
#include "wiVector.h"
|
||||
#include "wiSpinLock.h"
|
||||
#include "wiAllocator.h"
|
||||
|
||||
#include <memory>
|
||||
#include <limits>
|
||||
@@ -66,8 +67,21 @@ namespace wi::renderer
|
||||
|
||||
// Returns a buffer preinitialized for quad index buffer laid out as:
|
||||
// vertexID * 4 + [0, 1, 2, 2, 1, 3]
|
||||
// Note: it will return 16-bit or 32-bit index buffer depending on max_quad_count
|
||||
const wi::graphics::GPUBuffer& GetIndexBufferForQuads(uint32_t max_quad_count);
|
||||
|
||||
// Result of SuballocateGPUBuffer():
//	If the allocation is not valid, the request was not suballocated and alias must not be used.
struct BufferSuballocation
|
||||
{
|
||||
wi::graphics::GPUBuffer alias; // the large block buffer that the range was allocated from
|
||||
wi::allocator::PageAllocator::Allocation allocation; // the allocated range; its byte_offset is used as the aliasing offset into alias
|
||||
};
|
||||
// Sub-allocate (thread-safe) from a global GPU buffer for memory aliasing purpose:
|
||||
// The buffer will be DEFAULT usage, useable as vertex buffer, index buffer and shader resource
|
||||
// The purpose is to suballocate smaller GPUBuffers inside a larger GPUBuffer and bind the large GPUBuffer once as index buffer,
|
||||
// while the small buffers can be allocated/deallocated from it with memory aliasing and also used regularly by themselves
|
||||
BufferSuballocation SuballocateGPUBuffer(uint64_t size);
|
||||
void UpdateGPUSuballocator(); // called every frame for deferred release of GPU suballocations
|
||||
|
||||
void ModifyObjectSampler(const wi::graphics::SamplerDesc& desc);
|
||||
|
||||
// Initializes the renderer
|
||||
|
||||
@@ -586,6 +586,7 @@ namespace wi::scene
|
||||
|
||||
void MeshComponent::DeleteRenderData()
|
||||
{
|
||||
generalBufferOffsetAllocation = {};
|
||||
generalBuffer = {};
|
||||
streamoutBuffer = {};
|
||||
ib = {};
|
||||
@@ -1291,9 +1292,25 @@ namespace wi::scene
|
||||
}
|
||||
};
|
||||
|
||||
bool success = device->CreateBuffer2(&bd, init_callback, &generalBuffer);
|
||||
assert(success);
|
||||
device->SetName(&generalBuffer, "MeshComponent::generalBuffer");
|
||||
// The suballocation strategy is used to have all mesh buffers reside in a global buffer
|
||||
// With this we can avoid rebinding the index buffer for every mesh and can work with purely offsets
|
||||
// Though the index buffer will still need to be rebound if the index format changes, but that happens less frequently
|
||||
wi::renderer::BufferSuballocation suballoc = wi::renderer::SuballocateGPUBuffer(bd.size);
|
||||
if (suballoc.allocation.IsValid())
|
||||
{
|
||||
bool success = device->CreateBuffer2(&bd, init_callback, &generalBuffer, &suballoc.alias, suballoc.allocation.byte_offset);
|
||||
assert(success);
|
||||
device->SetName(&generalBuffer, "MeshComponent::generalBuffer (suballocated)");
|
||||
generalBufferOffsetAllocation = std::move(suballoc.allocation);
|
||||
generalBufferOffsetAllocationAlias = std::move(suballoc.alias);
|
||||
}
|
||||
else
|
||||
{
|
||||
// If suballocation was not successful, a standalone buffer can be created instead:
|
||||
bool success = device->CreateBuffer2(&bd, init_callback, &generalBuffer);
|
||||
assert(success);
|
||||
device->SetName(&generalBuffer, "MeshComponent::generalBuffer");
|
||||
}
|
||||
|
||||
assert(ib.IsValid());
|
||||
const Format ib_format = GetIndexFormat() == IndexBufferFormat::UINT32 ? Format::R32_UINT : Format::R16_UINT;
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "wiUnorderedSet.h"
|
||||
#include "wiBVH.h"
|
||||
#include "wiPathQuery.h"
|
||||
#include "wiAllocator.h"
|
||||
|
||||
namespace wi::scene
|
||||
{
|
||||
@@ -182,7 +183,7 @@ namespace wi::scene
|
||||
XMFLOAT4 emissiveColor = XMFLOAT4(1, 1, 1, 0);
|
||||
XMFLOAT4 subsurfaceScattering = XMFLOAT4(1, 1, 1, 0);
|
||||
XMFLOAT4 extinctionColor = XMFLOAT4(0, 0.9f, 1, 1);
|
||||
XMFLOAT4 texMulAdd = XMFLOAT4(1, 1, 0, 0);
|
||||
XMFLOAT4 texMulAdd = XMFLOAT4(1, 1, 0, 0); // dynamic multiplier (.xy) and addition (.zw) for UV coordinates
|
||||
float roughness = 0.2f;
|
||||
float reflectance = 0.02f;
|
||||
float metalness = 0.0f;
|
||||
@@ -655,7 +656,7 @@ namespace wi::scene
|
||||
BVH_ENABLED = 1 << 8,
|
||||
QUANTIZED_POSITIONS_DISABLED = 1 << 9,
|
||||
};
|
||||
uint32_t _flags = RENDERABLE;
|
||||
// *uint32_t _flags is moved down for better struct padding...
|
||||
|
||||
wi::vector<XMFLOAT3> vertex_positions;
|
||||
wi::vector<XMFLOAT3> vertex_normals;
|
||||
@@ -714,6 +715,8 @@ namespace wi::scene
|
||||
wi::primitive::AABB aabb;
|
||||
wi::graphics::GPUBuffer generalBuffer; // index buffer + all static vertex buffers
|
||||
wi::graphics::GPUBuffer streamoutBuffer; // all dynamic vertex buffers
|
||||
wi::allocator::PageAllocator::Allocation generalBufferOffsetAllocation;
|
||||
wi::graphics::GPUBuffer generalBufferOffsetAllocationAlias;
|
||||
struct BufferView
|
||||
{
|
||||
uint64_t offset = ~0ull;
|
||||
@@ -751,13 +754,6 @@ namespace wi::scene
|
||||
XMFLOAT2 uv_range_max = XMFLOAT2(1, 1);
|
||||
|
||||
wi::vector<wi::graphics::RaytracingAccelerationStructure> BLASes; // one BLAS per LOD
|
||||
enum BLAS_STATE
|
||||
{
|
||||
BLAS_STATE_NEEDS_REBUILD,
|
||||
BLAS_STATE_NEEDS_REFIT,
|
||||
BLAS_STATE_COMPLETE,
|
||||
};
|
||||
mutable BLAS_STATE BLAS_state = BLAS_STATE_NEEDS_REBUILD;
|
||||
|
||||
wi::vector<wi::primitive::AABB> bvh_leaf_aabbs;
|
||||
wi::BVH bvh;
|
||||
@@ -771,6 +767,16 @@ namespace wi::scene
|
||||
|
||||
RigidBodyPhysicsComponent precomputed_rigidbody_physics_shape; // you can precompute a physics shape here if you need without using a real rigid body component yet
|
||||
|
||||
uint32_t _flags = RENDERABLE; // *this is serialized but put here for better struct padding
|
||||
|
||||
enum BLAS_STATE
|
||||
{
|
||||
BLAS_STATE_NEEDS_REBUILD,
|
||||
BLAS_STATE_NEEDS_REFIT,
|
||||
BLAS_STATE_COMPLETE,
|
||||
};
|
||||
mutable BLAS_STATE BLAS_state = BLAS_STATE_NEEDS_REBUILD;
|
||||
|
||||
constexpr void SetRenderable(bool value) { if (value) { _flags |= RENDERABLE; } else { _flags &= ~RENDERABLE; } }
|
||||
constexpr void SetDoubleSided(bool value) { if (value) { _flags |= DOUBLE_SIDED; } else { _flags &= ~DOUBLE_SIDED; } }
|
||||
constexpr void SetDoubleSidedShadow(bool value) { if (value) { _flags |= DOUBLE_SIDED_SHADOW; } else { _flags &= ~DOUBLE_SIDED_SHADOW; } }
|
||||
|
||||
@@ -9,7 +9,7 @@ namespace wi::version
|
||||
// minor features, major updates, breaking compatibility changes
|
||||
const int minor = 71;
|
||||
// minor bug fixes, alterations, refactors, updates
|
||||
const int revision = 750;
|
||||
const int revision = 751;
|
||||
|
||||
const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);
|
||||
|
||||
|
||||
@@ -934,5 +934,28 @@ SOFTWARE.
|
||||
|
||||
###############################################################################################################################
|
||||
|
||||
OffsetAllocator: https://github.com/sebbbi/OffsetAllocator
|
||||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 Sebastian Aaltonen
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
###############################################################################################################################
|
||||
|
||||
Reference in New Issue
Block a user