Files
WickedEngine/WickedEngine/wiGPUSortLib.cpp
T
2025-01-02 17:27:36 +01:00

203 lines
5.0 KiB
C++

#include "wiGPUSortLib.h"
#include "wiRenderer.h"
#include "wiResourceManager.h"
#include "shaders/ShaderInterop_GPUSortLib.h"
#include "wiEventHandler.h"
#include "wiTimer.h"
#include "wiBacklog.h"
using namespace wi::graphics;
namespace wi::gpusortlib
{
// File-local GPU objects shared by Initialize(), LoadShaders() and Sort().
namespace
{
	// Holds one IndirectDispatchArgs; written through a UAV by the kickoff pass and
	// then consumed by the indirect dispatch of sortCS.
	GPUBuffer indirectBuffer;

	// Single-group pass recorded first in Sort(), with indirectBuffer bound as UAV.
	Shader kickoffSortCS;

	// Initial sorting pass, dispatched indirectly from indirectBuffer.
	Shader sortCS;

	// Pass recorded once at the end of every merge round.
	Shader sortInnerCS;

	// Pass recorded once per merge sub-size inside every merge round.
	Shader sortStepCS;
}
// (Re)loads the four compute shaders used by the sort.
// Called directly from Initialize() and again from its EVENT_RELOAD_SHADERS subscription.
void LoadShaders()
{
	// All shaders of this library are compute shaders, only target and file name vary.
	const auto loadComputeShader = [](auto& target, const char* filename) {
		wi::renderer::LoadShader(ShaderStage::CS, target, filename);
	};

	loadComputeShader(kickoffSortCS, "gpusortlib_kickoffSortCS.cso");
	loadComputeShader(sortCS, "gpusortlib_sortCS.cso");
	loadComputeShader(sortInnerCS, "gpusortlib_sortInnerCS.cso");
	loadComputeShader(sortStepCS, "gpusortlib_sortStepCS.cso");
}
void Initialize()
{
wi::Timer timer;
GPUBufferDesc bd;
bd.usage = Usage::DEFAULT;
bd.bind_flags = BindFlag::UNORDERED_ACCESS;
bd.misc_flags = ResourceMiscFlag::INDIRECT_ARGS | ResourceMiscFlag::BUFFER_RAW;
bd.size = sizeof(IndirectDispatchArgs);
wi::graphics::GetDevice()->CreateBuffer(&bd, nullptr, &indirectBuffer);
static wi::eventhandler::Handle handle = wi::eventhandler::Subscribe(wi::eventhandler::EVENT_RELOAD_SHADERS, [](uint64_t userdata) { LoadShaders(); });
LoadShaders();
wilog("wi::gpusortlib Initialized (%d ms)", (int)std::round(timer.elapsed()));
}
void Sort(
uint32_t maxCount,
const GPUBuffer& comparisonBuffer_read,
const GPUBuffer& counterBuffer_read,
uint32_t counterReadOffset,
const GPUBuffer& indexBuffer_write,
CommandList cmd)
{
GraphicsDevice* device = wi::graphics::GetDevice();
device->EventBegin("GPUSortLib", cmd);
SortConstants sort;
sort.counterReadOffset = counterReadOffset;
// initialize sorting arguments:
{
device->BindComputeShader(&kickoffSortCS, cmd);
const GPUResource* res[] = {
&counterBuffer_read,
};
device->BindResources(res, 0, arraysize(res), cmd);
const GPUResource* uavs[] = {
&indirectBuffer,
};
device->BindUAVs(uavs, 0, arraysize(uavs), cmd);
{
GPUBarrier barriers[] = {
GPUBarrier::Buffer(&indirectBuffer, ResourceState::INDIRECT_ARGUMENT, ResourceState::UNORDERED_ACCESS)
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
device->PushConstants(&sort, sizeof(sort), cmd);
device->Dispatch(1, 1, 1, cmd);
{
GPUBarrier barriers[] = {
GPUBarrier::Memory(),
GPUBarrier::Buffer(&indirectBuffer, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT)
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
}
const GPUResource* uavs[] = {
&indexBuffer_write,
};
device->BindUAVs(uavs, 0, arraysize(uavs), cmd);
const GPUResource* resources[] = {
&counterBuffer_read,
&comparisonBuffer_read,
};
device->BindResources(resources, 0, arraysize(resources), cmd);
// initial sorting:
bool bDone = true;
{
// calculate how many threads we'll require:
// we'll sort 512 elements per CU (threadgroupsize 256)
// maybe need to optimize this or make it changeable during init
// TGS=256 is a good intermediate value
unsigned int numThreadGroups = ((maxCount - 1) >> 9) + 1;
//assert(numThreadGroups <= 1024);
if (numThreadGroups > 1)
{
bDone = false;
}
// sort all buffers of size 512 (and presort bigger ones)
device->BindComputeShader(&sortCS, cmd);
device->PushConstants(&sort, sizeof(sort), cmd);
device->DispatchIndirect(&indirectBuffer, 0, cmd);
GPUBarrier barriers[] = {
GPUBarrier::Memory(),
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
int presorted = 512;
while (!bDone)
{
// Incremental sorting:
bDone = true;
device->BindComputeShader(&sortStepCS, cmd);
// prepare thread group description data
uint32_t numThreadGroups = 0;
if (maxCount > (uint32_t)presorted)
{
if (maxCount > (uint32_t)presorted * 2)
bDone = false;
uint32_t pow2 = presorted;
while (pow2 < maxCount)
pow2 *= 2;
numThreadGroups = pow2 >> 9;
}
uint32_t nMergeSize = presorted * 2;
for (uint32_t nMergeSubSize = nMergeSize >> 1; nMergeSubSize > 256; nMergeSubSize = nMergeSubSize >> 1)
{
SortConstants sort;
sort.job_params.x = nMergeSubSize;
if (nMergeSubSize == nMergeSize >> 1)
{
sort.job_params.y = (2 * nMergeSubSize - 1);
sort.job_params.z = -1;
}
else
{
sort.job_params.y = nMergeSubSize;
sort.job_params.z = 1;
}
sort.counterReadOffset = counterReadOffset;
device->PushConstants(&sort, sizeof(sort), cmd);
device->Dispatch(numThreadGroups, 1, 1, cmd);
GPUBarrier barriers[] = {
GPUBarrier::Memory(),
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
device->BindComputeShader(&sortInnerCS, cmd);
device->PushConstants(&sort, sizeof(sort), cmd);
device->Dispatch(numThreadGroups, 1, 1, cmd);
GPUBarrier barriers[] = {
GPUBarrier::Memory(),
};
device->Barrier(barriers, arraysize(barriers), cmd);
presorted *= 2;
}
device->EventEnd(cmd);
}
}