diff --git a/WickedEngine/ShaderInterop_TracedRendering.h b/WickedEngine/ShaderInterop_TracedRendering.h index 674861aab..c91514641 100644 --- a/WickedEngine/ShaderInterop_TracedRendering.h +++ b/WickedEngine/ShaderInterop_TracedRendering.h @@ -3,6 +3,7 @@ #include "ShaderInterop.h" #define TRACEDRENDERING_BVH_CLASSIFICATION_GROUPSIZE 64 +#define TRACEDRENDERING_BVH_SORTEDMORTON_GROUPSIZE 64 #define TRACEDRENDERING_BVH_HIERARCHY_GROUPSIZE 64 #define TRACEDRENDERING_CLEAR_BLOCKSIZE 8 @@ -41,11 +42,9 @@ struct TracedRenderingClusterAABB struct BVHNode { - uint parent; - uint childA; - uint childB; + uint ParentIndex; + uint LeftChildIndex; + uint RightChildIndex; }; -inline uint BVH_MakeLeafNode(uint nodeID) { return nodeID | (1 << 31); } -inline bool BVH_IsLeafNode(uint nodeID) { return nodeID & (1 << 31); } #endif // _SHADERINTEROP_TRACEDRENDERING_H_ diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj b/WickedEngine/WickedEngine_SHADERS.vcxproj index e1218f2e1..352f267f0 100644 --- a/WickedEngine/WickedEngine_SHADERS.vcxproj +++ b/WickedEngine/WickedEngine_SHADERS.vcxproj @@ -543,6 +543,10 @@ Compute 5.0 + + Compute + 5.0 + Compute 5.0 diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters index b9efe121c..104a006d6 100644 --- a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters +++ b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters @@ -747,6 +747,9 @@ CS + + CS + diff --git a/WickedEngine/raytrace_bvh_hierarchyCS.hlsl b/WickedEngine/raytrace_bvh_hierarchyCS.hlsl index 4931a34cd..ecefca48d 100644 --- a/WickedEngine/raytrace_bvh_hierarchyCS.hlsl +++ b/WickedEngine/raytrace_bvh_hierarchyCS.hlsl @@ -2,53 +2,102 @@ #include "ShaderInterop_TracedRendering.h" #include "tracedRenderingHF.hlsli" +// This shader will construct the BVH from sorted cluster morton codes. 
+// Output is a list of contiguous BVH tree nodes in memory: [parentIndex, leftChildNodeIndex, rightChildNodeIndex] +// The output node is a leaf node if: leftChildNodeIndex == rightChildNodeIndex == 0 +// Else the output node is an intermediate node +// Also, we know that intermediate nodes start at arrayIndex == 0 (starting with root node) +// Also, we know that leaf nodes will start at arrayIndex == clusterCount - 1 (and they will correspond to a single cluster, which is indexable by clusterIndexBuffer later) + +// Uses Karras's 2012 parallel BVH construction algorithm outlined +// in "Maximizing Parallelism in the Construction of BVHs, Octrees, +// and k-d Trees" + RAWBUFFER(clusterCounterBuffer, TEXSLOT_ONDEMAND0); -STRUCTUREDBUFFER(clusterIndexBuffer, uint, TEXSLOT_ONDEMAND1); -STRUCTUREDBUFFER(clusterMortonBuffer, uint, TEXSLOT_ONDEMAND2); +STRUCTUREDBUFFER(clusterMortonBuffer, uint, TEXSLOT_ONDEMAND1); RWSTRUCTUREDBUFFER(bvhNodeBuffer, BVHNode, 0); -#define __clz firstbithigh - -inline int2 determineRange(uint count, uint idx) +int CountLeadingZeroes(uint num) { - //todo - return int2(count, idx); + return 31 - firstbithigh(num); } -inline int findSplit(int first, int last) +void WriteChild(uint childIndex, uint parentIndex) { - // Identical Morton codes => split the range in the middle. 
+ bvhNodeBuffer[childIndex].ParentIndex = parentIndex; +} - uint firstCode = clusterMortonBuffer[first]; - uint lastCode = clusterMortonBuffer[last]; +void WriteParent(uint parentIndex, int leftBoxIndex, int rightBoxIndex) +{ + bvhNodeBuffer[parentIndex].LeftChildIndex = leftBoxIndex; + bvhNodeBuffer[parentIndex].RightChildIndex = rightBoxIndex; +} - if (firstCode == lastCode) - return (first + last) >> 1; +int GetLongestCommonPrefix(uint indexA, uint indexB, uint elementCount) +{ + if (indexA >= elementCount || indexB >= elementCount) + { + return -1; + } + else + { + uint mortonCodeA = clusterMortonBuffer[indexA]; + uint mortonCodeB = clusterMortonBuffer[indexB]; + if (mortonCodeA != mortonCodeB) + { + return CountLeadingZeroes(clusterMortonBuffer[indexA] ^ clusterMortonBuffer[indexB]); + } + else + { + // TODO: Technically this should be primitive ID + return CountLeadingZeroes(indexA ^ indexB) + 31; + } + } +} - // Calculate the number of highest bits that are the same - // for all objects, using the count-leading-zeros intrinsic. +uint2 DetermineRange(uint idx, uint elementCount) +{ + int d = GetLongestCommonPrefix(idx, idx + 1, elementCount) - GetLongestCommonPrefix(idx, idx - 1, elementCount); + d = clamp(d, -1, 1); + int minPrefix = GetLongestCommonPrefix(idx, idx - d, elementCount); - int commonPrefix = __clz(firstCode ^ lastCode); + // TODO: Consider starting this at a higher number + int maxLength = 2; + while (GetLongestCommonPrefix(idx, idx + maxLength * d, elementCount) > minPrefix) + { + maxLength *= 4; + } - // Use binary search to find where the next bit differs. - // Specifically, we are looking for the highest object that - // shares more than commonPrefix bits with the first one. 
+ int length = 0; + for (int t = maxLength / 2; t > 0; t /= 2) + { + if (GetLongestCommonPrefix(idx, idx + (length + t) * d, elementCount) > minPrefix) + { + length = length + t; + } + } - int split = first; // initial guess + int j = idx + length * d; + return uint2(min(idx, j), max(idx, j)); +} + +int FindSplit(int first, uint last, uint elementCount) +{ + int commonPrefix = GetLongestCommonPrefix(first, last, elementCount); + int split = first; int step = last - first; do { - step = (step + 1) >> 1; // exponential decrease - int newSplit = split + step; // proposed new position + step = (step + 1) >> 1; + int newSplit = split + step; if (newSplit < last) { - uint splitCode = clusterMortonBuffer[newSplit]; - int splitPrefix = __clz(firstCode ^ splitCode); + int splitPrefix = GetLongestCommonPrefix(first, newSplit, elementCount); if (splitPrefix > commonPrefix) - split = newSplit; // accept proposal + split = newSplit; } } while (step > 1); @@ -65,56 +114,28 @@ void main( uint3 DTid : SV_DispatchThreadID ) if (idx < clusterCount - 1) { - // Find out which range of objects the node corresponds to. - // (This is where the magic happens!) + uint2 range = DetermineRange(idx, clusterCount); + uint first = range.x; + uint last = range.y; - int2 range = determineRange(clusterCount, idx); - int first = range.x; - int last = range.y; + uint split = FindSplit(first, last, clusterCount); - // Determine where to split the range. - - int split = findSplit(first, last); - - // Select childA. - - uint childA = split; + uint internalNodeOffset = 0; + uint leafNodeOffset = clusterCount - 1; + uint childAIndex; if (split == first) - { - //childA = &leafNodes[split]; - childA = BVH_MakeLeafNode(childA); - } + childAIndex = leafNodeOffset + split; else - { - //childA = &internalNodes[split]; - } + childAIndex = internalNodeOffset + split; - // Select childB. 
- - uint childB = split + 1; + uint childBIndex; if (split + 1 == last) - { - //childB = &leafNodes[split + 1]; - childB = BVH_MakeLeafNode(childB); - } + childBIndex = leafNodeOffset + split + 1; else - { - //childB = &internalNodes[split + 1]; - } + childBIndex = internalNodeOffset + split + 1; - // Record parent-child relationships. - - bvhNodeBuffer[idx].childA = childA; - bvhNodeBuffer[idx].childB = childB; - //childA->parent = &internalNodes[idx]; - //childB->parent = &internalNodes[idx]; - if (!BVH_IsLeafNode(childA)) - { - bvhNodeBuffer[childA].parent = idx; - } - if (!BVH_IsLeafNode(childB)) - { - bvhNodeBuffer[childB].parent = idx; - } + WriteParent(idx, childAIndex, childBIndex); + WriteChild(childAIndex, idx); + WriteChild(childBIndex, idx); } } diff --git a/WickedEngine/raytrace_bvh_sortedmortonCS.hlsl b/WickedEngine/raytrace_bvh_sortedmortonCS.hlsl new file mode 100644 index 000000000..3b0bdfe61 --- /dev/null +++ b/WickedEngine/raytrace_bvh_sortedmortonCS.hlsl @@ -0,0 +1,21 @@ +#include "globals.hlsli" +#include "ShaderInterop_TracedRendering.h" +#include "tracedRenderingHF.hlsli" + +// This shader reads the cluster index buffer (sorted by morton) +// and outputs the direct sorted morton codes + +RAWBUFFER(clusterCounterBuffer, TEXSLOT_ONDEMAND0); +STRUCTUREDBUFFER(clusterIndexBuffer, uint, TEXSLOT_ONDEMAND1); +STRUCTUREDBUFFER(clusterMortonBuffer, uint, TEXSLOT_ONDEMAND2); + +RWSTRUCTUREDBUFFER(clusterSortedMortonBuffer, uint, 0); + +[numthreads(TRACEDRENDERING_BVH_SORTEDMORTON_GROUPSIZE, 1, 1)] +void main( uint3 DTid : SV_DispatchThreadID ) +{ + if (DTid.x < clusterCounterBuffer.Load(0)) + { + clusterSortedMortonBuffer[DTid.x] = clusterMortonBuffer[clusterIndexBuffer[DTid.x]]; + } +} diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h index e97f378b0..ec7ffba2e 100644 --- a/WickedEngine/wiEnums.h +++ b/WickedEngine/wiEnums.h @@ -275,6 +275,7 @@ enum CSTYPES CSTYPE_RAYTRACE_BVH_RESET, CSTYPE_RAYTRACE_BVH_CLASSIFICATION, 
CSTYPE_RAYTRACE_BVH_KICKHIERARCHY, + CSTYPE_RAYTRACE_BVH_SORTEDMORTON, CSTYPE_RAYTRACE_BVH_HIERARCHY, CSTYPE_RAYTRACE_CLEAR, CSTYPE_RAYTRACE_LAUNCH, diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index 7cfc2ef47..a39a4c07e 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -1477,6 +1477,7 @@ void wiRenderer::LoadShaders() computeShaders[CSTYPE_RAYTRACE_BVH_RESET] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_resetCS.cso", wiResourceManager::COMPUTESHADER)); computeShaders[CSTYPE_RAYTRACE_BVH_CLASSIFICATION] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_classificationCS.cso", wiResourceManager::COMPUTESHADER)); computeShaders[CSTYPE_RAYTRACE_BVH_KICKHIERARCHY] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_kickhierarchyCS.cso", wiResourceManager::COMPUTESHADER)); + computeShaders[CSTYPE_RAYTRACE_BVH_SORTEDMORTON] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_sortedmortonCS.cso", wiResourceManager::COMPUTESHADER)); computeShaders[CSTYPE_RAYTRACE_BVH_HIERARCHY] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_hierarchyCS.cso", wiResourceManager::COMPUTESHADER)); computeShaders[CSTYPE_RAYTRACE_CLEAR] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_clearCS.cso", wiResourceManager::COMPUTESHADER)); computeShaders[CSTYPE_RAYTRACE_LAUNCH] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_launchCS.cso", wiResourceManager::COMPUTESHADER)); @@ -6416,6 +6417,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res static GPUBuffer* clusterCounterBuffer = nullptr; static GPUBuffer* clusterIndexBuffer = nullptr; static GPUBuffer* clusterMortonBuffer = nullptr; + static GPUBuffer* clusterSortedMortonBuffer = nullptr; static GPUBuffer* clusterOffsetBuffer = 
nullptr; static GPUBuffer* clusterAABBBuffer = nullptr; const uint maxClusterCount = 1000; @@ -6432,6 +6434,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res SAFE_DELETE(clusterCounterBuffer); SAFE_DELETE(clusterIndexBuffer); SAFE_DELETE(clusterMortonBuffer); + SAFE_DELETE(clusterSortedMortonBuffer); SAFE_DELETE(clusterOffsetBuffer); SAFE_DELETE(clusterAABBBuffer); bvhNodeBuffer = new GPUBuffer; @@ -6439,6 +6442,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res clusterCounterBuffer = new GPUBuffer; clusterIndexBuffer = new GPUBuffer; clusterMortonBuffer = new GPUBuffer; + clusterSortedMortonBuffer = new GPUBuffer; clusterOffsetBuffer = new GPUBuffer; clusterAABBBuffer = new GPUBuffer; @@ -6490,6 +6494,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res desc.MiscFlags = RESOURCE_MISC_BUFFER_STRUCTURED; desc.Usage = USAGE_DEFAULT; hr = device->CreateBuffer(&desc, nullptr, clusterMortonBuffer); + hr = device->CreateBuffer(&desc, nullptr, clusterSortedMortonBuffer); assert(SUCCEEDED(hr)); desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS; @@ -6607,7 +6612,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res device->EventEnd(threadID); - device->EventBegin("BVH - Sort Clusters", threadID); + device->EventBegin("BVH - Sort Cluster Mortons", threadID); wiGPUSortLib::Sort(maxClusterCount, clusterMortonBuffer, clusterCounterBuffer, 0, clusterIndexBuffer, threadID); device->EventEnd(threadID); @@ -6631,6 +6636,29 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res } device->EventEnd(threadID); + device->EventBegin("BVH - Assemble Sorted Mortons", threadID); + { + device->BindComputePSO(CPSO[CSTYPE_RAYTRACE_BVH_SORTEDMORTON], threadID); + GPUResource* uavs[] = { + clusterSortedMortonBuffer, + }; + device->BindUnorderedAccessResourcesCS(uavs, 0, ARRAYSIZE(uavs), threadID); + + GPUResource* res[] = 
{ + clusterCounterBuffer, + clusterIndexBuffer, + clusterMortonBuffer, + }; + device->BindResources(CS, res, TEXSLOT_ONDEMAND0, ARRAYSIZE(res), threadID); + + device->DispatchIndirect(indirectBuffer, 0, threadID); + + + device->UAVBarrier(uavs, ARRAYSIZE(uavs), threadID); + device->UnBindUnorderedAccessResources(0, ARRAYSIZE(uavs), threadID); + } + device->EventEnd(threadID); + device->EventBegin("BVH - Build Hierarchy", threadID); { device->BindComputePSO(CPSO[CSTYPE_RAYTRACE_BVH_HIERARCHY], threadID); @@ -6641,8 +6669,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res GPUResource* res[] = { clusterCounterBuffer, - clusterIndexBuffer, - clusterMortonBuffer, + clusterSortedMortonBuffer, }; device->BindResources(CS, res, TEXSLOT_ONDEMAND0, ARRAYSIZE(res), threadID);