diff --git a/WickedEngine/ShaderInterop_TracedRendering.h b/WickedEngine/ShaderInterop_TracedRendering.h
index 674861aab..c91514641 100644
--- a/WickedEngine/ShaderInterop_TracedRendering.h
+++ b/WickedEngine/ShaderInterop_TracedRendering.h
@@ -3,6 +3,7 @@
#include "ShaderInterop.h"
#define TRACEDRENDERING_BVH_CLASSIFICATION_GROUPSIZE 64
+#define TRACEDRENDERING_BVH_SORTEDMORTON_GROUPSIZE 64
#define TRACEDRENDERING_BVH_HIERARCHY_GROUPSIZE 64
#define TRACEDRENDERING_CLEAR_BLOCKSIZE 8
@@ -41,11 +42,9 @@ struct TracedRenderingClusterAABB
+// One node of the BVH. All three members are indices of other BVHNode
+// entries in the same node buffer (the hierarchy shader writes them as
+// bvhNodeBuffer[...] indices).
+// NOTE(review): per the hierarchy shader's header comment a leaf is a node whose
+// LeftChildIndex and RightChildIndex are both 0 - confirm the buffer is cleared accordingly.
struct BVHNode
{
- uint parent;
- uint childA;
- uint childB;
+ uint ParentIndex; // index of the node that references this node as one of its children
+ uint LeftChildIndex; // index of the left child node
+ uint RightChildIndex; // index of the right child node
};
-inline uint BVH_MakeLeafNode(uint nodeID) { return nodeID | (1 << 31); }
-inline bool BVH_IsLeafNode(uint nodeID) { return nodeID & (1 << 31); }
#endif // _SHADERINTEROP_TRACEDRENDERING_H_
diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj b/WickedEngine/WickedEngine_SHADERS.vcxproj
index e1218f2e1..352f267f0 100644
--- a/WickedEngine/WickedEngine_SHADERS.vcxproj
+++ b/WickedEngine/WickedEngine_SHADERS.vcxproj
@@ -543,6 +543,10 @@
Compute
5.0
+
+ Compute
+ 5.0
+
Compute
5.0
diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters
index b9efe121c..104a006d6 100644
--- a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters
+++ b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters
@@ -747,6 +747,9 @@
CS
+
+ CS
+
diff --git a/WickedEngine/raytrace_bvh_hierarchyCS.hlsl b/WickedEngine/raytrace_bvh_hierarchyCS.hlsl
index 4931a34cd..ecefca48d 100644
--- a/WickedEngine/raytrace_bvh_hierarchyCS.hlsl
+++ b/WickedEngine/raytrace_bvh_hierarchyCS.hlsl
@@ -2,53 +2,102 @@
#include "ShaderInterop_TracedRendering.h"
#include "tracedRenderingHF.hlsli"
+// This shader constructs the BVH from sorted cluster morton codes.
+// Output is a list of BVH tree nodes stored contiguously in memory: [parentIndex, leftChildNodeIndex, rightChildNodeIndex]
+// An output node is a leaf node if leftChildNodeIndex == 0 and rightChildNodeIndex == 0
+// Otherwise the output node is an intermediate (internal) node
+// Intermediate nodes start at arrayIndex == 0 (beginning with the root node)
+// Leaf nodes start at arrayIndex == clusterCount - 1 (each one corresponds to a single cluster, which can be looked up through clusterIndexBuffer later)
+
+// Uses Karras's 2012 parallel BVH construction algorithm, described
+// in "Maximizing Parallelism in the Construction of BVHs, Octrees,
+// and k-d Trees"
+
RAWBUFFER(clusterCounterBuffer, TEXSLOT_ONDEMAND0);
-STRUCTUREDBUFFER(clusterIndexBuffer, uint, TEXSLOT_ONDEMAND1);
-STRUCTUREDBUFFER(clusterMortonBuffer, uint, TEXSLOT_ONDEMAND2);
+STRUCTUREDBUFFER(clusterMortonBuffer, uint, TEXSLOT_ONDEMAND1);
RWSTRUCTUREDBUFFER(bvhNodeBuffer, BVHNode, 0);
-#define __clz firstbithigh
-
-inline int2 determineRange(uint count, uint idx)
+// Number of zero bits above the most significant set bit of a 32-bit value.
+// firstbithigh() reports that bit's position counted from bit 0, so the
+// distance from bit 31 is the leading-zero count.
+// NOTE(review): for num == 0 the result depends on what firstbithigh() returns
+// when no bit is set (documented as -1, which would give 32) - confirm.
+int CountLeadingZeroes(uint num)
{
- //todo
- return int2(count, idx);
+	const int topBitPosition = 31;
+	return topBitPosition - firstbithigh(num);
}
-inline int findSplit(int first, int last)
+// Records parentIndex as the parent of the node stored at childIndex.
+// Only the ParentIndex member is written; the node's child links are not touched.
+void WriteChild(uint childIndex, uint parentIndex)
{
- // Identical Morton codes => split the range in the middle.
+ bvhNodeBuffer[childIndex].ParentIndex = parentIndex;
+}
- uint firstCode = clusterMortonBuffer[first];
- uint lastCode = clusterMortonBuffer[last];
+// Stores both child links of the node at parentIndex.
+// Only LeftChildIndex and RightChildIndex are written; ParentIndex is left as is.
+void WriteParent(uint parentIndex, int leftBoxIndex, int rightBoxIndex)
+{
+	// The node members are unsigned, so spell out the int -> uint conversion.
+	const uint leftLink = (uint)leftBoxIndex;
+	const uint rightLink = (uint)rightBoxIndex;
+
+	bvhNodeBuffer[parentIndex].LeftChildIndex = leftLink;
+	bvhNodeBuffer[parentIndex].RightChildIndex = rightLink;
+}
- if (firstCode == lastCode)
- return (first + last) >> 1;
+// Similarity measure between the sorted morton keys at indexA and indexB.
+//  - Either index >= elementCount: returns -1, so an out-of-range neighbour
+//    never compares as more similar than an in-range one.
+//  - Different codes: leading-zero count of their XOR, i.e. the number of
+//    identical high-order bits.
+//  - Equal codes: the same measure applied to the two indices, offset by 31,
+//    which breaks ties between duplicated morton codes.
+int GetLongestCommonPrefix(uint indexA, uint indexB, uint elementCount)
+{
+	int prefixLength = -1;
+
+	if (indexA < elementCount && indexB < elementCount)
+	{
+		const uint codeA = clusterMortonBuffer[indexA];
+		const uint codeB = clusterMortonBuffer[indexB];
+
+		if (codeA == codeB)
+		{
+			// TODO: Technically this should be primitive ID
+			prefixLength = CountLeadingZeroes(indexA ^ indexB) + 31;
+		}
+		else
+		{
+			prefixLength = CountLeadingZeroes(codeA ^ codeB);
+		}
+	}
+
+	return prefixLength;
+}
- // Calculate the number of highest bits that are the same
- // for all objects, using the count-leading-zeros intrinsic.
+// Computes the span of sorted keys that belongs to the node with index idx.
+// Returns uint2(first, last) with first <= last; idx itself is always one of the two ends.
+// Probes such as idx - 1 (which wraps when idx == 0, since idx is unsigned) or
+// idx + k * d may land outside the array; GetLongestCommonPrefix returns -1 for
+// any index >= elementCount, and the comparisons below depend on that.
+uint2 DetermineRange(uint idx, uint elementCount)
+{
+ // Direction of the range: +1 if the next key shares the longer prefix with idx,
+ // -1 if the previous key does, 0 if both prefixes are equal.
+ int d = GetLongestCommonPrefix(idx, idx + 1, elementCount) - GetLongestCommonPrefix(idx, idx - 1, elementCount);
+ d = clamp(d, -1, 1);
+ // Prefix shared with the neighbour on the opposite side; a key is treated as part of
+ // the range only while its prefix with idx is strictly greater than this.
+ int minPrefix = GetLongestCommonPrefix(idx, idx - d, elementCount);
- int commonPrefix = __clz(firstCode ^ lastCode);
+ // TODO: Consider starting this at a higher number
+ int maxLength = 2;
+ // Grow maxLength (x4 per step) until the key at that distance no longer exceeds minPrefix.
+ while (GetLongestCommonPrefix(idx, idx + maxLength * d, elementCount) > minPrefix)
+ {
+ maxLength *= 4;
+ }
- // Use binary search to find where the next bit differs.
- // Specifically, we are looking for the highest object that
- // shares more than commonPrefix bits with the first one.
+ // Binary search below maxLength: t is halved each iteration, and a step is kept
+ // whenever the key at distance (length + t) still exceeds minPrefix.
+ int length = 0;
+ for (int t = maxLength / 2; t > 0; t /= 2)
+ {
+ if (GetLongestCommonPrefix(idx, idx + (length + t) * d, elementCount) > minPrefix)
+ {
+ length = length + t;
+ }
+ }
- int split = first; // initial guess
+ // j is the far end of the range; order the pair so that x <= y.
+ int j = idx + length * d;
+ return uint2(min(idx, j), max(idx, j));
+}
+
+int FindSplit(int first, uint last, uint elementCount)
+{
+ int commonPrefix = GetLongestCommonPrefix(first, last, elementCount);
+ int split = first;
int step = last - first;
do
{
- step = (step + 1) >> 1; // exponential decrease
- int newSplit = split + step; // proposed new position
+ step = (step + 1) >> 1;
+ int newSplit = split + step;
if (newSplit < last)
{
- uint splitCode = clusterMortonBuffer[newSplit];
- int splitPrefix = __clz(firstCode ^ splitCode);
+ int splitPrefix = GetLongestCommonPrefix(first, newSplit, elementCount);
if (splitPrefix > commonPrefix)
- split = newSplit; // accept proposal
+ split = newSplit;
}
} while (step > 1);
@@ -65,56 +114,28 @@ void main( uint3 DTid : SV_DispatchThreadID )
if (idx < clusterCount - 1)
{
- // Find out which range of objects the node corresponds to.
- // (This is where the magic happens!)
+ uint2 range = DetermineRange(idx, clusterCount);
+ uint first = range.x;
+ uint last = range.y;
- int2 range = determineRange(clusterCount, idx);
- int first = range.x;
- int last = range.y;
+ uint split = FindSplit(first, last, clusterCount);
- // Determine where to split the range.
-
- int split = findSplit(first, last);
-
- // Select childA.
-
- uint childA = split;
+ uint internalNodeOffset = 0;
+ uint leafNodeOffset = clusterCount - 1;
+ uint childAIndex;
if (split == first)
- {
- //childA = &leafNodes[split];
- childA = BVH_MakeLeafNode(childA);
- }
+ childAIndex = leafNodeOffset + split;
else
- {
- //childA = &internalNodes[split];
- }
+ childAIndex = internalNodeOffset + split;
- // Select childB.
-
- uint childB = split + 1;
+ uint childBIndex;
if (split + 1 == last)
- {
- //childB = &leafNodes[split + 1];
- childB = BVH_MakeLeafNode(childB);
- }
+ childBIndex = leafNodeOffset + split + 1;
else
- {
- //childB = &internalNodes[split + 1];
- }
+ childBIndex = internalNodeOffset + split + 1;
- // Record parent-child relationships.
-
- bvhNodeBuffer[idx].childA = childA;
- bvhNodeBuffer[idx].childB = childB;
- //childA->parent = &internalNodes[idx];
- //childB->parent = &internalNodes[idx];
- if (!BVH_IsLeafNode(childA))
- {
- bvhNodeBuffer[childA].parent = idx;
- }
- if (!BVH_IsLeafNode(childB))
- {
- bvhNodeBuffer[childB].parent = idx;
- }
+ WriteParent(idx, childAIndex, childBIndex);
+ WriteChild(childAIndex, idx);
+ WriteChild(childBIndex, idx);
}
}
diff --git a/WickedEngine/raytrace_bvh_sortedmortonCS.hlsl b/WickedEngine/raytrace_bvh_sortedmortonCS.hlsl
new file mode 100644
index 000000000..3b0bdfe61
--- /dev/null
+++ b/WickedEngine/raytrace_bvh_sortedmortonCS.hlsl
@@ -0,0 +1,21 @@
+#include "globals.hlsli"
+#include "ShaderInterop_TracedRendering.h"
+#include "tracedRenderingHF.hlsli"
+
+// This shader reads the cluster index buffer (sorted by morton code)
+// and writes out the morton codes themselves in that sorted order
+
+RAWBUFFER(clusterCounterBuffer, TEXSLOT_ONDEMAND0);
+STRUCTUREDBUFFER(clusterIndexBuffer, uint, TEXSLOT_ONDEMAND1);
+STRUCTUREDBUFFER(clusterMortonBuffer, uint, TEXSLOT_ONDEMAND2);
+
+RWSTRUCTUREDBUFFER(clusterSortedMortonBuffer, uint, 0);
+
+// One thread per sorted slot: looks up which cluster occupies the slot and
+// copies that cluster's morton code into the same slot of the output buffer.
+// Threads at or beyond the cluster count (read from offset 0 of clusterCounterBuffer) do nothing.
+[numthreads(TRACEDRENDERING_BVH_SORTEDMORTON_GROUPSIZE, 1, 1)]
+void main( uint3 DTid : SV_DispatchThreadID )
+{
+	const uint sortedSlot = DTid.x;
+	const uint clusterCount = clusterCounterBuffer.Load(0);
+
+	if (sortedSlot >= clusterCount)
+	{
+		return;
+	}
+
+	const uint clusterIndex = clusterIndexBuffer[sortedSlot];
+	clusterSortedMortonBuffer[sortedSlot] = clusterMortonBuffer[clusterIndex];
+}
diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h
index e97f378b0..ec7ffba2e 100644
--- a/WickedEngine/wiEnums.h
+++ b/WickedEngine/wiEnums.h
@@ -275,6 +275,7 @@ enum CSTYPES
CSTYPE_RAYTRACE_BVH_RESET,
CSTYPE_RAYTRACE_BVH_CLASSIFICATION,
CSTYPE_RAYTRACE_BVH_KICKHIERARCHY,
+ CSTYPE_RAYTRACE_BVH_SORTEDMORTON,
CSTYPE_RAYTRACE_BVH_HIERARCHY,
CSTYPE_RAYTRACE_CLEAR,
CSTYPE_RAYTRACE_LAUNCH,
diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp
index 7cfc2ef47..a39a4c07e 100644
--- a/WickedEngine/wiRenderer.cpp
+++ b/WickedEngine/wiRenderer.cpp
@@ -1477,6 +1477,7 @@ void wiRenderer::LoadShaders()
computeShaders[CSTYPE_RAYTRACE_BVH_RESET] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_resetCS.cso", wiResourceManager::COMPUTESHADER));
computeShaders[CSTYPE_RAYTRACE_BVH_CLASSIFICATION] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_classificationCS.cso", wiResourceManager::COMPUTESHADER));
computeShaders[CSTYPE_RAYTRACE_BVH_KICKHIERARCHY] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_kickhierarchyCS.cso", wiResourceManager::COMPUTESHADER));
+ computeShaders[CSTYPE_RAYTRACE_BVH_SORTEDMORTON] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_sortedmortonCS.cso", wiResourceManager::COMPUTESHADER));
computeShaders[CSTYPE_RAYTRACE_BVH_HIERARCHY] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_bvh_hierarchyCS.cso", wiResourceManager::COMPUTESHADER));
computeShaders[CSTYPE_RAYTRACE_CLEAR] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_clearCS.cso", wiResourceManager::COMPUTESHADER));
computeShaders[CSTYPE_RAYTRACE_LAUNCH] = static_cast(wiResourceManager::GetShaderManager()->add(SHADERPATH + "raytrace_launchCS.cso", wiResourceManager::COMPUTESHADER));
@@ -6416,6 +6417,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res
static GPUBuffer* clusterCounterBuffer = nullptr;
static GPUBuffer* clusterIndexBuffer = nullptr;
static GPUBuffer* clusterMortonBuffer = nullptr;
+ static GPUBuffer* clusterSortedMortonBuffer = nullptr;
static GPUBuffer* clusterOffsetBuffer = nullptr;
static GPUBuffer* clusterAABBBuffer = nullptr;
const uint maxClusterCount = 1000;
@@ -6432,6 +6434,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res
SAFE_DELETE(clusterCounterBuffer);
SAFE_DELETE(clusterIndexBuffer);
SAFE_DELETE(clusterMortonBuffer);
+ SAFE_DELETE(clusterSortedMortonBuffer);
SAFE_DELETE(clusterOffsetBuffer);
SAFE_DELETE(clusterAABBBuffer);
bvhNodeBuffer = new GPUBuffer;
@@ -6439,6 +6442,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res
clusterCounterBuffer = new GPUBuffer;
clusterIndexBuffer = new GPUBuffer;
clusterMortonBuffer = new GPUBuffer;
+ clusterSortedMortonBuffer = new GPUBuffer;
clusterOffsetBuffer = new GPUBuffer;
clusterAABBBuffer = new GPUBuffer;
@@ -6490,6 +6494,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res
desc.MiscFlags = RESOURCE_MISC_BUFFER_STRUCTURED;
desc.Usage = USAGE_DEFAULT;
hr = device->CreateBuffer(&desc, nullptr, clusterMortonBuffer);
+ hr = device->CreateBuffer(&desc, nullptr, clusterSortedMortonBuffer);
assert(SUCCEEDED(hr));
desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS;
@@ -6607,7 +6612,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res
device->EventEnd(threadID);
- device->EventBegin("BVH - Sort Clusters", threadID);
+ device->EventBegin("BVH - Sort Cluster Mortons", threadID);
wiGPUSortLib::Sort(maxClusterCount, clusterMortonBuffer, clusterCounterBuffer, 0, clusterIndexBuffer, threadID);
device->EventEnd(threadID);
@@ -6631,6 +6636,29 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res
}
device->EventEnd(threadID);
+ device->EventBegin("BVH - Assemble Sorted Mortons", threadID);
+ {
+ device->BindComputePSO(CPSO[CSTYPE_RAYTRACE_BVH_SORTEDMORTON], threadID);
+ GPUResource* uavs[] = {
+ clusterSortedMortonBuffer,
+ };
+ device->BindUnorderedAccessResourcesCS(uavs, 0, ARRAYSIZE(uavs), threadID);
+
+ GPUResource* res[] = {
+ clusterCounterBuffer,
+ clusterIndexBuffer,
+ clusterMortonBuffer,
+ };
+ device->BindResources(CS, res, TEXSLOT_ONDEMAND0, ARRAYSIZE(res), threadID);
+
+ device->DispatchIndirect(indirectBuffer, 0, threadID);
+
+
+ device->UAVBarrier(uavs, ARRAYSIZE(uavs), threadID);
+ device->UnBindUnorderedAccessResources(0, ARRAYSIZE(uavs), threadID);
+ }
+ device->EventEnd(threadID);
+
device->EventBegin("BVH - Build Hierarchy", threadID);
{
device->BindComputePSO(CPSO[CSTYPE_RAYTRACE_BVH_HIERARCHY], threadID);
@@ -6641,8 +6669,7 @@ void wiRenderer::DrawTracedScene(Camera* camera, wiGraphicsTypes::Texture2D* res
GPUResource* res[] = {
clusterCounterBuffer,
- clusterIndexBuffer,
- clusterMortonBuffer,
+ clusterSortedMortonBuffer,
};
device->BindResources(CS, res, TEXSLOT_ONDEMAND0, ARRAYSIZE(res), threadID);