light culling optimizations
This commit is contained in:
@@ -53,7 +53,6 @@ wi::vector<ShaderEntry> shaders = {
|
||||
{"upsample_bilateral_unorm1CS", wi::graphics::ShaderStage::CS},
|
||||
{"upsample_bilateral_unorm4CS", wi::graphics::ShaderStage::CS},
|
||||
{"temporalaaCS", wi::graphics::ShaderStage::CS},
|
||||
{"tileFrustumsCS", wi::graphics::ShaderStage::CS},
|
||||
{"tonemapCS", wi::graphics::ShaderStage::CS},
|
||||
{"underwaterCS", wi::graphics::ShaderStage::CS},
|
||||
{"fsr_upscalingCS", wi::graphics::ShaderStage::CS},
|
||||
|
||||
@@ -2556,16 +2556,6 @@
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">Compute</ShaderType>
|
||||
</FxCompile>
|
||||
<FxCompile Include="$(MSBuildThisFileDirectory)tileFrustumsCS.hlsl">
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">Compute</ShaderType>
|
||||
</FxCompile>
|
||||
<FxCompile Include="$(MSBuildThisFileDirectory)tonemapCS.hlsl">
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Compute</ShaderType>
|
||||
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Compute</ShaderType>
|
||||
|
||||
@@ -251,9 +251,6 @@
|
||||
<FxCompile Include="$(MSBuildThisFileDirectory)temporalaaCS.hlsl">
|
||||
<Filter>CS</Filter>
|
||||
</FxCompile>
|
||||
<FxCompile Include="$(MSBuildThisFileDirectory)tileFrustumsCS.hlsl">
|
||||
<Filter>CS</Filter>
|
||||
</FxCompile>
|
||||
<FxCompile Include="$(MSBuildThisFileDirectory)tonemapCS.hlsl">
|
||||
<Filter>CS</Filter>
|
||||
</FxCompile>
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
|
||||
#define entityCount (GetFrame().entity_culling_count)
|
||||
|
||||
StructuredBuffer<Frustum> in_Frustums : register(t0);
|
||||
|
||||
RWStructuredBuffer<uint> entityTiles : register(u0);
|
||||
|
||||
// Group shared variables.
|
||||
@@ -79,14 +77,6 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
texture_depth.GetDimensions(dim.x, dim.y);
|
||||
float2 dim_rcp = rcp(dim);
|
||||
|
||||
// This controls the unrolling granularity if the blocksize and threadsize are different:
|
||||
uint granularity = 0;
|
||||
|
||||
// Compute addresses and load frustum:
|
||||
const uint flatTileIndex = flatten2D(Gid.xy, GetCamera().entity_culling_tilecount.xy);
|
||||
const uint tileBucketsAddress = flatTileIndex * SHADER_ENTITY_TILE_BUCKET_COUNT;
|
||||
Frustum GroupFrustum = in_Frustums[flatTileIndex];
|
||||
|
||||
// Each thread will zero out one bucket in the LDS:
|
||||
for (uint i = groupIndex; i < SHADER_ENTITY_TILE_BUCKET_COUNT; i += TILED_CULLING_THREADSIZE * TILED_CULLING_THREADSIZE)
|
||||
{
|
||||
@@ -112,7 +102,7 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
float depthMaxUnrolled = -10000000;
|
||||
|
||||
[unroll]
|
||||
for (granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
|
||||
for (uint granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
|
||||
{
|
||||
uint2 pixel = DTid.xy * uint2(TILED_CULLING_GRANULARITY, TILED_CULLING_GRANULARITY) + unflatten2D(granularity, TILED_CULLING_GRANULARITY);
|
||||
pixel = min(pixel, dim - 1); // avoid loading from outside the texture, it messes up the min-max depth!
|
||||
@@ -137,16 +127,15 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
float fMinDepth = asfloat(uMaxDepth);
|
||||
float fMaxDepth = asfloat(uMinDepth);
|
||||
|
||||
// Note: the following will be SGPR
|
||||
Frustum GroupFrustum;
|
||||
AABB GroupAABB; // frustum AABB around min-max depth in View Space
|
||||
AABB GroupAABB_WS; // frustum AABB in world space
|
||||
if(WaveIsFirstLane())
|
||||
{
|
||||
// I construct an AABB around the minmax depth bounds to perform tighter culling:
|
||||
// The frustum is asymmetric so we must consider all corners!
|
||||
|
||||
// View space eye position is always at the origin.
|
||||
const float3 eyePos = float3(0, 0, 0);
|
||||
|
||||
// View space frustum corners:
|
||||
float3 viewSpace[8];
|
||||
|
||||
// Top left point, near
|
||||
viewSpace[0] = ScreenToView(float4(Gid.xy * TILED_CULLING_BLOCKSIZE, fMinDepth, 1.0f), dim_rcp).xyz;
|
||||
// Top right point, near
|
||||
@@ -155,7 +144,6 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
viewSpace[2] = ScreenToView(float4(float2(Gid.x, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMinDepth, 1.0f), dim_rcp).xyz;
|
||||
// Bottom right point, near
|
||||
viewSpace[3] = ScreenToView(float4(float2(Gid.x + 1, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMinDepth, 1.0f), dim_rcp).xyz;
|
||||
|
||||
// Top left point, far
|
||||
viewSpace[4] = ScreenToView(float4(Gid.xy * TILED_CULLING_BLOCKSIZE, fMaxDepth, 1.0f), dim_rcp).xyz;
|
||||
// Top right point, far
|
||||
@@ -164,7 +152,18 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
viewSpace[6] = ScreenToView(float4(float2(Gid.x, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMaxDepth, 1.0f), dim_rcp).xyz;
|
||||
// Bottom right point, far
|
||||
viewSpace[7] = ScreenToView(float4(float2(Gid.x + 1, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMaxDepth, 1.0f), dim_rcp).xyz;
|
||||
|
||||
|
||||
// Left plane
|
||||
GroupFrustum.planes[0] = ComputePlane(viewSpace[2], eyePos, viewSpace[0]);
|
||||
// Right plane
|
||||
GroupFrustum.planes[1] = ComputePlane(viewSpace[1], eyePos, viewSpace[3]);
|
||||
// Top plane
|
||||
GroupFrustum.planes[2] = ComputePlane(viewSpace[0], eyePos, viewSpace[1]);
|
||||
// Bottom plane
|
||||
GroupFrustum.planes[3] = ComputePlane(viewSpace[3], eyePos, viewSpace[2]);
|
||||
|
||||
// I construct an AABB around the minmax depth bounds to perform tighter culling:
|
||||
// The frustum is asymmetric so we must consider all corners!
|
||||
float3 minAABB = 10000000;
|
||||
float3 maxAABB = -10000000;
|
||||
[unroll]
|
||||
@@ -180,10 +179,6 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
GroupAABB_WS = GroupAABB;
|
||||
AABBtransform(GroupAABB_WS, GetCamera().inverse_view);
|
||||
}
|
||||
GroupAABB.c = WaveReadLaneFirst(GroupAABB.c);
|
||||
GroupAABB.e = WaveReadLaneFirst(GroupAABB.e);
|
||||
GroupAABB_WS.c = WaveReadLaneFirst(GroupAABB_WS.c);
|
||||
GroupAABB_WS.e = WaveReadLaneFirst(GroupAABB_WS.e);
|
||||
|
||||
// Convert depth values to view space.
|
||||
float minDepthVS = ScreenToView(float4(0, 0, fMinDepth, 1), dim_rcp).z;
|
||||
@@ -198,7 +193,7 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
uint __depthmaskUnrolled = 0;
|
||||
|
||||
[unroll]
|
||||
for (granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
|
||||
for (uint granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
|
||||
{
|
||||
float realDepthVS = ScreenToView(float4(0, 0, depth[granularity], 1), dim_rcp).z;
|
||||
const uint __depthmaskcellindex = max(0, min(31, floor((realDepthVS - minDepthVS) * __depthRangeRecip)));
|
||||
@@ -452,6 +447,9 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
#endif
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
const uint flatTileIndex = flatten2D(Gid.xy, GetCamera().entity_culling_tilecount.xy);
|
||||
const uint tileBucketsAddress = flatTileIndex * SHADER_ENTITY_TILE_BUCKET_COUNT;
|
||||
|
||||
// Each thread will export one bucket from LDS to global memory:
|
||||
for (uint i = groupIndex; i < SHADER_ENTITY_TILE_BUCKET_COUNT; i += TILED_CULLING_THREADSIZE * TILED_CULLING_THREADSIZE)
|
||||
@@ -461,7 +459,7 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
|
||||
}
|
||||
|
||||
#ifdef DEBUG_TILEDLIGHTCULLING
|
||||
for (granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
|
||||
for (uint granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
|
||||
{
|
||||
uint2 pixel = DTid.xy * uint2(TILED_CULLING_GRANULARITY, TILED_CULLING_GRANULARITY) + unflatten2D(granularity, TILED_CULLING_GRANULARITY);
|
||||
|
||||
|
||||
@@ -69,20 +69,18 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint3 GTid :
|
||||
#endif // RTSHADOW
|
||||
|
||||
[branch]
|
||||
if (lights().item_count() > 0)
|
||||
if (!lights().empty())
|
||||
{
|
||||
// Loop through light buckets in the tile:
|
||||
const uint first_item = lights().first_item();
|
||||
const uint last_item = lights().last_item();
|
||||
const uint first_bucket = first_item / 32;
|
||||
const uint last_bucket = min(last_item / 32, max(0, SHADER_ENTITY_TILE_BUCKET_COUNT - 1));
|
||||
[loop]
|
||||
for (uint bucket = first_bucket; bucket <= last_bucket && shadow_index < MAX_RTSHADOWS; ++bucket)
|
||||
ShaderEntityIterator iterator = lights();
|
||||
for (uint bucket = iterator.first_bucket(); bucket <= iterator.last_bucket(); ++bucket)
|
||||
{
|
||||
uint bucket_bits = load_entitytile(flatTileIndex + bucket);
|
||||
|
||||
// Bucket scalarizer - Siggraph 2017 - Improved Culling [Michal Drobot]:
|
||||
bucket_bits = WaveReadLaneFirst(WaveActiveBitOr(bucket_bits));
|
||||
|
||||
bucket_bits = iterator.mask_entity(bucket, bucket_bits);
|
||||
|
||||
[loop]
|
||||
while (bucket_bits != 0 && shadow_index < MAX_RTSHADOWS)
|
||||
@@ -91,208 +89,196 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint3 GTid :
|
||||
const uint bucket_bit_index = firstbitlow(bucket_bits);
|
||||
const uint entity_index = bucket * 32 + bucket_bit_index;
|
||||
bucket_bits ^= 1u << bucket_bit_index;
|
||||
|
||||
shadow_index = entity_index - lights().first_item();
|
||||
if (shadow_index >= MAX_RTSHADOWS)
|
||||
break;
|
||||
|
||||
// Check if it is a light and process:
|
||||
[branch]
|
||||
if (entity_index >= first_item && entity_index <= last_item)
|
||||
ShaderEntity light = load_entity(entity_index);
|
||||
|
||||
if (!light.IsCastingShadow())
|
||||
{
|
||||
shadow_index = entity_index - lights().first_item();
|
||||
if (shadow_index >= MAX_RTSHADOWS)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
ShaderEntity light = load_entity(entity_index);
|
||||
if (light.GetFlags() & ENTITY_FLAG_LIGHT_STATIC)
|
||||
{
|
||||
continue; // static lights will be skipped (they are used in lightmap baking)
|
||||
}
|
||||
|
||||
if (!light.IsCastingShadow())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
float3 L;
|
||||
ray.TMax = 0;
|
||||
|
||||
if (light.GetFlags() & ENTITY_FLAG_LIGHT_STATIC)
|
||||
{
|
||||
continue; // static lights will be skipped (they are used in lightmap baking)
|
||||
}
|
||||
|
||||
float3 L;
|
||||
ray.TMax = 0;
|
||||
|
||||
switch (light.GetType())
|
||||
{
|
||||
default:
|
||||
case ENTITY_TYPE_DIRECTIONALLIGHT:
|
||||
{
|
||||
L = normalize(light.GetDirection());
|
||||
switch (light.GetType())
|
||||
{
|
||||
default:
|
||||
case ENTITY_TYPE_DIRECTIONALLIGHT:
|
||||
{
|
||||
L = normalize(light.GetDirection());
|
||||
|
||||
#ifdef RTSHADOW
|
||||
L += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(L)) * light.GetRadius();
|
||||
L += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(L)) * light.GetRadius();
|
||||
#endif // RTSHADOW
|
||||
|
||||
SurfaceToLight surfaceToLight;
|
||||
surfaceToLight.create(surface, L);
|
||||
|
||||
[branch]
|
||||
if (any(surfaceToLight.NdotL))
|
||||
{
|
||||
[branch]
|
||||
if (light.IsCastingShadow())
|
||||
{
|
||||
ray.TMax = FLT_MAX;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case ENTITY_TYPE_POINTLIGHT:
|
||||
{
|
||||
#ifdef RTSHADOW
|
||||
light.position += light.GetDirection() * (bluenoise.z - 0.5) * light.GetLength();
|
||||
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
|
||||
#endif // RTSHADOW
|
||||
L = light.position - surface.P;
|
||||
const float dist2 = dot(L, L);
|
||||
const float range = light.GetRange();
|
||||
const float range2 = range * range;
|
||||
|
||||
[branch]
|
||||
if (dist2 < range2)
|
||||
{
|
||||
const float3 Lunnormalized = L;
|
||||
const float dist = sqrt(dist2);
|
||||
L /= dist;
|
||||
|
||||
SurfaceToLight surfaceToLight;
|
||||
surfaceToLight.create(surface, L);
|
||||
|
||||
[branch]
|
||||
if (any(surfaceToLight.NdotL))
|
||||
{
|
||||
[branch]
|
||||
if (light.IsCastingShadow())
|
||||
{
|
||||
ray.TMax = FLT_MAX;
|
||||
}
|
||||
ray.TMax = dist;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case ENTITY_TYPE_POINTLIGHT:
|
||||
{
|
||||
}
|
||||
break;
|
||||
case ENTITY_TYPE_SPOTLIGHT:
|
||||
{
|
||||
float3 Loriginal = normalize(light.position - surface.P);
|
||||
#ifdef RTSHADOW
|
||||
light.position += light.GetDirection() * (bluenoise.z - 0.5) * light.GetLength();
|
||||
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
|
||||
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
|
||||
#endif // RTSHADOW
|
||||
L = light.position - surface.P;
|
||||
const float dist2 = dot(L, L);
|
||||
const float range = light.GetRange();
|
||||
const float range2 = range * range;
|
||||
|
||||
[branch]
|
||||
if (dist2 < range2)
|
||||
{
|
||||
const float3 Lunnormalized = L;
|
||||
const float dist = sqrt(dist2);
|
||||
L /= dist;
|
||||
|
||||
SurfaceToLight surfaceToLight;
|
||||
surfaceToLight.create(surface, L);
|
||||
|
||||
[branch]
|
||||
if (any(surfaceToLight.NdotL))
|
||||
{
|
||||
ray.TMax = dist;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case ENTITY_TYPE_SPOTLIGHT:
|
||||
{
|
||||
float3 Loriginal = normalize(light.position - surface.P);
|
||||
#ifdef RTSHADOW
|
||||
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
|
||||
#endif // RTSHADOW
|
||||
L = light.position - surface.P;
|
||||
const float dist2 = dot(L, L);
|
||||
const float range2 = light.GetRange() * light.GetRange();
|
||||
|
||||
[branch]
|
||||
if (dist2 < range2)
|
||||
{
|
||||
const float dist = sqrt(dist2);
|
||||
L /= dist;
|
||||
|
||||
SurfaceToLight surfaceToLight;
|
||||
surfaceToLight.create(surface, L);
|
||||
|
||||
[branch]
|
||||
if (any(surfaceToLight.NdotL_sss) && (dot(Loriginal, light.GetDirection()) > light.GetConeAngleCos()))
|
||||
{
|
||||
ray.TMax = dist;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
L = light.position - surface.P;
|
||||
const float dist2 = dot(L, L);
|
||||
const float range2 = light.GetRange() * light.GetRange();
|
||||
|
||||
[branch]
|
||||
if (ray.TMax > 0)
|
||||
if (dist2 < range2)
|
||||
{
|
||||
#ifdef RTSHADOW
|
||||
// true ray traced shadow:
|
||||
uint seed = 0;
|
||||
float shadow = 0;
|
||||
const float dist = sqrt(dist2);
|
||||
L /= dist;
|
||||
|
||||
ray.Direction = L + max3(surface.sss);
|
||||
SurfaceToLight surfaceToLight;
|
||||
surfaceToLight.create(surface, L);
|
||||
|
||||
[branch]
|
||||
if (any(surfaceToLight.NdotL_sss) && (dot(Loriginal, light.GetDirection()) > light.GetConeAngleCos()))
|
||||
{
|
||||
ray.TMax = dist;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
[branch]
|
||||
if (ray.TMax > 0)
|
||||
{
|
||||
#ifdef RTSHADOW
|
||||
// true ray traced shadow:
|
||||
uint seed = 0;
|
||||
float shadow = 0;
|
||||
|
||||
ray.Direction = L + max3(surface.sss);
|
||||
|
||||
#ifdef RTAPI
|
||||
wiRayQuery q;
|
||||
q.TraceRayInline(
|
||||
scene_acceleration_structure, // RaytracingAccelerationStructure AccelerationStructure
|
||||
RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES |
|
||||
RAY_FLAG_CULL_FRONT_FACING_TRIANGLES |
|
||||
RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH, // uint RayFlags
|
||||
asuint(postprocess.params1.x), // uint InstanceInclusionMask
|
||||
ray // RayDesc Ray
|
||||
);
|
||||
while (q.Proceed())
|
||||
wiRayQuery q;
|
||||
q.TraceRayInline(
|
||||
scene_acceleration_structure, // RaytracingAccelerationStructure AccelerationStructure
|
||||
RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES |
|
||||
RAY_FLAG_CULL_FRONT_FACING_TRIANGLES |
|
||||
RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH, // uint RayFlags
|
||||
asuint(postprocess.params1.x), // uint InstanceInclusionMask
|
||||
ray // RayDesc Ray
|
||||
);
|
||||
while (q.Proceed())
|
||||
{
|
||||
if(q.CandidateType() != CANDIDATE_NON_OPAQUE_TRIANGLE) // see xbox coherent ray traversal documentation
|
||||
continue;
|
||||
PrimitiveID prim;
|
||||
prim.primitiveIndex = q.CandidatePrimitiveIndex();
|
||||
prim.instanceIndex = q.CandidateInstanceID();
|
||||
prim.subsetIndex = q.CandidateGeometryIndex();
|
||||
|
||||
Surface surface;
|
||||
surface.init();
|
||||
if (!surface.load(prim, q.CandidateTriangleBarycentrics()))
|
||||
break;
|
||||
|
||||
float alphatest = clamp(blue_noise(DTid.xy, q.CandidateTriangleRayT()).r, 0, 0.99);
|
||||
|
||||
[branch]
|
||||
if (surface.opacity - alphatest >= 0)
|
||||
{
|
||||
if(q.CandidateType() != CANDIDATE_NON_OPAQUE_TRIANGLE) // see xbox coherent ray traversal documentation
|
||||
continue;
|
||||
PrimitiveID prim;
|
||||
prim.primitiveIndex = q.CandidatePrimitiveIndex();
|
||||
prim.instanceIndex = q.CandidateInstanceID();
|
||||
prim.subsetIndex = q.CandidateGeometryIndex();
|
||||
|
||||
Surface surface;
|
||||
surface.init();
|
||||
if (!surface.load(prim, q.CandidateTriangleBarycentrics()))
|
||||
break;
|
||||
|
||||
float alphatest = clamp(blue_noise(DTid.xy, q.CandidateTriangleRayT()).r, 0, 0.99);
|
||||
|
||||
[branch]
|
||||
if (surface.opacity - alphatest >= 0)
|
||||
{
|
||||
q.CommitNonOpaqueTriangleHit();
|
||||
}
|
||||
q.CommitNonOpaqueTriangleHit();
|
||||
}
|
||||
shadow = q.CommittedStatus() == COMMITTED_TRIANGLE_HIT ? 0 : 1;
|
||||
}
|
||||
shadow = q.CommittedStatus() == COMMITTED_TRIANGLE_HIT ? 0 : 1;
|
||||
#else
|
||||
shadow = TraceRay_Any(newRay, asuint(postprocess.params1.x), groupIndex) ? 0 : 1;
|
||||
shadow = TraceRay_Any(newRay, asuint(postprocess.params1.x), groupIndex) ? 0 : 1;
|
||||
#endif // RTAPI
|
||||
|
||||
#else
|
||||
// screen space raymarch shadow:
|
||||
ray.Direction = normalize(mul((float3x3)GetCamera().view, L));
|
||||
float3 rayPos = ray.Origin + ray.Direction * stepsize * offset;
|
||||
// screen space raymarch shadow:
|
||||
ray.Direction = normalize(mul((float3x3)GetCamera().view, L));
|
||||
float3 rayPos = ray.Origin + ray.Direction * stepsize * offset;
|
||||
|
||||
float occlusion = 0;
|
||||
[loop]
|
||||
for (uint i = 0; i < samplecount; ++i)
|
||||
float occlusion = 0;
|
||||
[loop]
|
||||
for (uint i = 0; i < samplecount; ++i)
|
||||
{
|
||||
float4 proj = mul(GetCamera().projection, float4(rayPos, 1));
|
||||
proj.xyz /= proj.w;
|
||||
proj.xy = proj.xy * float2(0.5f, -0.5f) + float2(0.5f, 0.5f);
|
||||
|
||||
[branch]
|
||||
if (is_saturated(proj.xy))
|
||||
{
|
||||
float4 proj = mul(GetCamera().projection, float4(rayPos, 1));
|
||||
proj.xyz /= proj.w;
|
||||
proj.xy = proj.xy * float2(0.5f, -0.5f) + float2(0.5f, 0.5f);
|
||||
|
||||
[branch]
|
||||
if (is_saturated(proj.xy))
|
||||
const float ray_depth_real = proj.w;
|
||||
const float ray_depth_sample = texture_lineardepth.SampleLevel(sampler_point_clamp, proj.xy, 1) * GetCamera().z_far;
|
||||
const float ray_depth_delta = ray_depth_real - ray_depth_sample;
|
||||
if (ray_depth_delta > 0.02 && ray_depth_delta < thickness)
|
||||
{
|
||||
const float ray_depth_real = proj.w;
|
||||
const float ray_depth_sample = texture_lineardepth.SampleLevel(sampler_point_clamp, proj.xy, 1) * GetCamera().z_far;
|
||||
const float ray_depth_delta = ray_depth_real - ray_depth_sample;
|
||||
if (ray_depth_delta > 0.02 && ray_depth_delta < thickness)
|
||||
{
|
||||
occlusion = 1 - pow(float(i) / float(samplecount), 8);
|
||||
occlusion = 1 - pow(float(i) / float(samplecount), 8);
|
||||
|
||||
// screen edge fade:
|
||||
float2 fade = max(12 * abs(proj.xy - 0.5) - 5, 0);
|
||||
occlusion *= saturate(1 - dot(fade, fade));
|
||||
// screen edge fade:
|
||||
float2 fade = max(12 * abs(proj.xy - 0.5) - 5, 0);
|
||||
occlusion *= saturate(1 - dot(fade, fade));
|
||||
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
rayPos += ray.Direction * stepsize;
|
||||
}
|
||||
float shadow = 1 - occlusion;
|
||||
|
||||
rayPos += ray.Direction * stepsize;
|
||||
}
|
||||
float shadow = 1 - occlusion;
|
||||
#endif // RTSHADOW
|
||||
|
||||
uint mask = uint(saturate(shadow) * 255); // 8 bits
|
||||
uint mask_shift = (shadow_index % 4) * 8;
|
||||
uint mask_bucket = shadow_index / 4;
|
||||
shadow_mask[mask_bucket] |= mask << mask_shift;
|
||||
}
|
||||
|
||||
}
|
||||
else if (entity_index > last_item)
|
||||
{
|
||||
// force exit:
|
||||
bucket = SHADER_ENTITY_TILE_BUCKET_COUNT;
|
||||
break;
|
||||
uint mask = uint(saturate(shadow) * 255); // 8 bits
|
||||
uint mask_shift = (shadow_index % 4) * 8;
|
||||
uint mask_bucket = shadow_index / 4;
|
||||
shadow_mask[mask_bucket] |= mask << mask_shift;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -374,75 +374,7 @@ inline void TiledLighting(inout Surface surface, inout Lighting lighting, uint f
|
||||
lighting.indirect.diffuse = ddgi_sample_irradiance(surface.P, surface.N);
|
||||
surface.SetGIApplied(true);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Combined light loops:
|
||||
|
||||
[branch]
|
||||
if (!lights().empty())
|
||||
{
|
||||
// Loop through light buckets in the tile:
|
||||
ShaderEntityIterator iterator = lights();
|
||||
for(uint bucket = iterator.first_bucket(); bucket <= iterator.last_bucket(); ++bucket)
|
||||
{
|
||||
uint bucket_bits = load_entitytile(flatTileIndex + bucket);
|
||||
|
||||
#ifndef ENTITY_TILE_UNIFORM
|
||||
// Bucket scalarizer - Siggraph 2017 - Improved Culling [Michal Drobot]:
|
||||
bucket_bits = WaveReadLaneFirst(WaveActiveBitOr(bucket_bits));
|
||||
#endif // ENTITY_TILE_UNIFORM
|
||||
|
||||
bucket_bits = iterator.mask_entity(bucket, bucket_bits);
|
||||
|
||||
[loop]
|
||||
while (bucket_bits != 0)
|
||||
{
|
||||
// Retrieve global entity index from local bucket, then remove bit from local bucket:
|
||||
const uint bucket_bit_index = firstbitlow(bucket_bits);
|
||||
const uint entity_index = bucket * 32 + bucket_bit_index;
|
||||
bucket_bits ^= 1u << bucket_bit_index;
|
||||
|
||||
ShaderEntity light = load_entity(entity_index);
|
||||
|
||||
half shadow_mask = 1;
|
||||
#if defined(SHADOW_MASK_ENABLED) && !defined(TRANSPARENT)
|
||||
[branch]
|
||||
if (light.IsCastingShadow() && (GetFrame().options & OPTION_BIT_SHADOW_MASK) && (GetCamera().options & SHADERCAMERA_OPTION_USE_SHADOW_MASK) && GetCamera().texture_rtshadow_index >= 0)
|
||||
{
|
||||
uint shadow_index = entity_index - lights().first_item();
|
||||
if (shadow_index < 16)
|
||||
{
|
||||
shadow_mask = (half)bindless_textures2DArray[GetCamera().texture_rtshadow_index][uint3(surface.pixel, shadow_index)].r;
|
||||
}
|
||||
}
|
||||
#endif // SHADOW_MASK_ENABLED && !TRANSPARENT
|
||||
|
||||
switch (light.GetType())
|
||||
{
|
||||
case ENTITY_TYPE_DIRECTIONALLIGHT:
|
||||
{
|
||||
light_directional(light, surface, lighting);
|
||||
}
|
||||
break;
|
||||
case ENTITY_TYPE_POINTLIGHT:
|
||||
{
|
||||
light_point(light, surface, lighting);
|
||||
}
|
||||
break;
|
||||
case ENTITY_TYPE_SPOTLIGHT:
|
||||
{
|
||||
light_spot(light, surface, lighting);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
// Separated light loops by type:
|
||||
|
||||
|
||||
[branch]
|
||||
if (!directional_lights().empty())
|
||||
{
|
||||
@@ -558,7 +490,6 @@ inline void TiledLighting(inout Surface surface, inout Lighting lighting, uint f
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
#include "globals.hlsli"
|
||||
#include "cullingShaderHF.hlsli"
|
||||
|
||||
// View space frustums for the grid cells.
|
||||
RWStructuredBuffer<Frustum> out_Frustums : register(u0);
|
||||
|
||||
// Tile frustum generation pass.
// Every thread handles the culling tile addressed by DTid.xy: the four corners of the tile
// (screen space, depth 1.0) are converted to view space, the four side planes running
// through the eye are derived from them, and the result is written to out_Frustums.
[numthreads(TILED_CULLING_BLOCKSIZE, TILED_CULLING_BLOCKSIZE, 1)]
void main(uint3 DTid : SV_DispatchThreadID)
{
	// Reciprocal of the depth texture size; handed to ScreenToView below.
	uint2 resolution;
	texture_depth.GetDimensions(resolution.x, resolution.y);
	float2 resolution_rcp = rcp(resolution);

	// In view space the eye is located at the origin.
	const float3 origin = float3(0, 0, 0);

	// Corner points of this tile (z = 1, w = 1 in screen space), converted straight to view space.
	const float3 topLeft = ScreenToView(float4(DTid.xy * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f), resolution_rcp).xyz;
	const float3 topRight = ScreenToView(float4(float2(DTid.x + 1, DTid.y) * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f), resolution_rcp).xyz;
	const float3 bottomLeft = ScreenToView(float4(float2(DTid.x, DTid.y + 1) * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f), resolution_rcp).xyz;
	const float3 bottomRight = ScreenToView(float4(float2(DTid.x + 1, DTid.y + 1) * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f), resolution_rcp).xyz;

	// Each side plane is spanned by the eye and two neighbouring corners.
	Frustum tileFrustum;
	tileFrustum.planes[0] = ComputePlane(bottomLeft, origin, topLeft);     // left
	tileFrustum.planes[1] = ComputePlane(topRight, origin, bottomRight);   // right
	tileFrustum.planes[2] = ComputePlane(topLeft, origin, topRight);       // top
	tileFrustum.planes[3] = ComputePlane(bottomRight, origin, bottomLeft); // bottom

	// Threads that fall outside the tile grid must not write anything.
	if (DTid.x >= GetCamera().entity_culling_tilecount.x || DTid.y >= GetCamera().entity_culling_tilecount.y)
	{
		return;
	}

	out_Frustums[flatten2D(DTid.xy, GetCamera().entity_culling_tilecount.xy)] = tileFrustum;
}
|
||||
@@ -228,7 +228,6 @@ namespace wi::enums
|
||||
CSTYPE_LUMINANCE_PASS2,
|
||||
CSTYPE_SHADINGRATECLASSIFICATION,
|
||||
CSTYPE_SHADINGRATECLASSIFICATION_DEBUG,
|
||||
CSTYPE_TILEFRUSTUMS,
|
||||
CSTYPE_LIGHTCULLING,
|
||||
CSTYPE_LIGHTCULLING_DEBUG,
|
||||
CSTYPE_LIGHTCULLING_ADVANCED,
|
||||
|
||||
+10
-66
@@ -992,7 +992,6 @@ void LoadShaders()
|
||||
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LUMINANCE_PASS2], "luminancePass2CS.cso"); });
|
||||
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_SHADINGRATECLASSIFICATION], "shadingRateClassificationCS.cso"); });
|
||||
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_SHADINGRATECLASSIFICATION_DEBUG], "shadingRateClassificationCS_DEBUG.cso"); });
|
||||
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_TILEFRUSTUMS], "tileFrustumsCS.cso"); });
|
||||
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LIGHTCULLING], "lightCullingCS.cso"); });
|
||||
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LIGHTCULLING_DEBUG], "lightCullingCS_DEBUG.cso"); });
|
||||
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LIGHTCULLING_ADVANCED], "lightCullingCS_ADVANCED.cso"); });
|
||||
@@ -9385,26 +9384,14 @@ void CreateTiledLightResources(TiledLightResources& res, XMUINT2 resolution)
|
||||
{
|
||||
res.tileCount = GetEntityCullingTileCount(resolution);
|
||||
|
||||
{
|
||||
GPUBufferDesc bd;
|
||||
bd.stride = sizeof(XMFLOAT4) * 4; // storing 4 planes for every tile
|
||||
bd.size = bd.stride * res.tileCount.x * res.tileCount.y;
|
||||
bd.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS;
|
||||
bd.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
|
||||
bd.usage = Usage::DEFAULT;
|
||||
device->CreateBuffer(&bd, nullptr, &res.tileFrustums);
|
||||
device->SetName(&res.tileFrustums, "tileFrustums");
|
||||
}
|
||||
{
|
||||
GPUBufferDesc bd;
|
||||
bd.stride = sizeof(uint);
|
||||
bd.size = res.tileCount.x * res.tileCount.y * bd.stride * SHADER_ENTITY_TILE_BUCKET_COUNT * 2; // *2: opaque and transparent arrays
|
||||
bd.usage = Usage::DEFAULT;
|
||||
bd.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
|
||||
bd.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
|
||||
device->CreateBuffer(&bd, nullptr, &res.entityTiles);
|
||||
device->SetName(&res.entityTiles, "entityTiles");
|
||||
}
|
||||
GPUBufferDesc bd;
|
||||
bd.stride = sizeof(uint);
|
||||
bd.size = res.tileCount.x * res.tileCount.y * bd.stride * SHADER_ENTITY_TILE_BUCKET_COUNT * 2; // *2: opaque and transparent arrays
|
||||
bd.usage = Usage::DEFAULT;
|
||||
bd.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
|
||||
bd.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
|
||||
device->CreateBuffer(&bd, nullptr, &res.entityTiles);
|
||||
device->SetName(&res.entityTiles, "entityTiles");
|
||||
}
|
||||
void ComputeTiledLightCulling(
|
||||
const TiledLightResources& res,
|
||||
@@ -9415,14 +9402,7 @@ void ComputeTiledLightCulling(
|
||||
{
|
||||
auto range = wi::profiler::BeginRangeGPU("Entity Culling", cmd);
|
||||
|
||||
// Initial barriers to put all resources into UAV:
|
||||
{
|
||||
GPUBarrier barriers[] = {
|
||||
GPUBarrier::Buffer(&res.tileFrustums, ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS),
|
||||
GPUBarrier::Buffer(&res.entityTiles, ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS),
|
||||
};
|
||||
device->Barrier(barriers, arraysize(barriers), cmd);
|
||||
}
|
||||
device->Barrier(GPUBarrier::Buffer(&res.entityTiles, ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), cmd);
|
||||
|
||||
if (
|
||||
vis.visibleLights.empty() &&
|
||||
@@ -9431,15 +9411,8 @@ void ComputeTiledLightCulling(
|
||||
)
|
||||
{
|
||||
device->EventBegin("Tiled Entity Clear Only", cmd);
|
||||
device->ClearUAV(&res.tileFrustums, 0, cmd);
|
||||
device->ClearUAV(&res.entityTiles, 0, cmd);
|
||||
{
|
||||
GPUBarrier barriers[] = {
|
||||
GPUBarrier::Buffer(&res.tileFrustums, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
|
||||
GPUBarrier::Buffer(&res.entityTiles, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
|
||||
};
|
||||
device->Barrier(barriers, arraysize(barriers), cmd);
|
||||
}
|
||||
device->Barrier(GPUBarrier::Buffer(&res.entityTiles, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), cmd);
|
||||
device->EventEnd(cmd);
|
||||
wi::profiler::EndRange(range);
|
||||
return;
|
||||
@@ -9447,39 +9420,10 @@ void ComputeTiledLightCulling(
|
||||
|
||||
BindCommonResources(cmd);
|
||||
|
||||
// Frustum computation
|
||||
{
|
||||
device->EventBegin("Tile Frustums", cmd);
|
||||
device->BindComputeShader(&shaders[CSTYPE_TILEFRUSTUMS], cmd);
|
||||
|
||||
const GPUResource* uavs[] = {
|
||||
&res.tileFrustums
|
||||
};
|
||||
device->BindUAVs(uavs, 0, arraysize(uavs), cmd);
|
||||
|
||||
device->Dispatch(
|
||||
(res.tileCount.x + TILED_CULLING_BLOCKSIZE - 1) / TILED_CULLING_BLOCKSIZE,
|
||||
(res.tileCount.y + TILED_CULLING_BLOCKSIZE - 1) / TILED_CULLING_BLOCKSIZE,
|
||||
1,
|
||||
cmd
|
||||
);
|
||||
|
||||
{
|
||||
GPUBarrier barriers[] = {
|
||||
GPUBarrier::Buffer(&res.tileFrustums, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
|
||||
};
|
||||
device->Barrier(barriers, arraysize(barriers), cmd);
|
||||
}
|
||||
|
||||
device->EventEnd(cmd);
|
||||
}
|
||||
|
||||
// Perform the culling
|
||||
{
|
||||
device->EventBegin("Entity Culling", cmd);
|
||||
|
||||
device->BindResource(&res.tileFrustums, 0, cmd);
|
||||
|
||||
if (GetDebugLightCulling() && debugUAV.IsValid())
|
||||
{
|
||||
device->BindComputeShader(&shaders[GetAdvancedLightCulling() ? CSTYPE_LIGHTCULLING_ADVANCED_DEBUG : CSTYPE_LIGHTCULLING_DEBUG], cmd);
|
||||
|
||||
@@ -327,7 +327,6 @@ namespace wi::renderer
|
||||
// GPU-side resources used by the tiled entity (light) culling pass.
// Filled in by CreateTiledLightResources() for a given resolution.
struct TiledLightResources
|
||||
{
|
||||
// Number of culling tiles along x and y (obtained from GetEntityCullingTileCount(resolution)).
XMUINT2 tileCount = {};
|
||||
wi::graphics::GPUBuffer tileFrustums; // entity culling frustums (structured buffer, 4 planes stored per tile)
|
||||
wi::graphics::GPUBuffer entityTiles; // culled entity indices (uint buckets per tile; opaque and transparent arrays)
|
||||
};
|
||||
void CreateTiledLightResources(TiledLightResources& res, XMUINT2 resolution);
|
||||
|
||||
@@ -9,7 +9,7 @@ namespace wi::version
|
||||
// minor features, major updates, breaking compatibility changes
|
||||
const int minor = 71;
|
||||
// minor bug fixes, alterations, refactors, updates
|
||||
const int revision = 556;
|
||||
const int revision = 557;
|
||||
|
||||
const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user