light culling optimizations

This commit is contained in:
Turánszki János
2024-08-27 07:57:43 +02:00
parent c51af2aa7c
commit 8a3b4fab13
11 changed files with 190 additions and 400 deletions
-1
View File
@@ -53,7 +53,6 @@ wi::vector<ShaderEntry> shaders = {
{"upsample_bilateral_unorm1CS", wi::graphics::ShaderStage::CS},
{"upsample_bilateral_unorm4CS", wi::graphics::ShaderStage::CS},
{"temporalaaCS", wi::graphics::ShaderStage::CS},
{"tileFrustumsCS", wi::graphics::ShaderStage::CS},
{"tonemapCS", wi::graphics::ShaderStage::CS},
{"underwaterCS", wi::graphics::ShaderStage::CS},
{"fsr_upscalingCS", wi::graphics::ShaderStage::CS},
@@ -2556,16 +2556,6 @@
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">Compute</ShaderType>
</FxCompile>
<FxCompile Include="$(MSBuildThisFileDirectory)tileFrustumsCS.hlsl">
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">Compute</ShaderType>
</FxCompile>
<FxCompile Include="$(MSBuildThisFileDirectory)tonemapCS.hlsl">
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Compute</ShaderType>
<ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Compute</ShaderType>
@@ -251,9 +251,6 @@
<FxCompile Include="$(MSBuildThisFileDirectory)temporalaaCS.hlsl">
<Filter>CS</Filter>
</FxCompile>
<FxCompile Include="$(MSBuildThisFileDirectory)tileFrustumsCS.hlsl">
<Filter>CS</Filter>
</FxCompile>
<FxCompile Include="$(MSBuildThisFileDirectory)tonemapCS.hlsl">
<Filter>CS</Filter>
</FxCompile>
+23 -25
View File
@@ -4,8 +4,6 @@
#define entityCount (GetFrame().entity_culling_count)
StructuredBuffer<Frustum> in_Frustums : register(t0);
RWStructuredBuffer<uint> entityTiles : register(u0);
// Group shared variables.
@@ -79,14 +77,6 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
texture_depth.GetDimensions(dim.x, dim.y);
float2 dim_rcp = rcp(dim);
// This controls the unrolling granularity if the blocksize and threadsize are different:
uint granularity = 0;
// Compute addresses and load frustum:
const uint flatTileIndex = flatten2D(Gid.xy, GetCamera().entity_culling_tilecount.xy);
const uint tileBucketsAddress = flatTileIndex * SHADER_ENTITY_TILE_BUCKET_COUNT;
Frustum GroupFrustum = in_Frustums[flatTileIndex];
// Each thread will zero out one bucket in the LDS:
for (uint i = groupIndex; i < SHADER_ENTITY_TILE_BUCKET_COUNT; i += TILED_CULLING_THREADSIZE * TILED_CULLING_THREADSIZE)
{
@@ -112,7 +102,7 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
float depthMaxUnrolled = -10000000;
[unroll]
for (granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
for (uint granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
{
uint2 pixel = DTid.xy * uint2(TILED_CULLING_GRANULARITY, TILED_CULLING_GRANULARITY) + unflatten2D(granularity, TILED_CULLING_GRANULARITY);
pixel = min(pixel, dim - 1); // avoid loading from outside the texture, it messes up the min-max depth!
@@ -137,16 +127,15 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
float fMinDepth = asfloat(uMaxDepth);
float fMaxDepth = asfloat(uMinDepth);
// Note: the following will be SGPR
Frustum GroupFrustum;
AABB GroupAABB; // frustum AABB around min-max depth in View Space
AABB GroupAABB_WS; // frustum AABB in world space
if(WaveIsFirstLane())
{
// I construct an AABB around the minmax depth bounds to perform tighter culling:
// The frustum is asymmetric so we must consider all corners!
// View space eye position is always at the origin.
const float3 eyePos = float3(0, 0, 0);
// View space frustum corners:
float3 viewSpace[8];
// Top left point, near
viewSpace[0] = ScreenToView(float4(Gid.xy * TILED_CULLING_BLOCKSIZE, fMinDepth, 1.0f), dim_rcp).xyz;
// Top right point, near
@@ -155,7 +144,6 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
viewSpace[2] = ScreenToView(float4(float2(Gid.x, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMinDepth, 1.0f), dim_rcp).xyz;
// Bottom right point, near
viewSpace[3] = ScreenToView(float4(float2(Gid.x + 1, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMinDepth, 1.0f), dim_rcp).xyz;
// Top left point, far
viewSpace[4] = ScreenToView(float4(Gid.xy * TILED_CULLING_BLOCKSIZE, fMaxDepth, 1.0f), dim_rcp).xyz;
// Top right point, far
@@ -164,7 +152,18 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
viewSpace[6] = ScreenToView(float4(float2(Gid.x, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMaxDepth, 1.0f), dim_rcp).xyz;
// Bottom right point, far
viewSpace[7] = ScreenToView(float4(float2(Gid.x + 1, Gid.y + 1) * TILED_CULLING_BLOCKSIZE, fMaxDepth, 1.0f), dim_rcp).xyz;
// Left plane
GroupFrustum.planes[0] = ComputePlane(viewSpace[2], eyePos, viewSpace[0]);
// Right plane
GroupFrustum.planes[1] = ComputePlane(viewSpace[1], eyePos, viewSpace[3]);
// Top plane
GroupFrustum.planes[2] = ComputePlane(viewSpace[0], eyePos, viewSpace[1]);
// Bottom plane
GroupFrustum.planes[3] = ComputePlane(viewSpace[3], eyePos, viewSpace[2]);
// I construct an AABB around the minmax depth bounds to perform tighter culling:
// The frustum is asymmetric so we must consider all corners!
float3 minAABB = 10000000;
float3 maxAABB = -10000000;
[unroll]
@@ -180,10 +179,6 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
GroupAABB_WS = GroupAABB;
AABBtransform(GroupAABB_WS, GetCamera().inverse_view);
}
GroupAABB.c = WaveReadLaneFirst(GroupAABB.c);
GroupAABB.e = WaveReadLaneFirst(GroupAABB.e);
GroupAABB_WS.c = WaveReadLaneFirst(GroupAABB_WS.c);
GroupAABB_WS.e = WaveReadLaneFirst(GroupAABB_WS.e);
// Convert depth values to view space.
float minDepthVS = ScreenToView(float4(0, 0, fMinDepth, 1), dim_rcp).z;
@@ -198,7 +193,7 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
uint __depthmaskUnrolled = 0;
[unroll]
for (granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
for (uint granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
{
float realDepthVS = ScreenToView(float4(0, 0, depth[granularity], 1), dim_rcp).z;
const uint __depthmaskcellindex = max(0, min(31, floor((realDepthVS - minDepthVS) * __depthRangeRecip)));
@@ -452,6 +447,9 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
#endif
GroupMemoryBarrierWithGroupSync();
const uint flatTileIndex = flatten2D(Gid.xy, GetCamera().entity_culling_tilecount.xy);
const uint tileBucketsAddress = flatTileIndex * SHADER_ENTITY_TILE_BUCKET_COUNT;
// Each thread will export one bucket from LDS to global memory:
for (uint i = groupIndex; i < SHADER_ENTITY_TILE_BUCKET_COUNT; i += TILED_CULLING_THREADSIZE * TILED_CULLING_THREADSIZE)
@@ -461,7 +459,7 @@ void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid :
}
#ifdef DEBUG_TILEDLIGHTCULLING
for (granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
for (uint granularity = 0; granularity < TILED_CULLING_GRANULARITY * TILED_CULLING_GRANULARITY; ++granularity)
{
uint2 pixel = DTid.xy * uint2(TILED_CULLING_GRANULARITY, TILED_CULLING_GRANULARITY) + unflatten2D(granularity, TILED_CULLING_GRANULARITY);
+155 -169
View File
@@ -69,20 +69,18 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint3 GTid :
#endif // RTSHADOW
[branch]
if (lights().item_count() > 0)
if (!lights().empty())
{
// Loop through light buckets in the tile:
const uint first_item = lights().first_item();
const uint last_item = lights().last_item();
const uint first_bucket = first_item / 32;
const uint last_bucket = min(last_item / 32, max(0, SHADER_ENTITY_TILE_BUCKET_COUNT - 1));
[loop]
for (uint bucket = first_bucket; bucket <= last_bucket && shadow_index < MAX_RTSHADOWS; ++bucket)
ShaderEntityIterator iterator = lights();
for (uint bucket = iterator.first_bucket(); bucket <= iterator.last_bucket(); ++bucket)
{
uint bucket_bits = load_entitytile(flatTileIndex + bucket);
// Bucket scalarizer - Siggraph 2017 - Improved Culling [Michal Drobot]:
bucket_bits = WaveReadLaneFirst(WaveActiveBitOr(bucket_bits));
bucket_bits = iterator.mask_entity(bucket, bucket_bits);
[loop]
while (bucket_bits != 0 && shadow_index < MAX_RTSHADOWS)
@@ -91,208 +89,196 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint3 GTid :
const uint bucket_bit_index = firstbitlow(bucket_bits);
const uint entity_index = bucket * 32 + bucket_bit_index;
bucket_bits ^= 1u << bucket_bit_index;
shadow_index = entity_index - lights().first_item();
if (shadow_index >= MAX_RTSHADOWS)
break;
// Check if it is a light and process:
[branch]
if (entity_index >= first_item && entity_index <= last_item)
ShaderEntity light = load_entity(entity_index);
if (!light.IsCastingShadow())
{
shadow_index = entity_index - lights().first_item();
if (shadow_index >= MAX_RTSHADOWS)
break;
continue;
}
ShaderEntity light = load_entity(entity_index);
if (light.GetFlags() & ENTITY_FLAG_LIGHT_STATIC)
{
continue; // static lights will be skipped (they are used in lightmap baking)
}
if (!light.IsCastingShadow())
{
continue;
}
float3 L;
ray.TMax = 0;
if (light.GetFlags() & ENTITY_FLAG_LIGHT_STATIC)
{
continue; // static lights will be skipped (they are used in lightmap baking)
}
float3 L;
ray.TMax = 0;
switch (light.GetType())
{
default:
case ENTITY_TYPE_DIRECTIONALLIGHT:
{
L = normalize(light.GetDirection());
switch (light.GetType())
{
default:
case ENTITY_TYPE_DIRECTIONALLIGHT:
{
L = normalize(light.GetDirection());
#ifdef RTSHADOW
L += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(L)) * light.GetRadius();
L += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(L)) * light.GetRadius();
#endif // RTSHADOW
SurfaceToLight surfaceToLight;
surfaceToLight.create(surface, L);
[branch]
if (any(surfaceToLight.NdotL))
{
[branch]
if (light.IsCastingShadow())
{
ray.TMax = FLT_MAX;
}
}
}
break;
case ENTITY_TYPE_POINTLIGHT:
{
#ifdef RTSHADOW
light.position += light.GetDirection() * (bluenoise.z - 0.5) * light.GetLength();
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
#endif // RTSHADOW
L = light.position - surface.P;
const float dist2 = dot(L, L);
const float range = light.GetRange();
const float range2 = range * range;
[branch]
if (dist2 < range2)
{
const float3 Lunnormalized = L;
const float dist = sqrt(dist2);
L /= dist;
SurfaceToLight surfaceToLight;
surfaceToLight.create(surface, L);
[branch]
if (any(surfaceToLight.NdotL))
{
[branch]
if (light.IsCastingShadow())
{
ray.TMax = FLT_MAX;
}
ray.TMax = dist;
}
}
break;
case ENTITY_TYPE_POINTLIGHT:
{
}
break;
case ENTITY_TYPE_SPOTLIGHT:
{
float3 Loriginal = normalize(light.position - surface.P);
#ifdef RTSHADOW
light.position += light.GetDirection() * (bluenoise.z - 0.5) * light.GetLength();
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
#endif // RTSHADOW
L = light.position - surface.P;
const float dist2 = dot(L, L);
const float range = light.GetRange();
const float range2 = range * range;
[branch]
if (dist2 < range2)
{
const float3 Lunnormalized = L;
const float dist = sqrt(dist2);
L /= dist;
SurfaceToLight surfaceToLight;
surfaceToLight.create(surface, L);
[branch]
if (any(surfaceToLight.NdotL))
{
ray.TMax = dist;
}
}
}
break;
case ENTITY_TYPE_SPOTLIGHT:
{
float3 Loriginal = normalize(light.position - surface.P);
#ifdef RTSHADOW
light.position += mul(hemispherepoint_cos(bluenoise.x, bluenoise.y), get_tangentspace(normalize(light.position - surface.P))) * light.GetRadius();
#endif // RTSHADOW
L = light.position - surface.P;
const float dist2 = dot(L, L);
const float range2 = light.GetRange() * light.GetRange();
[branch]
if (dist2 < range2)
{
const float dist = sqrt(dist2);
L /= dist;
SurfaceToLight surfaceToLight;
surfaceToLight.create(surface, L);
[branch]
if (any(surfaceToLight.NdotL_sss) && (dot(Loriginal, light.GetDirection()) > light.GetConeAngleCos()))
{
ray.TMax = dist;
}
}
}
break;
}
L = light.position - surface.P;
const float dist2 = dot(L, L);
const float range2 = light.GetRange() * light.GetRange();
[branch]
if (ray.TMax > 0)
if (dist2 < range2)
{
#ifdef RTSHADOW
// true ray traced shadow:
uint seed = 0;
float shadow = 0;
const float dist = sqrt(dist2);
L /= dist;
ray.Direction = L + max3(surface.sss);
SurfaceToLight surfaceToLight;
surfaceToLight.create(surface, L);
[branch]
if (any(surfaceToLight.NdotL_sss) && (dot(Loriginal, light.GetDirection()) > light.GetConeAngleCos()))
{
ray.TMax = dist;
}
}
}
break;
}
[branch]
if (ray.TMax > 0)
{
#ifdef RTSHADOW
// true ray traced shadow:
uint seed = 0;
float shadow = 0;
ray.Direction = L + max3(surface.sss);
#ifdef RTAPI
wiRayQuery q;
q.TraceRayInline(
scene_acceleration_structure, // RaytracingAccelerationStructure AccelerationStructure
RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES |
RAY_FLAG_CULL_FRONT_FACING_TRIANGLES |
RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH, // uint RayFlags
asuint(postprocess.params1.x), // uint InstanceInclusionMask
ray // RayDesc Ray
);
while (q.Proceed())
wiRayQuery q;
q.TraceRayInline(
scene_acceleration_structure, // RaytracingAccelerationStructure AccelerationStructure
RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES |
RAY_FLAG_CULL_FRONT_FACING_TRIANGLES |
RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH, // uint RayFlags
asuint(postprocess.params1.x), // uint InstanceInclusionMask
ray // RayDesc Ray
);
while (q.Proceed())
{
if(q.CandidateType() != CANDIDATE_NON_OPAQUE_TRIANGLE) // see xbox coherent ray traversal documentation
continue;
PrimitiveID prim;
prim.primitiveIndex = q.CandidatePrimitiveIndex();
prim.instanceIndex = q.CandidateInstanceID();
prim.subsetIndex = q.CandidateGeometryIndex();
Surface surface;
surface.init();
if (!surface.load(prim, q.CandidateTriangleBarycentrics()))
break;
float alphatest = clamp(blue_noise(DTid.xy, q.CandidateTriangleRayT()).r, 0, 0.99);
[branch]
if (surface.opacity - alphatest >= 0)
{
if(q.CandidateType() != CANDIDATE_NON_OPAQUE_TRIANGLE) // see xbox coherent ray traversal documentation
continue;
PrimitiveID prim;
prim.primitiveIndex = q.CandidatePrimitiveIndex();
prim.instanceIndex = q.CandidateInstanceID();
prim.subsetIndex = q.CandidateGeometryIndex();
Surface surface;
surface.init();
if (!surface.load(prim, q.CandidateTriangleBarycentrics()))
break;
float alphatest = clamp(blue_noise(DTid.xy, q.CandidateTriangleRayT()).r, 0, 0.99);
[branch]
if (surface.opacity - alphatest >= 0)
{
q.CommitNonOpaqueTriangleHit();
}
q.CommitNonOpaqueTriangleHit();
}
shadow = q.CommittedStatus() == COMMITTED_TRIANGLE_HIT ? 0 : 1;
}
shadow = q.CommittedStatus() == COMMITTED_TRIANGLE_HIT ? 0 : 1;
#else
shadow = TraceRay_Any(newRay, asuint(postprocess.params1.x), groupIndex) ? 0 : 1;
shadow = TraceRay_Any(newRay, asuint(postprocess.params1.x), groupIndex) ? 0 : 1;
#endif // RTAPI
#else
// screen space raymarch shadow:
ray.Direction = normalize(mul((float3x3)GetCamera().view, L));
float3 rayPos = ray.Origin + ray.Direction * stepsize * offset;
// screen space raymarch shadow:
ray.Direction = normalize(mul((float3x3)GetCamera().view, L));
float3 rayPos = ray.Origin + ray.Direction * stepsize * offset;
float occlusion = 0;
[loop]
for (uint i = 0; i < samplecount; ++i)
float occlusion = 0;
[loop]
for (uint i = 0; i < samplecount; ++i)
{
float4 proj = mul(GetCamera().projection, float4(rayPos, 1));
proj.xyz /= proj.w;
proj.xy = proj.xy * float2(0.5f, -0.5f) + float2(0.5f, 0.5f);
[branch]
if (is_saturated(proj.xy))
{
float4 proj = mul(GetCamera().projection, float4(rayPos, 1));
proj.xyz /= proj.w;
proj.xy = proj.xy * float2(0.5f, -0.5f) + float2(0.5f, 0.5f);
[branch]
if (is_saturated(proj.xy))
const float ray_depth_real = proj.w;
const float ray_depth_sample = texture_lineardepth.SampleLevel(sampler_point_clamp, proj.xy, 1) * GetCamera().z_far;
const float ray_depth_delta = ray_depth_real - ray_depth_sample;
if (ray_depth_delta > 0.02 && ray_depth_delta < thickness)
{
const float ray_depth_real = proj.w;
const float ray_depth_sample = texture_lineardepth.SampleLevel(sampler_point_clamp, proj.xy, 1) * GetCamera().z_far;
const float ray_depth_delta = ray_depth_real - ray_depth_sample;
if (ray_depth_delta > 0.02 && ray_depth_delta < thickness)
{
occlusion = 1 - pow(float(i) / float(samplecount), 8);
occlusion = 1 - pow(float(i) / float(samplecount), 8);
// screen edge fade:
float2 fade = max(12 * abs(proj.xy - 0.5) - 5, 0);
occlusion *= saturate(1 - dot(fade, fade));
// screen edge fade:
float2 fade = max(12 * abs(proj.xy - 0.5) - 5, 0);
occlusion *= saturate(1 - dot(fade, fade));
break;
}
break;
}
rayPos += ray.Direction * stepsize;
}
float shadow = 1 - occlusion;
rayPos += ray.Direction * stepsize;
}
float shadow = 1 - occlusion;
#endif // RTSHADOW
uint mask = uint(saturate(shadow) * 255); // 8 bits
uint mask_shift = (shadow_index % 4) * 8;
uint mask_bucket = shadow_index / 4;
shadow_mask[mask_bucket] |= mask << mask_shift;
}
}
else if (entity_index > last_item)
{
// force exit:
bucket = SHADER_ENTITY_TILE_BUCKET_COUNT;
break;
uint mask = uint(saturate(shadow) * 255); // 8 bits
uint mask_shift = (shadow_index % 4) * 8;
uint mask_bucket = shadow_index / 4;
shadow_mask[mask_bucket] |= mask << mask_shift;
}
}
+1 -70
View File
@@ -374,75 +374,7 @@ inline void TiledLighting(inout Surface surface, inout Lighting lighting, uint f
lighting.indirect.diffuse = ddgi_sample_irradiance(surface.P, surface.N);
surface.SetGIApplied(true);
}
#if 0
// Combined light loops:
[branch]
if (!lights().empty())
{
// Loop through light buckets in the tile:
ShaderEntityIterator iterator = lights();
for(uint bucket = iterator.first_bucket(); bucket <= iterator.last_bucket(); ++bucket)
{
uint bucket_bits = load_entitytile(flatTileIndex + bucket);
#ifndef ENTITY_TILE_UNIFORM
// Bucket scalarizer - Siggraph 2017 - Improved Culling [Michal Drobot]:
bucket_bits = WaveReadLaneFirst(WaveActiveBitOr(bucket_bits));
#endif // ENTITY_TILE_UNIFORM
bucket_bits = iterator.mask_entity(bucket, bucket_bits);
[loop]
while (bucket_bits != 0)
{
// Retrieve global entity index from local bucket, then remove bit from local bucket:
const uint bucket_bit_index = firstbitlow(bucket_bits);
const uint entity_index = bucket * 32 + bucket_bit_index;
bucket_bits ^= 1u << bucket_bit_index;
ShaderEntity light = load_entity(entity_index);
half shadow_mask = 1;
#if defined(SHADOW_MASK_ENABLED) && !defined(TRANSPARENT)
[branch]
if (light.IsCastingShadow() && (GetFrame().options & OPTION_BIT_SHADOW_MASK) && (GetCamera().options & SHADERCAMERA_OPTION_USE_SHADOW_MASK) && GetCamera().texture_rtshadow_index >= 0)
{
uint shadow_index = entity_index - lights().first_item();
if (shadow_index < 16)
{
shadow_mask = (half)bindless_textures2DArray[GetCamera().texture_rtshadow_index][uint3(surface.pixel, shadow_index)].r;
}
}
#endif // SHADOW_MASK_ENABLED && !TRANSPARENT
switch (light.GetType())
{
case ENTITY_TYPE_DIRECTIONALLIGHT:
{
light_directional(light, surface, lighting);
}
break;
case ENTITY_TYPE_POINTLIGHT:
{
light_point(light, surface, lighting);
}
break;
case ENTITY_TYPE_SPOTLIGHT:
{
light_spot(light, surface, lighting);
}
break;
}
}
}
}
#else
// Separated light loops by type:
[branch]
if (!directional_lights().empty())
{
@@ -558,7 +490,6 @@ inline void TiledLighting(inout Surface surface, inout Lighting lighting, uint f
}
}
}
#endif
}
-53
View File
@@ -1,53 +0,0 @@
#include "globals.hlsli"
#include "cullingShaderHF.hlsli"
// View space frustums for the grid cells.
RWStructuredBuffer<Frustum> out_Frustums : register(u0);
// Compute shader entry point: each thread treats DTid.xy as a tile coordinate,
// builds the four side planes (left, right, top, bottom) of that tile's view space
// frustum and writes the result to out_Frustums.
// Threads whose tile coordinate lies outside entity_culling_tilecount still compute
// the planes but skip the final store.
[numthreads(TILED_CULLING_BLOCKSIZE, TILED_CULLING_BLOCKSIZE, 1)]
void main(uint3 DTid : SV_DispatchThreadID)
{
uint2 dim;
// The screen dimensions are read from the depth texture; their reciprocal is handed to ScreenToView below.
texture_depth.GetDimensions(dim.x, dim.y);
float2 dim_rcp = rcp(dim);
// View space eye position is always at the origin.
const float3 eyePos = float3(0, 0, 0);
// Compute 4 points on the far clipping plane to use as the
// frustum vertices.
// NOTE(review): the points use z = 1.0f; whether that is the far or near plane depends on the depth convention in use - confirm.
float4 screenSpace[4];
// Each corner is the tile coordinate scaled by TILED_CULLING_BLOCKSIZE, i.e. a position in pixels.
// Top left point
screenSpace[0] = float4(DTid.xy * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f);
// Top right point
screenSpace[1] = float4(float2(DTid.x + 1, DTid.y) * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f);
// Bottom left point
screenSpace[2] = float4(float2(DTid.x, DTid.y + 1) * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f);
// Bottom right point
screenSpace[3] = float4(float2(DTid.x + 1, DTid.y + 1) * TILED_CULLING_BLOCKSIZE, 1.0f, 1.0f);
float3 viewSpace[4];
// Now convert the screen space points to view space
for (int i = 0; i < 4; i++)
{
viewSpace[i] = ScreenToView(screenSpace[i], dim_rcp).xyz;
}
// Now build the frustum planes from the view space points
// Every plane is defined by the eye position and two tile corners.
// NOTE(review): the argument order presumably determines which way the plane normal faces - do not reorder without checking ComputePlane.
Frustum frustum;
// Left plane
frustum.planes[0] = ComputePlane(viewSpace[2], eyePos, viewSpace[0]);
// Right plane
frustum.planes[1] = ComputePlane(viewSpace[1], eyePos, viewSpace[3]);
// Top plane
frustum.planes[2] = ComputePlane(viewSpace[0], eyePos, viewSpace[1]);
// Bottom plane
frustum.planes[3] = ComputePlane(viewSpace[3], eyePos, viewSpace[2]);
// Store the computed frustum in global memory (if our thread ID is in bounds of the grid).
if (DTid.x < GetCamera().entity_culling_tilecount.x && DTid.y < GetCamera().entity_culling_tilecount.y)
{
// The 2D tile coordinate is flattened into a linear index over the tile grid.
out_Frustums[flatten2D(DTid.xy, GetCamera().entity_culling_tilecount.xy)] = frustum;
}
}
-1
View File
@@ -228,7 +228,6 @@ namespace wi::enums
CSTYPE_LUMINANCE_PASS2,
CSTYPE_SHADINGRATECLASSIFICATION,
CSTYPE_SHADINGRATECLASSIFICATION_DEBUG,
CSTYPE_TILEFRUSTUMS,
CSTYPE_LIGHTCULLING,
CSTYPE_LIGHTCULLING_DEBUG,
CSTYPE_LIGHTCULLING_ADVANCED,
+10 -66
View File
@@ -992,7 +992,6 @@ void LoadShaders()
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LUMINANCE_PASS2], "luminancePass2CS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_SHADINGRATECLASSIFICATION], "shadingRateClassificationCS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_SHADINGRATECLASSIFICATION_DEBUG], "shadingRateClassificationCS_DEBUG.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_TILEFRUSTUMS], "tileFrustumsCS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LIGHTCULLING], "lightCullingCS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LIGHTCULLING_DEBUG], "lightCullingCS_DEBUG.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_LIGHTCULLING_ADVANCED], "lightCullingCS_ADVANCED.cso"); });
@@ -9385,26 +9384,14 @@ void CreateTiledLightResources(TiledLightResources& res, XMUINT2 resolution)
{
res.tileCount = GetEntityCullingTileCount(resolution);
{
GPUBufferDesc bd;
bd.stride = sizeof(XMFLOAT4) * 4; // storing 4 planes for every tile
bd.size = bd.stride * res.tileCount.x * res.tileCount.y;
bd.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS;
bd.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
bd.usage = Usage::DEFAULT;
device->CreateBuffer(&bd, nullptr, &res.tileFrustums);
device->SetName(&res.tileFrustums, "tileFrustums");
}
{
GPUBufferDesc bd;
bd.stride = sizeof(uint);
bd.size = res.tileCount.x * res.tileCount.y * bd.stride * SHADER_ENTITY_TILE_BUCKET_COUNT * 2; // *2: opaque and transparent arrays
bd.usage = Usage::DEFAULT;
bd.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
bd.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
device->CreateBuffer(&bd, nullptr, &res.entityTiles);
device->SetName(&res.entityTiles, "entityTiles");
}
GPUBufferDesc bd;
bd.stride = sizeof(uint);
bd.size = res.tileCount.x * res.tileCount.y * bd.stride * SHADER_ENTITY_TILE_BUCKET_COUNT * 2; // *2: opaque and transparent arrays
bd.usage = Usage::DEFAULT;
bd.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
bd.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
device->CreateBuffer(&bd, nullptr, &res.entityTiles);
device->SetName(&res.entityTiles, "entityTiles");
}
void ComputeTiledLightCulling(
const TiledLightResources& res,
@@ -9415,14 +9402,7 @@ void ComputeTiledLightCulling(
{
auto range = wi::profiler::BeginRangeGPU("Entity Culling", cmd);
// Initial barriers to put all resources into UAV:
{
GPUBarrier barriers[] = {
GPUBarrier::Buffer(&res.tileFrustums, ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Buffer(&res.entityTiles, ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS),
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
device->Barrier(GPUBarrier::Buffer(&res.entityTiles, ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), cmd);
if (
vis.visibleLights.empty() &&
@@ -9431,15 +9411,8 @@ void ComputeTiledLightCulling(
)
{
device->EventBegin("Tiled Entity Clear Only", cmd);
device->ClearUAV(&res.tileFrustums, 0, cmd);
device->ClearUAV(&res.entityTiles, 0, cmd);
{
GPUBarrier barriers[] = {
GPUBarrier::Buffer(&res.tileFrustums, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
GPUBarrier::Buffer(&res.entityTiles, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
device->Barrier(GPUBarrier::Buffer(&res.entityTiles, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), cmd);
device->EventEnd(cmd);
wi::profiler::EndRange(range);
return;
@@ -9447,39 +9420,10 @@ void ComputeTiledLightCulling(
BindCommonResources(cmd);
// Frustum computation
{
device->EventBegin("Tile Frustums", cmd);
device->BindComputeShader(&shaders[CSTYPE_TILEFRUSTUMS], cmd);
const GPUResource* uavs[] = {
&res.tileFrustums
};
device->BindUAVs(uavs, 0, arraysize(uavs), cmd);
device->Dispatch(
(res.tileCount.x + TILED_CULLING_BLOCKSIZE - 1) / TILED_CULLING_BLOCKSIZE,
(res.tileCount.y + TILED_CULLING_BLOCKSIZE - 1) / TILED_CULLING_BLOCKSIZE,
1,
cmd
);
{
GPUBarrier barriers[] = {
GPUBarrier::Buffer(&res.tileFrustums, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
device->EventEnd(cmd);
}
// Perform the culling
{
device->EventBegin("Entity Culling", cmd);
device->BindResource(&res.tileFrustums, 0, cmd);
if (GetDebugLightCulling() && debugUAV.IsValid())
{
device->BindComputeShader(&shaders[GetAdvancedLightCulling() ? CSTYPE_LIGHTCULLING_ADVANCED_DEBUG : CSTYPE_LIGHTCULLING_DEBUG], cmd);
-1
View File
@@ -327,7 +327,6 @@ namespace wi::renderer
// GPU buffers used by tiled entity culling, together with the tile grid size
// they were created for (filled in by CreateTiledLightResources).
struct TiledLightResources
{
XMUINT2 tileCount = {}; // number of culling tiles along x and y
wi::graphics::GPUBuffer tileFrustums; // entity culling frustums
wi::graphics::GPUBuffer entityTiles; // culled entity indices
};
void CreateTiledLightResources(TiledLightResources& res, XMUINT2 resolution);
+1 -1
View File
@@ -9,7 +9,7 @@ namespace wi::version
// minor features, major updates, breaking compatibility changes
const int minor = 71;
// minor bug fixes, alterations, refactors, updates
const int revision = 556;
const int revision = 557;
const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);