ssgi updates

This commit is contained in:
Turánszki János
2024-04-02 17:23:13 +02:00
parent 4cfc23eb28
commit af983faadc
8 changed files with 180 additions and 495 deletions
+16 -12
View File
@@ -12,7 +12,7 @@ RWTexture2D<float4> output_diffuse : register(u0);
#ifdef WIDE
static const uint THREADCOUNT = 16;
static const int TILE_BORDER = 18;
static const int TILE_BORDER = 16;
#else
static const uint THREADCOUNT = 8;
static const int TILE_BORDER = 4;
@@ -25,7 +25,7 @@ groupshared uint group_valid;
inline uint coord_to_cache(int2 coord)
{
return flatten2D(clamp(TILE_BORDER + coord, 0, TILE_SIZE - 1), TILE_SIZE);
return flatten2D(clamp(coord, 0, TILE_SIZE - 1), TILE_SIZE);
}
static const float depthRejection = 8;
@@ -34,11 +34,10 @@ static const float depthRejection_rcp = rcp(depthRejection);
float3 compute_diffuse(
float3 origin_position,
float3 origin_normal,
int2 GTid,
int2 offset
int2 originLoc, // coord in cache
int2 sampleLoc // coord in cache
)
{
const int2 sampleLoc = GTid + offset;
const uint t = coord_to_cache(sampleLoc);
uint c = cache_rgb[t];
if(c == 0)
@@ -56,7 +55,7 @@ float3 compute_diffuse(
const float sample_z = sample_position.z;
// DDA occlusion:
const int2 start = GTid;
const int2 start = originLoc;
const int2 goal = sampleLoc;
const int dx = int(goal.x) - int(start.x);
@@ -86,7 +85,8 @@ float3 compute_diffuse(
const float sz = cache_z[tt];
if(sz < z - 0.1)
{
return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]);
c = cache_rgb[tt];
break;
}
}
}
@@ -127,7 +127,8 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid :
if (group_valid == 0)
return; // if no valid color was cached, whole group can exit early
const uint t = coord_to_cache(GTid.xy);
const int2 originLoc = GTid.xy + TILE_BORDER;
const uint t = coord_to_cache(originLoc);
float3 P;
P.z = cache_z[t];
@@ -137,23 +138,26 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid :
P.xy = unpack_half2(cache_xy[t]);
const uint2 pixel = DTid.xy;
const float3 N = mul((float3x3)GetCamera().view, decode_oct(input_normal[interleaved_pixel].rg));
float3 diffuse = 0;
float sum = 0;
const int range = int(postprocess.params0.x);
const float spread = postprocess.params0.y + dither(pixel);
const float spread = postprocess.params0.y /*+ dither(DTid.xy)*/;
const float rangespread_rcp2 = postprocess.params0.z;
const int2 pixel_base = Gid.xy * THREADCOUNT + GTid;
for(int x = -range; x <= range; ++x)
{
for(int y = -range; y <= range; ++y)
{
const int2 pixel = pixel_base + int2(x, y);
if(any(pixel < 0) || any(pixel >= postprocess.resolution))
continue; // to not lose energy when sampling outside of textures, we skip those offsets
const float2 foffset = float2(x, y) * spread;
const int2 offset = round(foffset);
const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2);
diffuse += compute_diffuse(P, N, GTid, offset) * weight;
diffuse += compute_diffuse(P, N, originLoc, originLoc + offset) * weight;
sum += weight;
}
}
+18 -18
View File
@@ -12,17 +12,17 @@ RWTexture2DArray<float3> atlas4x_color : register(u5);
RWTexture2DArray<float3> atlas8x_color : register(u6);
RWTexture2DArray<float3> atlas16x_color : register(u7);
RWTexture2D<float> regular2x_depth : register(u8);
RWTexture2D<float2> regular2x_normal : register(u9);
RWTexture2D<float> regular4x_depth : register(u10);
RWTexture2D<float2> regular4x_normal : register(u11);
RWTexture2D<float> regular8x_depth : register(u12);
RWTexture2D<float2> regular8x_normal : register(u13);
RWTexture2D<float> regular16x_depth : register(u14);
RWTexture2D<float> regular4x_depth : register(u9);
RWTexture2D<float> regular8x_depth : register(u10);
RWTexture2D<float> regular16x_depth : register(u11);
RWTexture2D<float2> regular2x_normal : register(u12);
RWTexture2D<float2> regular4x_normal : register(u13);
RWTexture2D<float2> regular8x_normal : register(u14);
RWTexture2D<float2> regular16x_normal : register(u15);
groupshared float shared_depths[256];
groupshared float2 shared_normals[256];
groupshared float3 shared_colors[256];
groupshared uint shared_normals[256];
groupshared uint shared_colors[256];
[numthreads(8, 8, 1)]
void main(uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 GTid : SV_GroupThreadID, uint3 DTid : SV_DispatchThreadID)
@@ -38,10 +38,10 @@ void main(uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 GTid :
shared_depths[destIdx + 128] = texture_depth[min(startST | uint2(0, 8), dim - 1)];
shared_depths[destIdx + 136] = texture_depth[min(startST | uint2(8, 8), dim - 1)];
shared_normals[destIdx + 0] = texture_normal[min(startST | uint2(0, 0), dim - 1)];
shared_normals[destIdx + 8] = texture_normal[min(startST | uint2(8, 0), dim - 1)];
shared_normals[destIdx + 128] = texture_normal[min(startST | uint2(0, 8), dim - 1)];
shared_normals[destIdx + 136] = texture_normal[min(startST | uint2(8, 8), dim - 1)];
shared_normals[destIdx + 0] = pack_half2(texture_normal[min(startST | uint2(0, 0), dim - 1)]);
shared_normals[destIdx + 8] = pack_half2(texture_normal[min(startST | uint2(8, 0), dim - 1)]);
shared_normals[destIdx + 128] = pack_half2(texture_normal[min(startST | uint2(0, 8), dim - 1)]);
shared_normals[destIdx + 136] = pack_half2(texture_normal[min(startST | uint2(8, 8), dim - 1)]);
const float2 uv0 = float2(startST | uint2(0, 0)) * dim_rcp;
const float2 uv1 = float2(startST | uint2(8, 0)) * dim_rcp;
@@ -55,18 +55,18 @@ void main(uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 GTid :
const float2 prevUV1 = uv1 + velocity1;
const float2 prevUV2 = uv2 + velocity2;
const float2 prevUV3 = uv3 + velocity3;
shared_colors[destIdx + 0] = texture_input.SampleLevel(sampler_linear_clamp, prevUV0, 0);
shared_colors[destIdx + 8] = texture_input.SampleLevel(sampler_linear_clamp, prevUV1, 0);
shared_colors[destIdx + 128] = texture_input.SampleLevel(sampler_linear_clamp, prevUV2, 0);
shared_colors[destIdx + 136] = texture_input.SampleLevel(sampler_linear_clamp, prevUV3, 0);
shared_colors[destIdx + 0] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV0, 0));
shared_colors[destIdx + 8] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV1, 0));
shared_colors[destIdx + 128] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV2, 0));
shared_colors[destIdx + 136] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV3, 0));
GroupMemoryBarrierWithGroupSync();
uint ldsIndex = (GTid.x << 1) | (GTid.y << 5);
float depth = shared_depths[ldsIndex];
float2 normal = shared_normals[ldsIndex];
float3 color = shared_colors[ldsIndex];
float2 normal = unpack_half2(shared_normals[ldsIndex]);
float3 color = Unpack_R11G11B10_FLOAT(shared_colors[ldsIndex]);
color = color - 0.2; // cut out pixels that shouldn't act as lights
color *= 0.9; // accumulation energy loss
+2 -2
View File
@@ -42,8 +42,8 @@ void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex)
const int range = int(postprocess.params0.x);
const float spread = postprocess.params0.y;
#else
const int range = 1;
const float spread = 8;
const int range = 2;
const float spread = 6;
#endif
for(int x = -range; x <= range; ++x)
{
+140 -188
View File
@@ -12444,6 +12444,8 @@ void Postprocess_RTDiffuse(
}
void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution)
{
res.cleared = false;
TextureDesc desc;
desc.type = TextureDesc::Type::TEXTURE_2D;
desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS;
@@ -12456,34 +12458,11 @@ void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution)
desc.width = (resolution.x + 7) / 8;
desc.height = (resolution.y + 7) / 8;
desc.array_size = 16;
desc.mip_levels = 4;
desc.format = Format::R32_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas2x_depth);
device->CreateTexture(&desc, nullptr, &res.texture_atlas_depth);
desc.format = Format::R11G11B10_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas2x_color);
desc.width = (resolution.x + 15) / 16;
desc.height = (resolution.y + 15) / 16;
desc.array_size = 16;
desc.format = Format::R32_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas4x_depth);
desc.format = Format::R11G11B10_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas4x_color);
desc.width = (resolution.x + 31) / 32;
desc.height = (resolution.y + 31) / 32;
desc.array_size = 16;
desc.format = Format::R32_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas8x_depth);
desc.format = Format::R11G11B10_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas8x_color);
desc.width = (resolution.x + 63) / 64;
desc.height = (resolution.y + 63) / 64;
desc.array_size = 16;
desc.format = Format::R32_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas16x_depth);
desc.format = Format::R11G11B10_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_atlas16x_color);
device->CreateTexture(&desc, nullptr, &res.texture_atlas_color);
desc.array_size = 1;
desc.mip_levels = 4;
@@ -12496,9 +12475,17 @@ void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution)
desc.format = Format::R11G11B10_FLOAT;
device->CreateTexture(&desc, nullptr, &res.texture_diffuse_mips);
for (uint32_t i = 0; i < desc.mip_levels; ++i)
for (uint32_t i = 0; i < 4u; ++i)
{
int subresource_index;
subresource_index = device->CreateSubresource(&res.texture_atlas_depth, SubresourceType::SRV, 0, 16, i, 1);
assert(subresource_index == i);
subresource_index = device->CreateSubresource(&res.texture_atlas_depth, SubresourceType::UAV, 0, 16, i, 1);
assert(subresource_index == i);
subresource_index = device->CreateSubresource(&res.texture_atlas_color, SubresourceType::SRV, 0, 16, i, 1);
assert(subresource_index == i);
subresource_index = device->CreateSubresource(&res.texture_atlas_color, SubresourceType::UAV, 0, 16, i, 1);
assert(subresource_index == i);
subresource_index = device->CreateSubresource(&res.texture_depth_mips, SubresourceType::SRV, 0, 1, i, 1);
assert(subresource_index == i);
subresource_index = device->CreateSubresource(&res.texture_depth_mips, SubresourceType::UAV, 0, 1, i, 1);
@@ -12527,14 +12514,8 @@ void Postprocess_SSGI(
{
GPUBarrier barriers[] = {
GPUBarrier::Image(&res.texture_atlas2x_depth, res.texture_atlas2x_depth.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas4x_depth, res.texture_atlas4x_depth.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas8x_depth, res.texture_atlas8x_depth.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas16x_depth, res.texture_atlas16x_depth.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas2x_color, res.texture_atlas2x_color.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas4x_color, res.texture_atlas4x_color.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas8x_color, res.texture_atlas8x_color.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas16x_color, res.texture_atlas16x_color.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas_depth, res.texture_atlas_depth.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_atlas_color, res.texture_atlas_color.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_depth_mips, res.texture_depth_mips.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_normal_mips, res.texture_normal_mips.desc.layout, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Image(&res.texture_diffuse_mips, res.texture_diffuse_mips.desc.layout, ResourceState::UNORDERED_ACCESS),
@@ -12543,31 +12524,20 @@ void Postprocess_SSGI(
device->Barrier(barriers, arraysize(barriers), cmd);
}
device->ClearUAV(&res.texture_atlas2x_depth, 0, cmd);
device->ClearUAV(&res.texture_atlas4x_depth, 0, cmd);
device->ClearUAV(&res.texture_atlas8x_depth, 0, cmd);
device->ClearUAV(&res.texture_atlas16x_depth, 0, cmd);
device->ClearUAV(&res.texture_atlas2x_color, 0, cmd);
device->ClearUAV(&res.texture_atlas4x_color, 0, cmd);
device->ClearUAV(&res.texture_atlas8x_color, 0, cmd);
device->ClearUAV(&res.texture_atlas16x_color, 0, cmd);
device->ClearUAV(&res.texture_depth_mips, 0, cmd);
device->ClearUAV(&res.texture_normal_mips, 0, cmd);
if (!res.cleared)
{
res.cleared = true;
device->ClearUAV(&res.texture_atlas_depth, 0, cmd);
device->ClearUAV(&res.texture_atlas_color, 0, cmd);
device->ClearUAV(&res.texture_depth_mips, 0, cmd);
device->ClearUAV(&res.texture_normal_mips, 0, cmd);
}
device->ClearUAV(&res.texture_diffuse_mips, 0, cmd);
device->ClearUAV(&output, 0, cmd);
{
GPUBarrier barriers[] = {
GPUBarrier::Memory(&res.texture_atlas2x_depth),
GPUBarrier::Memory(&res.texture_atlas4x_depth),
GPUBarrier::Memory(&res.texture_atlas8x_depth),
GPUBarrier::Memory(&res.texture_atlas16x_depth),
GPUBarrier::Memory(&res.texture_atlas2x_color),
GPUBarrier::Memory(&res.texture_atlas4x_color),
GPUBarrier::Memory(&res.texture_atlas8x_color),
GPUBarrier::Memory(&res.texture_atlas16x_color),
GPUBarrier::Memory(&res.texture_depth_mips),
GPUBarrier::Memory(&res.texture_normal_mips),
GPUBarrier::Memory(),
};
device->Barrier(barriers, arraysize(barriers), cmd);
}
@@ -12582,31 +12552,30 @@ void Postprocess_SSGI(
device->BindResource(&input, 0, cmd);
const GPUResource* uavs[] = {
&res.texture_atlas2x_depth,
&res.texture_atlas4x_depth,
&res.texture_atlas8x_depth,
&res.texture_atlas16x_depth,
&res.texture_atlas2x_color,
&res.texture_atlas4x_color,
&res.texture_atlas8x_color,
&res.texture_atlas16x_color,
};
device->BindUAVs(uavs, 0, arraysize(uavs), cmd);
device->BindUAV(&res.texture_atlas_depth, 0, cmd, 0);
device->BindUAV(&res.texture_atlas_depth, 1, cmd, 1);
device->BindUAV(&res.texture_atlas_depth, 2, cmd, 2);
device->BindUAV(&res.texture_atlas_depth, 3, cmd, 3);
device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 0, cmd, 0);
device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 1, cmd, 0);
device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 2, cmd, 1);
device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 3, cmd, 1);
device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 4, cmd, 2);
device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 5, cmd, 2);
device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 6, cmd, 3);
device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 7, cmd, 3);
device->BindUAV(&res.texture_atlas_color, 4, cmd, 0);
device->BindUAV(&res.texture_atlas_color, 5, cmd, 1);
device->BindUAV(&res.texture_atlas_color, 6, cmd, 2);
device->BindUAV(&res.texture_atlas_color, 7, cmd, 3);
const TextureDesc& desc = res.texture_atlas4x_depth.GetDesc();
device->BindUAV(&res.texture_depth_mips, 8, cmd, 0);
device->BindUAV(&res.texture_depth_mips, 9, cmd, 1);
device->BindUAV(&res.texture_depth_mips, 10, cmd, 2);
device->BindUAV(&res.texture_depth_mips, 11, cmd, 3);
device->BindUAV(&res.texture_normal_mips, 12, cmd, 0);
device->BindUAV(&res.texture_normal_mips, 13, cmd, 1);
device->BindUAV(&res.texture_normal_mips, 14, cmd, 2);
device->BindUAV(&res.texture_normal_mips, 15, cmd, 3);
const TextureDesc& desc = res.texture_atlas_depth.GetDesc();
device->Dispatch(
desc.width,
desc.height,
desc.width >> 1,
desc.height >> 1,
1,
cmd
);
@@ -12616,15 +12585,8 @@ void Postprocess_SSGI(
{
GPUBarrier barriers[] = {
GPUBarrier::Memory(&res.texture_diffuse_mips),
GPUBarrier::Image(&res.texture_atlas2x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas2x_depth.desc.layout),
GPUBarrier::Image(&res.texture_atlas4x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas4x_depth.desc.layout),
GPUBarrier::Image(&res.texture_atlas8x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas8x_depth.desc.layout),
GPUBarrier::Image(&res.texture_atlas16x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas16x_depth.desc.layout),
GPUBarrier::Image(&res.texture_atlas2x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas2x_color.desc.layout),
GPUBarrier::Image(&res.texture_atlas4x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas4x_color.desc.layout),
GPUBarrier::Image(&res.texture_atlas8x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas8x_color.desc.layout),
GPUBarrier::Image(&res.texture_atlas16x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas16x_color.desc.layout),
GPUBarrier::Image(&res.texture_atlas_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas_depth.desc.layout),
GPUBarrier::Image(&res.texture_atlas_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas_color.desc.layout),
GPUBarrier::Image(&res.texture_depth_mips, ResourceState::UNORDERED_ACCESS, res.texture_depth_mips.desc.layout),
GPUBarrier::Image(&res.texture_normal_mips, ResourceState::UNORDERED_ACCESS, res.texture_normal_mips.desc.layout),
};
@@ -12634,110 +12596,20 @@ void Postprocess_SSGI(
{
device->EventBegin("SSGI - diffuse", cmd);
device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI], cmd);
// 2x:
{
const GPUResource* resarray[] = {
&res.texture_atlas2x_depth,
&res.texture_atlas2x_color,
};
device->BindResources(resarray, 0, arraysize(resarray), cmd);
device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 0);
device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 0);
const TextureDesc& desc = res.texture_atlas2x_depth.GetDesc();
postprocess.resolution.x = desc.width;
postprocess.resolution.y = desc.height;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 1; // range
postprocess.params0.y = 2; // spread
postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(desc.width + 7) / 8,
(desc.height + 7) / 8,
16,
cmd
);
}
// 4x:
{
const GPUResource* resarray[] = {
&res.texture_atlas4x_depth,
&res.texture_atlas4x_color,
};
device->BindResources(resarray, 0, arraysize(resarray), cmd);
device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 1);
device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 1);
const TextureDesc& desc = res.texture_atlas4x_depth.GetDesc();
postprocess.resolution.x = desc.width;
postprocess.resolution.y = desc.height;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 2; // range
postprocess.params0.y = 2; // spread
postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(desc.width + 7) / 8,
(desc.height + 7) / 8,
16,
cmd
);
}
// Switch to wide sampling shader:
// Wide sampling passes:
device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_WIDE], cmd);
// 8x:
{
const GPUResource* resarray[] = {
&res.texture_atlas8x_depth,
&res.texture_atlas8x_color,
};
device->BindResources(resarray, 0, arraysize(resarray), cmd);
device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 2);
device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 2);
const TextureDesc& desc = res.texture_atlas8x_depth.GetDesc();
postprocess.resolution.x = desc.width;
postprocess.resolution.y = desc.height;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 4; // range
postprocess.params0.y = 4; // spread
postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(desc.width + 15) / 16,
(desc.height + 15) / 16,
16,
cmd
);
}
// 16x:
{
const GPUResource* resarray[] = {
&res.texture_atlas16x_depth,
&res.texture_atlas16x_color,
};
device->BindResources(resarray, 0, arraysize(resarray), cmd);
device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 3);
device->BindResource(&res.texture_atlas_depth, 0, cmd, 3);
device->BindResource(&res.texture_atlas_color, 1, cmd, 3);
device->BindResource(&res.texture_normal_mips, 2, cmd, 3);
device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 3);
const TextureDesc& desc = res.texture_atlas16x_depth.GetDesc();
const TextureDesc& desc = res.texture_atlas_depth.GetDesc();
postprocess.resolution.x = desc.width;
postprocess.resolution.y = desc.height;
postprocess.resolution.x = desc.width >> 3;
postprocess.resolution.y = desc.height >> 3;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 8; // range
@@ -12746,8 +12618,88 @@ void Postprocess_SSGI(
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(desc.width + 15) / 16,
(desc.height + 15) / 16,
(postprocess.resolution.x + 15) / 16,
(postprocess.resolution.y + 15) / 16,
16,
cmd
);
}
// 8x:
{
device->BindResource(&res.texture_atlas_depth, 0, cmd, 2);
device->BindResource(&res.texture_atlas_color, 1, cmd, 2);
device->BindResource(&res.texture_normal_mips, 2, cmd, 2);
device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 2);
const TextureDesc& desc = res.texture_atlas_depth.GetDesc();
postprocess.resolution.x = desc.width >> 2;
postprocess.resolution.y = desc.height >> 2;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 4; // range
postprocess.params0.y = 4; // spread
postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(postprocess.resolution.x + 15) / 16,
(postprocess.resolution.y + 15) / 16,
16,
cmd
);
}
// Narrow sampling passes:
device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI], cmd);
// 4x:
{
device->BindResource(&res.texture_atlas_depth, 0, cmd, 1);
device->BindResource(&res.texture_atlas_color, 1, cmd, 1);
device->BindResource(&res.texture_normal_mips, 2, cmd, 1);
device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 1);
const TextureDesc& desc = res.texture_atlas_depth.GetDesc();
postprocess.resolution.x = desc.width >> 1u;
postprocess.resolution.y = desc.height >> 1u;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 2; // range
postprocess.params0.y = 2; // spread
postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(postprocess.resolution.x + 7) / 8,
(postprocess.resolution.y + 7) / 8,
16,
cmd
);
}
// 2x:
{
device->BindResource(&res.texture_atlas_depth, 0, cmd, 0);
device->BindResource(&res.texture_atlas_color, 1, cmd, 0);
device->BindResource(&res.texture_normal_mips, 2, cmd, 0);
device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 0);
const TextureDesc& desc = res.texture_atlas_depth.GetDesc();
postprocess.resolution.x = desc.width;
postprocess.resolution.y = desc.height;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 1; // range
postprocess.params0.y = 4; // spread
postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(postprocess.resolution.x + 7) / 8,
(postprocess.resolution.y + 7) / 8,
16,
cmd
);
@@ -12782,7 +12734,7 @@ void Postprocess_SSGI(
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 2; // range
postprocess.params0.y = 8; // spread
postprocess.params0.y = 4; // spread
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
@@ -12885,8 +12837,8 @@ void Postprocess_SSGI(
device->PushConstants(&postprocess, sizeof(postprocess), cmd);
device->Dispatch(
(desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
(desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
(desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, // dispatch is using desc size (unaligned!)
(desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, // dispatch is using desc size (unaligned!)
1,
cmd
);
+3 -8
View File
@@ -554,14 +554,9 @@ namespace wi::renderer
);
struct SSGIResources
{
wi::graphics::Texture texture_atlas2x_depth;
wi::graphics::Texture texture_atlas4x_depth;
wi::graphics::Texture texture_atlas8x_depth;
wi::graphics::Texture texture_atlas16x_depth;
wi::graphics::Texture texture_atlas2x_color;
wi::graphics::Texture texture_atlas4x_color;
wi::graphics::Texture texture_atlas8x_color;
wi::graphics::Texture texture_atlas16x_color;
mutable bool cleared = false;
wi::graphics::Texture texture_atlas_depth;
wi::graphics::Texture texture_atlas_color;
wi::graphics::Texture texture_depth_mips;
wi::graphics::Texture texture_normal_mips;
wi::graphics::Texture texture_diffuse_mips;
+1 -1
View File
@@ -9,7 +9,7 @@ namespace wi::version
// minor features, major updates, breaking compatibility changes
const int minor = 71;
// minor bug fixes, alterations, refactors, updates
const int revision = 418;
const int revision = 419;
const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);
@@ -1,74 +0,0 @@
#include "globals.hlsli"
#include "stochasticSSRHF.hlsli"
#include "ShaderInterop_Postprocess.h"
PUSHCONSTANT(postprocess, PostProcess);
// Inputs sampled (with sampler_linear_clamp) around each output pixel.
// NOTE(review): the "_low" suffix presumably means a lower-resolution mip/level - confirm against the caller.
Texture2D<float> input_depth_low : register(t0);
Texture2D<float2> input_normal_low : register(t1);
Texture2D<float4> input_diffuse_low : register(t2);
// Inputs loaded directly at the output pixel coordinate; they provide the reference depth and normal.
Texture2D<float> input_depth_high : register(t3);
Texture2D<float2> input_normal_high : register(t4);
// Accumulation target: main() reads the current value and writes it back with the filtered result added.
RWTexture2D<float4> output : register(u0);
// Multiplier inside the exponent of the depth weight: exp(-relativeError^2 * depthThreshold).
static const float depthThreshold = 1000.0;
// Scales how strongly normal mismatch reduces the weight: saturate(1 - (1 - normalError) * normalThreshold).
static const float normalThreshold = 1.0;
// Bilateral upsample pass.
// For every output pixel, a (2 * range + 1)^2 grid of taps is taken from the "_low" inputs.
// Each tap is weighted by how well its reconstructed position and normal agree with
// the pixel's own ("_high") depth and normal, and the normalized result is ADDED to
// whatever is already stored in `output`.
[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
void main(uint2 DTid : SV_DispatchThreadID)
{
	const uint2 coord = DTid.xy;
	const float2 center_uv = (coord + 0.5) * postprocess.resolution_rcp;

	// Reference surface at the output pixel:
	const float center_depth = input_depth_high[coord];
	const float center_linear_depth = compute_lineardepth(center_depth);
	const float3 center_normal = decode_oct(input_normal_high[coord].rg);
	const float3 center_position = reconstruct_position(center_uv, center_depth);

	float3 accumulated = 0;
	float weight_sum = 0;

#if 1
	// Kernel shape comes from the push constants:
	const int range = int(postprocess.params0.x);
	const float spread = postprocess.params0.y;
#else
	// Fixed kernel, kept as a compile-time switch:
	const int range = 1;
	const float spread = 8;
#endif

	for (int kx = -range; kx <= range; ++kx)
	{
		for (int ky = -range; ky <= range; ++ky)
		{
			const float2 tap_uv = center_uv + float2(kx, ky) * spread * postprocess.resolution_rcp;

			const float tap_depth = input_depth_low.SampleLevel(sampler_linear_clamp, tap_uv, 0);
			const float3 tap_normal = decode_oct(input_normal_low.SampleLevel(sampler_linear_clamp, tap_uv, 0));
			const float3 tap_diffuse = input_diffuse_low.SampleLevel(sampler_linear_clamp, tap_uv, 0).rgb;
			const float3 tap_position = reconstruct_position(tap_uv, tap_depth);

			// Depth term: worst point-to-plane distance between the two surfaces, relative to the pixel's linear depth.
			const float3 delta = center_position - tap_position;
			const float plane_error = max(abs(dot(delta, tap_normal)), abs(dot(delta, center_normal)));
			const float relative_error = plane_error / center_linear_depth;
			const float depth_weight = exp(-sqr(relative_error) * depthThreshold);

			// Normal term: sharpened cosine between the two normals.
			const float normal_match = pow(saturate(dot(tap_normal, center_normal)), 4.0);
			const float normal_weight = saturate(1.0 - (1.0 - normal_match) * normalThreshold);

			// (debug: force tap_weight to 1 to disable the bilateral rejection)
			const float tap_weight = depth_weight * normal_weight;

			accumulated += tap_diffuse * tap_weight;
			weight_sum += tap_weight;
		}
	}

	// Normalize only when at least one tap contributed.
	if (weight_sum > 0)
	{
		accumulated = accumulated / weight_sum;
	}
	accumulated = max(0, accumulated);

	// Additive write: combine with the value already present in the target.
	const float4 previous = output[coord];
	output[coord] = previous + float4(accumulated, 1);
}
@@ -1,192 +0,0 @@
#include "globals.hlsli"
#include "stochasticSSRHF.hlsli"
#include "ShaderInterop_Postprocess.h"
PUSHCONSTANT(postprocess, PostProcess);
// Color source; main() samples it with sampler_linear_clamp at each cached tile position.
Texture2D<float4> input : register(t0);
// Depth, indexed as [uint3(pixel, layer)] where layer is the dispatch thread's z.
Texture2DArray<float> input_depth : register(t1);
// Octahedral-encoded normals (decoded with decode_oct), read at the interleaved full pixel coordinate.
Texture2D<float2> input_normal : register(t2);
// Result target, written at the interleaved pixel coordinate.
RWTexture2D<float4> output_diffuse : register(u0);
// Thread group edge length and the number of extra texels cached on every side of the group.
#ifdef WIDE
static const uint THREADCOUNT = 16;
static const int TILE_BORDER = 18;
#else
static const uint THREADCOUNT = 8;
static const int TILE_BORDER = 4;
#endif // WIDE
// Edge length of the cached tile: 52 in the WIDE build, 16 otherwise.
static const int TILE_SIZE = TILE_BORDER + THREADCOUNT + TILE_BORDER;
// Per-texel tile caches, filled cooperatively by the group in main():
// cache_xy holds the reconstructed position's xy packed with pack_half2,
// cache_z holds the reconstructed position's z,
// cache_rgb holds the color packed with Pack_R11G11B10_FLOAT.
groupshared uint cache_xy[TILE_SIZE * TILE_SIZE];
groupshared float cache_z[TILE_SIZE * TILE_SIZE];
groupshared uint cache_rgb[TILE_SIZE * TILE_SIZE];
// Set to 1 (via InterlockedOr) when any cached packed color is non-zero; lets the whole group exit early otherwise.
groupshared uint group_valid;
// Converts a group-relative coordinate (0,0 = the group's first thread, negative
// values reach into the border) into a flat index of the groupshared tile caches.
// Coordinates outside the tile are clamped to its nearest edge texel.
inline uint coord_to_cache(int2 coord)
{
	const int2 tile_coord = coord + TILE_BORDER;
	const int2 clamped_coord = clamp(tile_coord, 0, TILE_SIZE - 1);
	return flatten2D(clamped_coord, TILE_SIZE);
}
// Distance falloff used by compute_diffuse(): contribution fades as saturate(1 - distance^2 / radius^2).
// NOTE(review): units are those of the reconstructed positions (presumably view-space world units) - confirm.
static const float radius = 14;
static const float radius2 = radius * radius;
// Pre-negated reciprocal so the falloff is a single multiply-add.
static const float radius2_rcp_negative = -rcp(radius2);
// Interpolation factors for the "Simple occlusion" path of compute_diffuse().
// That path sits in the #else of an "#if 1", so these are currently not used by compiled code.
#if 0
static const uint depth_test_count = 1;
static const float depth_tests[] = {0.33};
#else
static const uint depth_test_count = 3;
static const float depth_tests[] = {0.125, 0.25, 0.75};
#endif
// Computes the light contribution of one cached tile texel towards the origin pixel.
//	origin_position : reconstructed position of the shaded pixel
//	origin_normal   : normal of the shaded pixel (main() passes it transformed by the camera view matrix)
//	GTid            : group-relative coordinate of the shaded pixel
//	offset          : texel offset from GTid to the sample
// Returns occlusion-weighted color read from the tile cache, or 0 when the sample depth is rejected.
float3 compute_diffuse(
float3 origin_position,
float3 origin_normal,
int2 GTid,
int2 offset
)
{
const int2 sampleLoc = GTid + offset;
const uint t = coord_to_cache(sampleLoc);
float3 sample_position;
sample_position.z = cache_z[t];
// Samples within 1 unit of the far plane (or beyond it) contribute nothing.
if(sample_position.z > GetCamera().z_far - 1)
return 0;
sample_position.xy = unpack_half2(cache_xy[t]);
const float3 origin_to_sample = sample_position - origin_position;
const float distance2 = dot(origin_to_sample, origin_to_sample);
// Facing term; note that origin_to_sample is not normalized before the dot product.
float occlusion = saturate(dot(origin_normal, origin_to_sample));
// Distance falloff: 1 - distance^2 / radius^2, clamped to [0, 1].
occlusion *= saturate(distance2 * radius2_rcp_negative + 1.0f);
if(occlusion > 0)
{
const float origin_z = origin_position.z;
const float sample_z = sample_position.z;
#if 1
// DDA occlusion:
// Walks the tile cache from the origin texel towards the sample texel and
// compares the cached z at each visited texel with the z interpolated along the segment.
const int2 start = GTid;
const int2 goal = sampleLoc;
const int dx = int(goal.x) - int(start.x);
const int dy = int(goal.y) - int(start.y);
int step = max(abs(dx), abs(dy));
step = (step + 1) / 2; // reduce steps
const float step_rcp = rcp(step);
const float x_incr = float(dx) * step_rcp;
const float y_incr = float(dy) * step_rcp;
float x = float(start.x);
float y = float(start.y);
// Runs step - 1 iterations, so the goal texel itself is never tested here.
for (int i = 0; i < step - 1; i++)
{
x += x_incr;
y += y_incr;
const int2 loc = int2(round(x), round(y));
const uint tt = coord_to_cache(loc);
// NOTE(review): x/y have already advanced i + 1 increments while dt uses i,
// so the interpolated z lags the visited texel by one increment - confirm this is intended.
const float dt = float(i) / float(step);
const float z = lerp(origin_z, sample_z, dt);
const float sz = cache_z[tt];
// A cached z more than 0.1 below the interpolated z counts as a blocker;
// the blocker's own cached color is returned instead of the sample's.
if(sz < z - 0.1)
{
return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]);
}
}
#else
// Simple occlusion:
// Tests a fixed list of interpolation factors (depth_tests) instead of stepping texel by texel.
for (uint i = 0; i < depth_test_count; ++i)
{
const float dt = depth_tests[i];
const float z = lerp(origin_z, sample_z, dt);
const int2 loc = round(lerp(float2(GTid), float2(sampleLoc), dt));
const uint tt = coord_to_cache(loc);
const float sz = cache_z[tt];
if (sz < z - 0.1)
{
return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]);
}
}
#endif
}
// No blocker found (or occlusion is 0): use the sample texel's own cached color.
return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[t]);
}
// SSGI gather pass.
// Each thread group first caches a (TILE_SIZE x TILE_SIZE) tile of position and color in
// groupshared memory. Every thread then gathers compute_diffuse() contributions from a
// (2 * range + 1)^2 grid of offsets around its own texel and writes the normalized result.
[numthreads(THREADCOUNT, THREADCOUNT, 1)]
void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : SV_GroupThreadID, uint groupIndex : SV_GroupIndex)
{
// The dispatch z coordinate selects the depth array slice.
const uint layer = DTid.z;
// Output / normal coordinate: thread xy scaled by 4, with the layer index
// choosing the position inside the 4x4 block (x = z & 3, y = z >> 2).
const uint2 interleaved_pixel = DTid.xy << 2 | uint2(DTid.z & 3, DTid.z >> 2);
// One thread resets the group-wide flag before anyone can set it.
if(groupIndex == 0)
{
group_valid = 0;
}
GroupMemoryBarrierWithGroupSync();
// Top-left texel of the cached tile; lies TILE_BORDER texels outside the group's own area.
const int2 tile_upperleft = Gid.xy * THREADCOUNT - TILE_BORDER;
// Cooperative fill: the group's threads stride over all tile texels.
for(uint t = groupIndex; t < TILE_SIZE * TILE_SIZE; t += THREADCOUNT * THREADCOUNT)
{
const int2 pixel = tile_upperleft + unflatten2D(t, TILE_SIZE);
const float depth = input_depth[uint3(pixel, layer)];
const float2 uv = (pixel + 0.5f) * postprocess.resolution_rcp;
const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection);
const float3 color = input.SampleLevel(sampler_linear_clamp, uv, 0).rgb;
const uint pkcolor = Pack_R11G11B10_FLOAT(color.rgb);
cache_xy[t] = pack_half2(P.xy);
cache_z[t] = P.z;
cache_rgb[t] = pkcolor;
// Any non-zero packed color marks the group as having something to gather.
if(pkcolor)
InterlockedOr(group_valid, 1u);
}
// All cache writes and the flag must be visible before any thread reads them.
GroupMemoryBarrierWithGroupSync();
[branch]
if (group_valid == 0)
return; // if no valid color was cached, whole group can exit early
// This thread's own texel inside the tile cache.
const uint t = coord_to_cache(GTid.xy);
float3 P;
P.z = cache_z[t];
[branch]
if(P.z > GetCamera().z_far - 1)
return; // if pixel depth is not valid, it can exit early
P.xy = unpack_half2(cache_xy[t]);
// NOTE(review): pixel is only referenced by the commented-out dither term below.
const uint2 pixel = DTid.xy;
// Normal is decoded and rotated by the camera view matrix to match the reconstructed positions.
const float3 N = mul((float3x3)GetCamera().view, decode_oct(input_normal[interleaved_pixel].rg));
float3 diffuse = 0;
float sum = 0;
// Kernel parameters from push constants: x = range, y = spread, z = precomputed weight scale.
const int range = int(postprocess.params0.x);
const float spread = postprocess.params0.y /*+ dither(pixel)*/;
const float rangespread_rcp2 = postprocess.params0.z;
for(int x = -range; x <= range; ++x)
{
for(int y = -range; y <= range; ++y)
{
const float2 foffset = float2(x, y) * spread;
const int2 offset = round(foffset);
// Weight falls off with the product |x| * |y| of the offset, so taps on either axis keep weight 1.
const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2);
diffuse += compute_diffuse(P, N, GTid, offset) * weight;
sum += weight;
}
}
// Normalize by the total weight when it is non-zero.
if(sum > 0)
{
diffuse = diffuse / sum;
}
// interleave result:
output_diffuse[interleaved_pixel] = float4(diffuse, 1);
}