diff --git a/WickedEngine/shaders/ssgiCS.hlsl b/WickedEngine/shaders/ssgiCS.hlsl index c658f7fc7..fb73290d4 100644 --- a/WickedEngine/shaders/ssgiCS.hlsl +++ b/WickedEngine/shaders/ssgiCS.hlsl @@ -12,7 +12,7 @@ RWTexture2D output_diffuse : register(u0); #ifdef WIDE static const uint THREADCOUNT = 16; -static const int TILE_BORDER = 18; +static const int TILE_BORDER = 16; #else static const uint THREADCOUNT = 8; static const int TILE_BORDER = 4; @@ -25,7 +25,7 @@ groupshared uint group_valid; inline uint coord_to_cache(int2 coord) { - return flatten2D(clamp(TILE_BORDER + coord, 0, TILE_SIZE - 1), TILE_SIZE); + return flatten2D(clamp(coord, 0, TILE_SIZE - 1), TILE_SIZE); } static const float depthRejection = 8; @@ -34,11 +34,10 @@ static const float depthRejection_rcp = rcp(depthRejection); float3 compute_diffuse( float3 origin_position, float3 origin_normal, - int2 GTid, - int2 offset + int2 originLoc, // coord in cache + int2 sampleLoc // coord in cache ) { - const int2 sampleLoc = GTid + offset; const uint t = coord_to_cache(sampleLoc); uint c = cache_rgb[t]; if(c == 0) @@ -56,7 +55,7 @@ float3 compute_diffuse( const float sample_z = sample_position.z; // DDA occlusion: - const int2 start = GTid; + const int2 start = originLoc; const int2 goal = sampleLoc; const int dx = int(goal.x) - int(start.x); @@ -86,7 +85,8 @@ float3 compute_diffuse( const float sz = cache_z[tt]; if(sz < z - 0.1) { - return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]); + c = cache_rgb[tt]; + break; } } } @@ -127,7 +127,8 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : if (group_valid == 0) return; // if no valid color was cached, whole group can exit early - const uint t = coord_to_cache(GTid.xy); + const int2 originLoc = GTid.xy + TILE_BORDER; + const uint t = coord_to_cache(originLoc); float3 P; P.z = cache_z[t]; @@ -137,23 +138,26 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : P.xy = unpack_half2(cache_xy[t]); - const uint2 pixel = DTid.xy; const float3 N = mul((float3x3)GetCamera().view, decode_oct(input_normal[interleaved_pixel].rg)); float3 diffuse = 0; float sum = 0; const int range = int(postprocess.params0.x); - const float spread = postprocess.params0.y + dither(pixel); + const float spread = postprocess.params0.y /*+ dither(DTid.xy)*/; const float rangespread_rcp2 = postprocess.params0.z; - + + const int2 pixel_base = Gid.xy * THREADCOUNT + GTid; for(int x = -range; x <= range; ++x) { for(int y = -range; y <= range; ++y) { + const int2 pixel = pixel_base + int2(x, y); + if(any(pixel < 0) || any(pixel >= postprocess.resolution)) + continue; // to not lose energy when sampling outside of textures, we skip those offsets const float2 foffset = float2(x, y) * spread; const int2 offset = round(foffset); const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2); - diffuse += compute_diffuse(P, N, GTid, offset) * weight; + diffuse += compute_diffuse(P, N, originLoc, originLoc + offset) * weight; sum += weight; } } diff --git a/WickedEngine/shaders/ssgi_deinterleaveCS.hlsl b/WickedEngine/shaders/ssgi_deinterleaveCS.hlsl index b392ce3d7..70339af04 100644 --- a/WickedEngine/shaders/ssgi_deinterleaveCS.hlsl +++ b/WickedEngine/shaders/ssgi_deinterleaveCS.hlsl @@ -12,17 +12,17 @@ RWTexture2DArray atlas4x_color : register(u5); RWTexture2DArray atlas8x_color : register(u6); RWTexture2DArray atlas16x_color : register(u7); RWTexture2D regular2x_depth : register(u8); -RWTexture2D regular2x_normal : register(u9); -RWTexture2D regular4x_depth : register(u10); -RWTexture2D regular4x_normal : register(u11); -RWTexture2D regular8x_depth : register(u12); -RWTexture2D regular8x_normal : register(u13); -RWTexture2D regular16x_depth : register(u14); +RWTexture2D regular4x_depth : register(u9); +RWTexture2D regular8x_depth : register(u10); +RWTexture2D regular16x_depth : register(u11); +RWTexture2D regular2x_normal : register(u12); +RWTexture2D regular4x_normal : register(u13); +RWTexture2D regular8x_normal : register(u14); RWTexture2D regular16x_normal : register(u15); groupshared float shared_depths[256]; -groupshared float2 shared_normals[256]; -groupshared float3 shared_colors[256]; +groupshared uint shared_normals[256]; +groupshared uint shared_colors[256]; [numthreads(8, 8, 1)] void main(uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 GTid : SV_GroupThreadID, uint3 DTid : SV_DispatchThreadID) @@ -38,10 +38,10 @@ void main(uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 GTid : shared_depths[destIdx + 128] = texture_depth[min(startST | uint2(0, 8), dim - 1)]; shared_depths[destIdx + 136] = texture_depth[min(startST | uint2(8, 8), dim - 1)]; - shared_normals[destIdx + 0] = texture_normal[min(startST | uint2(0, 0), dim - 1)]; - shared_normals[destIdx + 8] = texture_normal[min(startST | uint2(8, 0), dim - 1)]; - shared_normals[destIdx + 128] = texture_normal[min(startST | uint2(0, 8), dim - 1)]; - shared_normals[destIdx + 136] = texture_normal[min(startST | uint2(8, 8), dim - 1)]; + shared_normals[destIdx + 0] = pack_half2(texture_normal[min(startST | uint2(0, 0), dim - 1)]); + shared_normals[destIdx + 8] = pack_half2(texture_normal[min(startST | uint2(8, 0), dim - 1)]); + shared_normals[destIdx + 128] = pack_half2(texture_normal[min(startST | uint2(0, 8), dim - 1)]); + shared_normals[destIdx + 136] = pack_half2(texture_normal[min(startST | uint2(8, 8), dim - 1)]); const float2 uv0 = float2(startST | uint2(0, 0)) * dim_rcp; const float2 uv1 = float2(startST | uint2(8, 0)) * dim_rcp; @@ -55,18 +55,18 @@ void main(uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 GTid : const float2 prevUV1 = uv1 + velocity1; const float2 prevUV2 = uv2 + velocity2; const float2 prevUV3 = uv3 + velocity3; - shared_colors[destIdx + 0] = texture_input.SampleLevel(sampler_linear_clamp, prevUV0, 0); - shared_colors[destIdx + 8] = texture_input.SampleLevel(sampler_linear_clamp, prevUV1, 0); - shared_colors[destIdx + 128] = texture_input.SampleLevel(sampler_linear_clamp, prevUV2, 0); - shared_colors[destIdx + 136] = texture_input.SampleLevel(sampler_linear_clamp, prevUV3, 0); + shared_colors[destIdx + 0] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV0, 0)); + shared_colors[destIdx + 8] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV1, 0)); + shared_colors[destIdx + 128] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV2, 0)); + shared_colors[destIdx + 136] = Pack_R11G11B10_FLOAT(texture_input.SampleLevel(sampler_linear_clamp, prevUV3, 0)); GroupMemoryBarrierWithGroupSync(); uint ldsIndex = (GTid.x << 1) | (GTid.y << 5); float depth = shared_depths[ldsIndex]; - float2 normal = shared_normals[ldsIndex]; - float3 color = shared_colors[ldsIndex]; + float2 normal = unpack_half2(shared_normals[ldsIndex]); + float3 color = Unpack_R11G11B10_FLOAT(shared_colors[ldsIndex]); color = color - 0.2; // cut out pixels that shouldn't act as lights color *= 0.9; // accumulation energy loss diff --git a/WickedEngine/shaders/ssgi_upsampleCS.hlsl b/WickedEngine/shaders/ssgi_upsampleCS.hlsl index 8d0c9b44d..81460dc77 100644 --- a/WickedEngine/shaders/ssgi_upsampleCS.hlsl +++ b/WickedEngine/shaders/ssgi_upsampleCS.hlsl @@ -42,8 +42,8 @@ void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) const int range = int(postprocess.params0.x); const float spread = postprocess.params0.y; #else - const int range = 1; - const float spread = 8; + const int range = 2; + const float spread = 6; #endif for(int x = -range; x <= range; ++x) { diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index 140d44398..a9f6a7d7a 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -12444,6 +12444,8 @@ void Postprocess_RTDiffuse( } void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution) { + res.cleared = false; + TextureDesc desc; desc.type = TextureDesc::Type::TEXTURE_2D; desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; @@ -12456,34 +12458,11 @@ void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution) desc.width = (resolution.x + 7) / 8; desc.height = (resolution.y + 7) / 8; desc.array_size = 16; + desc.mip_levels = 4; desc.format = Format::R32_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas2x_depth); + device->CreateTexture(&desc, nullptr, &res.texture_atlas_depth); desc.format = Format::R11G11B10_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas2x_color); - - desc.width = (resolution.x + 15) / 16; - desc.height = (resolution.y + 15) / 16; - desc.array_size = 16; - desc.format = Format::R32_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas4x_depth); - desc.format = Format::R11G11B10_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas4x_color); - - desc.width = (resolution.x + 31) / 32; - desc.height = (resolution.y + 31) / 32; - desc.array_size = 16; - desc.format = Format::R32_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas8x_depth); - desc.format = Format::R11G11B10_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas8x_color); - - desc.width = (resolution.x + 63) / 64; - desc.height = (resolution.y + 63) / 64; - desc.array_size = 16; - desc.format = Format::R32_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas16x_depth); - desc.format = Format::R11G11B10_FLOAT; - device->CreateTexture(&desc, nullptr, &res.texture_atlas16x_color); + device->CreateTexture(&desc, nullptr, &res.texture_atlas_color); desc.array_size = 1; desc.mip_levels = 4; @@ -12496,9 +12475,17 @@ void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution) desc.format = Format::R11G11B10_FLOAT; device->CreateTexture(&desc, nullptr, &res.texture_diffuse_mips); - for (uint32_t i = 0; i < desc.mip_levels; ++i) + for (uint32_t i = 0; i < 4u; ++i) { int subresource_index; + subresource_index = device->CreateSubresource(&res.texture_atlas_depth, SubresourceType::SRV, 0, 16, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_atlas_depth, SubresourceType::UAV, 0, 16, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_atlas_color, SubresourceType::SRV, 0, 16, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_atlas_color, SubresourceType::UAV, 0, 16, i, 1); + assert(subresource_index == i); subresource_index = device->CreateSubresource(&res.texture_depth_mips, SubresourceType::SRV, 0, 1, i, 1); assert(subresource_index == i); subresource_index = device->CreateSubresource(&res.texture_depth_mips, SubresourceType::UAV, 0, 1, i, 1); @@ -12527,14 +12514,8 @@ void Postprocess_SSGI( { GPUBarrier barriers[] = { - GPUBarrier::Image(&res.texture_atlas2x_depth, res.texture_atlas2x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_atlas4x_depth, res.texture_atlas4x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_atlas8x_depth, res.texture_atlas8x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_atlas16x_depth, res.texture_atlas16x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_atlas2x_color, res.texture_atlas2x_color.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_atlas4x_color, res.texture_atlas4x_color.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_atlas8x_color, res.texture_atlas8x_color.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_atlas16x_color, res.texture_atlas16x_color.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas_depth, res.texture_atlas_depth.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas_color, res.texture_atlas_color.desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_depth_mips, res.texture_depth_mips.desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_normal_mips, res.texture_normal_mips.desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_diffuse_mips, res.texture_diffuse_mips.desc.layout, ResourceState::UNORDERED_ACCESS), @@ -12543,31 +12524,20 @@ void Postprocess_SSGI( device->Barrier(barriers, arraysize(barriers), cmd); } - device->ClearUAV(&res.texture_atlas2x_depth, 0, cmd); - device->ClearUAV(&res.texture_atlas4x_depth, 0, cmd); - device->ClearUAV(&res.texture_atlas8x_depth, 0, cmd); - device->ClearUAV(&res.texture_atlas16x_depth, 0, cmd); - device->ClearUAV(&res.texture_atlas2x_color, 0, cmd); - device->ClearUAV(&res.texture_atlas4x_color, 0, cmd); - device->ClearUAV(&res.texture_atlas8x_color, 0, cmd); - device->ClearUAV(&res.texture_atlas16x_color, 0, cmd); - device->ClearUAV(&res.texture_depth_mips, 0, cmd); - device->ClearUAV(&res.texture_normal_mips, 0, cmd); + if (!res.cleared) + { + res.cleared = true; + device->ClearUAV(&res.texture_atlas_depth, 0, cmd); + device->ClearUAV(&res.texture_atlas_color, 0, cmd); + device->ClearUAV(&res.texture_depth_mips, 0, cmd); + device->ClearUAV(&res.texture_normal_mips, 0, cmd); + } device->ClearUAV(&res.texture_diffuse_mips, 0, cmd); device->ClearUAV(&output, 0, cmd); { GPUBarrier barriers[] = { - GPUBarrier::Memory(&res.texture_atlas2x_depth), - GPUBarrier::Memory(&res.texture_atlas4x_depth), - GPUBarrier::Memory(&res.texture_atlas8x_depth), - GPUBarrier::Memory(&res.texture_atlas16x_depth), - GPUBarrier::Memory(&res.texture_atlas2x_color), - GPUBarrier::Memory(&res.texture_atlas4x_color), - GPUBarrier::Memory(&res.texture_atlas8x_color), - GPUBarrier::Memory(&res.texture_atlas16x_color), - GPUBarrier::Memory(&res.texture_depth_mips), - GPUBarrier::Memory(&res.texture_normal_mips), + GPUBarrier::Memory(), }; device->Barrier(barriers, arraysize(barriers), cmd); } @@ -12582,31 +12552,30 @@ void Postprocess_SSGI( device->BindResource(&input, 0, cmd); - const GPUResource* uavs[] = { - &res.texture_atlas2x_depth, - &res.texture_atlas4x_depth, - &res.texture_atlas8x_depth, - &res.texture_atlas16x_depth, - &res.texture_atlas2x_color, - &res.texture_atlas4x_color, - &res.texture_atlas8x_color, - &res.texture_atlas16x_color, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + device->BindUAV(&res.texture_atlas_depth, 0, cmd, 0); + device->BindUAV(&res.texture_atlas_depth, 1, cmd, 1); + device->BindUAV(&res.texture_atlas_depth, 2, cmd, 2); + device->BindUAV(&res.texture_atlas_depth, 3, cmd, 3); - device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 0, cmd, 0); - device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 1, cmd, 0); - device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 2, cmd, 1); - device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 3, cmd, 1); - device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 4, cmd, 2); - device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 5, cmd, 2); - device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 6, cmd, 3); - device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 7, cmd, 3); + device->BindUAV(&res.texture_atlas_color, 4, cmd, 0); + device->BindUAV(&res.texture_atlas_color, 5, cmd, 1); + device->BindUAV(&res.texture_atlas_color, 6, cmd, 2); + device->BindUAV(&res.texture_atlas_color, 7, cmd, 3); - const TextureDesc& desc = res.texture_atlas4x_depth.GetDesc(); + device->BindUAV(&res.texture_depth_mips, 8, cmd, 0); + device->BindUAV(&res.texture_depth_mips, 9, cmd, 1); + device->BindUAV(&res.texture_depth_mips, 10, cmd, 2); + device->BindUAV(&res.texture_depth_mips, 11, cmd, 3); + + device->BindUAV(&res.texture_normal_mips, 12, cmd, 0); + device->BindUAV(&res.texture_normal_mips, 13, cmd, 1); + device->BindUAV(&res.texture_normal_mips, 14, cmd, 2); + device->BindUAV(&res.texture_normal_mips, 15, cmd, 3); + + const TextureDesc& desc = res.texture_atlas_depth.GetDesc(); device->Dispatch( - desc.width, - desc.height, + desc.width >> 1, + desc.height >> 1, 1, cmd ); @@ -12616,15 +12585,8 @@ void Postprocess_SSGI( { GPUBarrier barriers[] = { - GPUBarrier::Memory(&res.texture_diffuse_mips), - GPUBarrier::Image(&res.texture_atlas2x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas2x_depth.desc.layout), - GPUBarrier::Image(&res.texture_atlas4x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas4x_depth.desc.layout), - GPUBarrier::Image(&res.texture_atlas8x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas8x_depth.desc.layout), - GPUBarrier::Image(&res.texture_atlas16x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas16x_depth.desc.layout), - GPUBarrier::Image(&res.texture_atlas2x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas2x_color.desc.layout), - GPUBarrier::Image(&res.texture_atlas4x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas4x_color.desc.layout), - GPUBarrier::Image(&res.texture_atlas8x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas8x_color.desc.layout), - GPUBarrier::Image(&res.texture_atlas16x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas16x_color.desc.layout), + GPUBarrier::Image(&res.texture_atlas_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas_depth.desc.layout), + GPUBarrier::Image(&res.texture_atlas_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas_color.desc.layout), GPUBarrier::Image(&res.texture_depth_mips, ResourceState::UNORDERED_ACCESS, res.texture_depth_mips.desc.layout), GPUBarrier::Image(&res.texture_normal_mips, ResourceState::UNORDERED_ACCESS, res.texture_normal_mips.desc.layout), }; @@ -12634,110 +12596,20 @@ void Postprocess_SSGI( { device->EventBegin("SSGI - diffuse", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI], cmd); - - // 2x: - { - const GPUResource* resarray[] = { - &res.texture_atlas2x_depth, - &res.texture_atlas2x_color, - }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); - device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 0); - device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 0); - - const TextureDesc& desc = res.texture_atlas2x_depth.GetDesc(); - - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; - postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; - postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; - postprocess.params0.x = 1; // range - postprocess.params0.y = 2; // spread - postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - device->Dispatch( - (desc.width + 7) / 8, - (desc.height + 7) / 8, - 16, - cmd - ); - } - // 4x: - { - const GPUResource* resarray[] = { - &res.texture_atlas4x_depth, - &res.texture_atlas4x_color, - }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); - device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 1); - device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 1); - - const TextureDesc& desc = res.texture_atlas4x_depth.GetDesc(); - - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; - postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; - postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; - postprocess.params0.x = 2; // range - postprocess.params0.y = 2; // spread - postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - device->Dispatch( - (desc.width + 7) / 8, - (desc.height + 7) / 8, - 16, - cmd - ); - } - - // Switch to wide sampling shader: + // Wide sampling passes: device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_WIDE], cmd); - // 8x: - { - const GPUResource* resarray[] = { - &res.texture_atlas8x_depth, - &res.texture_atlas8x_color, - }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); - device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 2); - device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 2); - - const TextureDesc& desc = res.texture_atlas8x_depth.GetDesc(); - - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; - postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; - postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; - postprocess.params0.x = 4; // range - postprocess.params0.y = 4; // spread - postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - device->Dispatch( - (desc.width + 15) / 16, - (desc.height + 15) / 16, - 16, - cmd - ); - } // 16x: { - const GPUResource* resarray[] = { - &res.texture_atlas16x_depth, - &res.texture_atlas16x_color, - }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); - device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 3); + device->BindResource(&res.texture_atlas_depth, 0, cmd, 3); + device->BindResource(&res.texture_atlas_color, 1, cmd, 3); + device->BindResource(&res.texture_normal_mips, 2, cmd, 3); device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 3); - const TextureDesc& desc = res.texture_atlas16x_depth.GetDesc(); + const TextureDesc& desc = res.texture_atlas_depth.GetDesc(); - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; + postprocess.resolution.x = desc.width >> 3; + postprocess.resolution.y = desc.height >> 3; postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; postprocess.params0.x = 8; // range @@ -12746,8 +12618,88 @@ void Postprocess_SSGI( device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( - (desc.width + 15) / 16, - (desc.height + 15) / 16, + (postprocess.resolution.x + 15) / 16, + (postprocess.resolution.y + 15) / 16, + 16, + cmd + ); + } + // 8x: + { + device->BindResource(&res.texture_atlas_depth, 0, cmd, 2); + device->BindResource(&res.texture_atlas_color, 1, cmd, 2); + device->BindResource(&res.texture_normal_mips, 2, cmd, 2); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 2); + + const TextureDesc& desc = res.texture_atlas_depth.GetDesc(); + + postprocess.resolution.x = desc.width >> 2; + postprocess.resolution.y = desc.height >> 2; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 4; // range + postprocess.params0.y = 4; // spread + postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (postprocess.resolution.x + 15) / 16, + (postprocess.resolution.y + 15) / 16, + 16, + cmd + ); + } + + // Narrow sampling passes: + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI], cmd); + + // 4x: + { + device->BindResource(&res.texture_atlas_depth, 0, cmd, 1); + device->BindResource(&res.texture_atlas_color, 1, cmd, 1); + device->BindResource(&res.texture_normal_mips, 2, cmd, 1); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 1); + + const TextureDesc& desc = res.texture_atlas_depth.GetDesc(); + + postprocess.resolution.x = desc.width >> 1u; + postprocess.resolution.y = desc.height >> 1u; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 2; // range + postprocess.params0.y = 2; // spread + postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (postprocess.resolution.x + 7) / 8, + (postprocess.resolution.y + 7) / 8, + 16, + cmd + ); + } + + // 2x: + { + device->BindResource(&res.texture_atlas_depth, 0, cmd, 0); + device->BindResource(&res.texture_atlas_color, 1, cmd, 0); + device->BindResource(&res.texture_normal_mips, 2, cmd, 0); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 0); + + const TextureDesc& desc = res.texture_atlas_depth.GetDesc(); + + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 1; // range + postprocess.params0.y = 4; // spread + postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (postprocess.resolution.x + 7) / 8, + (postprocess.resolution.y + 7) / 8, 16, cmd ); @@ -12782,7 +12734,7 @@ void Postprocess_SSGI( postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; postprocess.params0.x = 2; // range - postprocess.params0.y = 8; // spread + postprocess.params0.y = 4; // spread device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( @@ -12885,8 +12837,8 @@ void Postprocess_SSGI( device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( - (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, // dispatch is using desc size (unaligned!) + (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, // dispatch is using desc size (unaligned!) 1, cmd ); diff --git a/WickedEngine/wiRenderer.h b/WickedEngine/wiRenderer.h index e8920e5dd..90f9aa787 100644 --- a/WickedEngine/wiRenderer.h +++ b/WickedEngine/wiRenderer.h @@ -554,14 +554,9 @@ namespace wi::renderer ); struct SSGIResources { - wi::graphics::Texture texture_atlas2x_depth; - wi::graphics::Texture texture_atlas4x_depth; - wi::graphics::Texture texture_atlas8x_depth; - wi::graphics::Texture texture_atlas16x_depth; - wi::graphics::Texture texture_atlas2x_color; - wi::graphics::Texture texture_atlas4x_color; - wi::graphics::Texture texture_atlas8x_color; - wi::graphics::Texture texture_atlas16x_color; + mutable bool cleared = false; + wi::graphics::Texture texture_atlas_depth; + wi::graphics::Texture texture_atlas_color; wi::graphics::Texture texture_depth_mips; wi::graphics::Texture texture_normal_mips; wi::graphics::Texture texture_diffuse_mips; diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index b0d9562dd..1d1a26b80 100644 --- a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wi::version // minor features, major updates, breaking compatibility changes const int minor = 71; // minor bug fixes, alterations, refactors, updates - const int revision = 418; + const int revision = 419; const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision); diff --git a/enc_temp_folder/2f9a23bddcd8e6a426511f3643315cb7/ssgi_upsampleCS.hlsl b/enc_temp_folder/2f9a23bddcd8e6a426511f3643315cb7/ssgi_upsampleCS.hlsl deleted file mode 100644 index 5033b2433..000000000 --- a/enc_temp_folder/2f9a23bddcd8e6a426511f3643315cb7/ssgi_upsampleCS.hlsl +++ /dev/null @@ -1,74 +0,0 @@ -#include "globals.hlsli" -#include "stochasticSSRHF.hlsli" -#include "ShaderInterop_Postprocess.h" - -PUSHCONSTANT(postprocess, PostProcess); - -Texture2D input_depth_low : register(t0); -Texture2D input_normal_low : register(t1); -Texture2D input_diffuse_low : register(t2); -Texture2D input_depth_high : register(t3); -Texture2D input_normal_high : register(t4); - -RWTexture2D output : register(u0); - -static const float depthThreshold = 1000.0; -static const float normalThreshold = 1.0; - -[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] -void main(uint2 DTid : SV_DispatchThreadID) -{ - uint2 pixel = DTid.xy; - const float2 uv = (pixel + 0.5) * postprocess.resolution_rcp; - - const float depth = input_depth_high[pixel]; - const float linearDepth = compute_lineardepth(depth); - const float3 N = decode_oct(input_normal_high[pixel].rg); - const float3 P = reconstruct_position(uv, depth); - - float3 result = 0; - float sum = 0; -#if 1 - const int range = int(postprocess.params0.x); - const float spread = postprocess.params0.y; -#else - const int range = 1; - const float spread = 8; -#endif - for(int x = -range; x <= range; ++x) - { - for(int y = -range; y <= range; ++y) - { - const float2 offset = float2(x, y) * spread * postprocess.resolution_rcp; - const float2 sample_uv = uv + offset; - - const float sampleDepth = input_depth_low.SampleLevel(sampler_linear_clamp, sample_uv, 0); - const float3 sampleN = decode_oct(input_normal_low.SampleLevel(sampler_linear_clamp, sample_uv, 0)); - const float3 sampleDiffuse = input_diffuse_low.SampleLevel(sampler_linear_clamp, sample_uv, 0).rgb; - const float3 sampleP = reconstruct_position(sample_uv, sampleDepth); - - float3 dq = P - sampleP; - float planeError = max(abs(dot(dq, sampleN)), abs(dot(dq, N))); - float relativeDepthDifference = planeError / linearDepth; - float bilateralDepthWeight = exp(-sqr(relativeDepthDifference) * depthThreshold); - - float normalError = pow(saturate(dot(sampleN, N)), 4.0); - float bilateralNormalWeight = saturate(1.0 - (1.0 - normalError) * normalThreshold); - - float weight = bilateralDepthWeight * bilateralNormalWeight; - - //weight = 1; - result += sampleDiffuse * weight; - sum += weight; - } - } - - if(sum > 0) - { - result /= sum; - } - - result = max(0, result); - - output[pixel] = (output[pixel] + float4(result, 1)) ; -} diff --git a/enc_temp_folder/343d8179d01536ce28957367d6f61811/ssgiCS.hlsl b/enc_temp_folder/343d8179d01536ce28957367d6f61811/ssgiCS.hlsl deleted file mode 100644 index 8e972abf8..000000000 --- a/enc_temp_folder/343d8179d01536ce28957367d6f61811/ssgiCS.hlsl +++ /dev/null @@ -1,192 +0,0 @@ -#include "globals.hlsli" -#include "stochasticSSRHF.hlsli" -#include "ShaderInterop_Postprocess.h" - -PUSHCONSTANT(postprocess, PostProcess); - -Texture2D input : register(t0); -Texture2DArray input_depth : register(t1); -Texture2D input_normal : register(t2); - -RWTexture2D output_diffuse : register(u0); - -#ifdef WIDE -static const uint THREADCOUNT = 16; -static const int TILE_BORDER = 18; -#else -static const uint THREADCOUNT = 8; -static const int TILE_BORDER = 4; -#endif // WIDE -static const int TILE_SIZE = TILE_BORDER + THREADCOUNT + TILE_BORDER; -groupshared uint cache_xy[TILE_SIZE * TILE_SIZE]; -groupshared float cache_z[TILE_SIZE * TILE_SIZE]; -groupshared uint cache_rgb[TILE_SIZE * TILE_SIZE]; -groupshared uint group_valid; - -inline uint coord_to_cache(int2 coord) -{ - return flatten2D(clamp(TILE_BORDER + coord, 0, TILE_SIZE - 1), TILE_SIZE); -} - -static const float radius = 14; -static const float radius2 = radius * radius; -static const float radius2_rcp_negative = -rcp(radius2); - -#if 0 -static const uint depth_test_count = 1; -static const float depth_tests[] = {0.33}; -#else -static const uint depth_test_count = 3; -static const float depth_tests[] = {0.125, 0.25, 0.75}; -#endif - -float3 compute_diffuse( - float3 origin_position, - float3 origin_normal, - int2 GTid, - int2 offset -) -{ - const int2 sampleLoc = GTid + offset; - const uint t = coord_to_cache(sampleLoc); - float3 sample_position; - sample_position.z = cache_z[t]; - if(sample_position.z > GetCamera().z_far - 1) - return 0; - sample_position.xy = unpack_half2(cache_xy[t]); - const float3 origin_to_sample = sample_position - origin_position; - const float distance2 = dot(origin_to_sample, origin_to_sample); - float occlusion = saturate(dot(origin_normal, origin_to_sample)); - occlusion *= saturate(distance2 * radius2_rcp_negative + 1.0f); - - if(occlusion > 0) - { - const float origin_z = origin_position.z; - const float sample_z = sample_position.z; - -#if 1 - // DDA occlusion: - const int2 start = GTid; - const int2 goal = sampleLoc; - - const int dx = int(goal.x) - int(start.x); - const int dy = int(goal.y) - int(start.y); - - int step = max(abs(dx), abs(dy)); - step = (step + 1) / 2; // reduce steps - const float step_rcp = rcp(step); - - const float x_incr = float(dx) * step_rcp; - const float y_incr = float(dy) * step_rcp; - - float x = float(start.x); - float y = float(start.y); - - for (int i = 0; i < step - 1; i++) - { - x += x_incr; - y += y_incr; - - const int2 loc = int2(round(x), round(y)); - const uint tt = coord_to_cache(loc); - - const float dt = float(i) / float(step); - const float z = lerp(origin_z, sample_z, dt); - - const float sz = cache_z[tt]; - if(sz < z - 0.1) - { - return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]); - } - } -#else - // Simple occlusion: - for (uint i = 0; i < depth_test_count; ++i) - { - const float dt = depth_tests[i]; - const float z = lerp(origin_z, sample_z, dt); - const int2 loc = round(lerp(float2(GTid), float2(sampleLoc), dt)); - const uint tt = coord_to_cache(loc); - const float sz = cache_z[tt]; - if (sz < z - 0.1) - { - return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]); - } - } -#endif - } - - return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[t]); -} - -[numthreads(THREADCOUNT, THREADCOUNT, 1)] -void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : SV_GroupThreadID, uint groupIndex : SV_GroupIndex) -{ - const uint layer = DTid.z; - const uint2 interleaved_pixel = DTid.xy << 2 | uint2(DTid.z & 3, DTid.z >> 2); - - if(groupIndex == 0) - { - group_valid = 0; - } - GroupMemoryBarrierWithGroupSync(); - - const int2 tile_upperleft = Gid.xy * THREADCOUNT - TILE_BORDER; - for(uint t = groupIndex; t < TILE_SIZE * TILE_SIZE; t += THREADCOUNT * THREADCOUNT) - { - const int2 pixel = tile_upperleft + unflatten2D(t, TILE_SIZE); - const float depth = input_depth[uint3(pixel, layer)]; - const float2 uv = (pixel + 0.5f) * postprocess.resolution_rcp; - const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection); - const float3 color = input.SampleLevel(sampler_linear_clamp, uv, 0).rgb; - const uint pkcolor = Pack_R11G11B10_FLOAT(color.rgb); - cache_xy[t] = pack_half2(P.xy); - cache_z[t] = P.z; - cache_rgb[t] = pkcolor; - if(pkcolor) - InterlockedOr(group_valid, 1u); - } - GroupMemoryBarrierWithGroupSync(); - - [branch] - if (group_valid == 0) - return; // if no valid color was cached, whole group can exit early - - const uint t = coord_to_cache(GTid.xy); - float3 P; - P.z = cache_z[t]; - - [branch] - if(P.z > GetCamera().z_far - 1) - return; // if pixel depth is not valid, it can exit early - - P.xy = unpack_half2(cache_xy[t]); - - const uint2 pixel = DTid.xy; - const float3 N = mul((float3x3)GetCamera().view, decode_oct(input_normal[interleaved_pixel].rg)); - - float3 diffuse = 0; - float sum = 0; - const int range = int(postprocess.params0.x); - const float spread = postprocess.params0.y /*+ dither(pixel)*/; - const float rangespread_rcp2 = postprocess.params0.z; - - for(int x = -range; x <= range; ++x) - { - for(int y = -range; y <= range; ++y) - { - const float2 foffset = float2(x, y) * spread; - const int2 offset = round(foffset); - const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2); - diffuse += compute_diffuse(P, N, GTid, offset) * weight; - sum += weight; - } - } - if(sum > 0) - { - diffuse = diffuse / sum; - } - - // interleave result: - output_diffuse[interleaved_pixel] = float4(diffuse, 1); -}