From 64888647b0a697ae79f51e2d1ef91a0626cfa07d Mon Sep 17 00:00:00 2001 From: Silas Oler Date: Sun, 27 Feb 2022 12:15:31 +0100 Subject: [PATCH] Traced Reflection Improvements (#395) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Traced Reflection Improvements * bilateral shader compiler fix * hierarchy shader compiler fix * Use texture_depth in hierarchy * Bilateral pass output fix & surface pass early exit * rtreflection fix: missing push constants after shader change * normals signedness fix * version bump Co-authored-by: Turánszki János --- WickedEngine/offlineshadercompiler.cpp | 11 +- .../shaders/ShaderInterop_Postprocess.h | 3 +- WickedEngine/shaders/Shaders_SOURCE.vcxitems | 42 +- .../shaders/Shaders_SOURCE.vcxitems.filters | 27 +- WickedEngine/shaders/rtreflectionCS.hlsl | 46 +- WickedEngine/shaders/rtreflectionLIB.hlsl | 48 +- WickedEngine/shaders/ssr_bilateralCS.hlsl | 106 ++ .../shaders/ssr_depthHierarchyCS.hlsl | 42 + WickedEngine/shaders/ssr_kickjobsCS.hlsl | 27 + WickedEngine/shaders/ssr_medianCS.hlsl | 66 -- WickedEngine/shaders/ssr_raytraceCS.hlsl | 612 +++++------ .../shaders/ssr_raytraceCS_cheap.hlsl | 2 + .../shaders/ssr_raytraceCS_earlyexit.hlsl | 2 + WickedEngine/shaders/ssr_resolveCS.hlsl | 299 +++--- WickedEngine/shaders/ssr_surfaceCS.hlsl | 51 + WickedEngine/shaders/ssr_temporalCS.hlsl | 323 +++--- .../ssr_tileMaxRoughness_horizontalCS.hlsl | 43 + .../ssr_tileMaxRoughness_verticalCS.hlsl | 58 ++ WickedEngine/shaders/stochasticSSRHF.hlsli | 172 +--- WickedEngine/wiEnums.h | 9 +- WickedEngine/wiRenderPath3D.cpp | 8 +- WickedEngine/wiRenderer.cpp | 948 ++++++++++++++---- WickedEngine/wiRenderer.h | 33 +- WickedEngine/wiVersion.cpp | 2 +- 24 files changed, 1938 insertions(+), 1042 deletions(-) create mode 100644 WickedEngine/shaders/ssr_bilateralCS.hlsl create mode 100644 WickedEngine/shaders/ssr_depthHierarchyCS.hlsl create mode 100644 WickedEngine/shaders/ssr_kickjobsCS.hlsl delete mode 100644 WickedEngine/shaders/ssr_medianCS.hlsl create mode 100644 WickedEngine/shaders/ssr_raytraceCS_cheap.hlsl create mode 100644 WickedEngine/shaders/ssr_raytraceCS_earlyexit.hlsl create mode 100644 WickedEngine/shaders/ssr_surfaceCS.hlsl create mode 100644 WickedEngine/shaders/ssr_tileMaxRoughness_horizontalCS.hlsl create mode 100644 WickedEngine/shaders/ssr_tileMaxRoughness_verticalCS.hlsl diff --git a/WickedEngine/offlineshadercompiler.cpp b/WickedEngine/offlineshadercompiler.cpp index af45ddcc2..87630abc6 100644 --- a/WickedEngine/offlineshadercompiler.cpp +++ b/WickedEngine/offlineshadercompiler.cpp @@ -110,10 +110,17 @@ int main(int argc, char* argv[]) "fsr_sharpenCS.hlsl" , "ssaoCS.hlsl" , "rtreflectionCS.hlsl" , - "ssr_raytraceCS.hlsl" , + "ssr_surfaceCS.hlsl" , + "ssr_tileMaxRoughness_horizontalCS.hlsl" , + "ssr_tileMaxRoughness_verticalCS.hlsl" , + "ssr_kickjobsCS.hlsl" , + "ssr_depthHierarchyCS.hlsl" , "ssr_resolveCS.hlsl" , "ssr_temporalCS.hlsl" , - "ssr_medianCS.hlsl" , + "ssr_bilateralCS.hlsl" , + "ssr_raytraceCS.hlsl" , + "ssr_raytraceCS_cheap.hlsl" , + "ssr_raytraceCS_earlyexit.hlsl" , "sharpenCS.hlsl" , "skinningCS.hlsl" , "resolveMSAADepthStencilCS.hlsl" , diff --git a/WickedEngine/shaders/ShaderInterop_Postprocess.h b/WickedEngine/shaders/ShaderInterop_Postprocess.h index 363686ddf..0ff67e899 100644 --- a/WickedEngine/shaders/ShaderInterop_Postprocess.h +++ b/WickedEngine/shaders/ShaderInterop_Postprocess.h @@ -40,8 +40,7 @@ struct Bloom #define lineardepth_inputresolution postprocess.params0.xy #define lineardepth_inputresolution_rcp postprocess.params0.zw -#define ssr_input_maxmip postprocess.params0.x -#define ssr_input_resolution_max postprocess.params0.y +static const uint SSR_TILESIZE = 32; #define ssr_frame postprocess.params0.w #define ssao_range postprocess.params0.x diff --git a/WickedEngine/shaders/Shaders_SOURCE.vcxitems b/WickedEngine/shaders/Shaders_SOURCE.vcxitems index 2cdbc092a..b19d32856 100644 --- a/WickedEngine/shaders/Shaders_SOURCE.vcxitems +++ b/WickedEngine/shaders/Shaders_SOURCE.vcxitems @@ -998,6 +998,38 @@ Compute Compute + + Compute + 4.0 + + + Compute + 4.0 + + + Compute + 4.0 + + + Compute + 4.0 + + + Compute + 4.0 + + + Compute + 4.0 + + + Compute + 4.0 + + + Compute + 4.0 + Compute 4.0 @@ -2561,16 +2593,6 @@ Compute Compute - - Compute - Compute - Compute - Compute - Compute - Compute - Compute - Compute - Compute Compute diff --git a/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters b/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters index c9b613559..30faa01cc 100644 --- a/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters +++ b/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters @@ -230,9 +230,6 @@ CS - - CS - CS @@ -1025,6 +1022,30 @@ CS + + CS + + + CS + + + CS + + + CS + + + CS + + + CS + + + CS + + + CS + diff --git a/WickedEngine/shaders/rtreflectionCS.hlsl b/WickedEngine/shaders/rtreflectionCS.hlsl index 4892a1e36..a6a6bd1f3 100644 --- a/WickedEngine/shaders/rtreflectionCS.hlsl +++ b/WickedEngine/shaders/rtreflectionCS.hlsl @@ -11,8 +11,13 @@ PUSHCONSTANT(postprocess, PostProcess); -RWTexture2D output : register(u0); -RWTexture2D output_rayLengths : register(u1); +Texture2D texture_surface_normal : register(t0); +Texture2D texture_surface_roughness : register(t1); +Texture2D texture_surface_environment : register(t2); + +RWTexture2D output_rayIndirectSpecular : register(u0); +RWTexture2D output_rayDirectionPDF : register(u1); +RWTexture2D output_rayLengths : register(u2); struct RayPayload { @@ -23,34 +28,30 @@ struct RayPayload void main(uint2 DTid : SV_DispatchThreadID) { const float2 uv = ((float2)DTid.xy + 0.5) * postprocess.resolution_rcp; - const float depth = texture_depth.SampleLevel(sampler_linear_clamp, uv, 0); - if (depth == 0) - return; - const float3 P = reconstruct_position(uv, depth); - const float3 V = normalize(GetCamera().position - P); + const uint downsampleFactor = 2; - PrimitiveID prim; - prim.unpack(texture_gbuffer0[DTid.xy * 2]); + // This is necessary for accurate upscaling. This is so we don't reuse the same half-res pixels + uint2 screenJitter = floor(blue_noise(uint2(0, 0)).xy * downsampleFactor); + uint2 jitterPixel = screenJitter + DTid.xy * downsampleFactor; + float2 jitterUV = (screenJitter + DTid.xy + 0.5f) * postprocess.resolution_rcp; - //output[DTid] = float4(saturate(P * 0.1), 1); - //return; + const float depth = texture_depth.SampleLevel(sampler_linear_clamp, jitterUV, 0); + const float roughness = texture_surface_roughness[jitterPixel]; - Surface surface; - surface.init(); - if (!surface.load(prim, P)) + if (!NeedReflection(roughness, depth)) { - return; - } - if (surface.roughness > 0.6) - { - output[DTid.xy] = float4(max(0, EnvironmentReflection_Global(surface)), 1); + float3 environmentReflection = texture_surface_environment[DTid.xy * downsampleFactor]; + + output_rayIndirectSpecular[DTid.xy] = float4(environmentReflection, 1); + output_rayDirectionPDF[DTid.xy] = 0.0; output_rayLengths[DTid.xy] = FLT_MAX; return; } - float3 N = surface.N; - float roughness = surface.roughness; + const float3 N = texture_surface_normal[jitterPixel]; + const float3 P = reconstruct_position(jitterUV, depth); + const float3 V = normalize(GetCamera().position - P); // The ray direction selection part is the same as in from ssr_raytraceCS.hlsl: float4 H; @@ -217,6 +218,7 @@ void main(uint2 DTid : SV_DispatchThreadID) payload.data.w = q.CommittedRayT(); } - output[DTid.xy] = float4(payload.data.xyz, 1); + output_rayIndirectSpecular[DTid.xy] = float4(payload.data.xyz, 1); + output_rayDirectionPDF[DTid.xy] = float4(L, H.w); output_rayLengths[DTid.xy] = payload.data.w; } diff --git a/WickedEngine/shaders/rtreflectionLIB.hlsl b/WickedEngine/shaders/rtreflectionLIB.hlsl index 3a7f72cb6..742982a2b 100644 --- a/WickedEngine/shaders/rtreflectionLIB.hlsl +++ b/WickedEngine/shaders/rtreflectionLIB.hlsl @@ -10,8 +10,13 @@ PUSHCONSTANT(postprocess, PostProcess); -RWTexture2D output : register(u0); -RWTexture2D output_rayLengths : register(u1); +Texture2D texture_surface_normal : register(t0); +Texture2D texture_surface_roughness : register(t1); +Texture2D texture_surface_environment : register(t2); + +RWTexture2D output_rayIndirectSpecular : register(u0); +RWTexture2D output_rayDirectionPDF : register(u1); +RWTexture2D output_rayLengths : register(u2); struct RayPayload { @@ -30,34 +35,30 @@ void RTReflection_Raygen() { uint2 DTid = DispatchRaysIndex().xy; const float2 uv = ((float2)DTid.xy + 0.5) / (float2)DispatchRaysDimensions(); - const float depth = texture_depth.SampleLevel(sampler_linear_clamp, uv, 0); - if (depth == 0) - return; - const float3 P = reconstruct_position(uv, depth); - const float3 V = normalize(GetCamera().position - P); + const uint downsampleFactor = 2; - PrimitiveID prim; - prim.unpack(texture_gbuffer0[DTid.xy * 2]); + // This is necessary for accurate upscaling. This is so we don't reuse the same half-res pixels + uint2 screenJitter = floor(blue_noise(uint2(0, 0)).xy * downsampleFactor); + uint2 jitterPixel = screenJitter + DTid.xy * downsampleFactor; + float2 jitterUV = (screenJitter + DTid.xy + 0.5f) / (float2)DispatchRaysDimensions(); - //output[DTid] = float4(saturate(P * 0.1), 1); - //return; + const float depth = texture_depth.SampleLevel(sampler_linear_clamp, jitterUV, 0); + const float roughness = texture_surface_roughness[jitterPixel]; - Surface surface; - surface.init(); - if (!surface.load(prim, P)) + if (!NeedReflection(roughness, depth)) { - return; - } - if (surface.roughness > 0.6) - { - output[DTid.xy] = float4(max(0, EnvironmentReflection_Global(surface)), 1); + float3 environmentReflection = texture_surface_environment[DTid.xy * downsampleFactor]; + + output_rayIndirectSpecular[DTid.xy] = float4(environmentReflection, 1); + output_rayDirectionPDF[DTid.xy] = 0.0; output_rayLengths[DTid.xy] = FLT_MAX; return; } - float3 N = surface.N; - float roughness = surface.roughness; + const float3 N = texture_surface_normal[jitterPixel]; + const float3 P = reconstruct_position(jitterUV, depth); + const float3 V = normalize(GetCamera().position - P); // The ray direction selection part is the same as in from ssr_raytraceCS.hlsl: float4 H; @@ -78,7 +79,6 @@ void RTReflection_Raygen() // Tangent to world H.xyz = mul(H.xyz, tangentBasis); - L = reflect(-V, H.xyz); } else @@ -87,7 +87,6 @@ void RTReflection_Raygen() L = reflect(-V, H.xyz); } - const float3 R = L; float seed = GetFrame().time; @@ -112,7 +111,8 @@ void RTReflection_Raygen() payload // Payload ); - output[DTid.xy] = float4(payload.data.xyz, 1); + output_rayIndirectSpecular[DTid.xy] = float4(L, 1); + output_rayDirectionPDF[DTid.xy] = float4(L, H.w); output_rayLengths[DTid.xy] = payload.data.w; } diff --git a/WickedEngine/shaders/ssr_bilateralCS.hlsl b/WickedEngine/shaders/ssr_bilateralCS.hlsl new file mode 100644 index 000000000..df0236666 --- /dev/null +++ b/WickedEngine/shaders/ssr_bilateralCS.hlsl @@ -0,0 +1,106 @@ +#include "globals.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2D texture_temporal : register(t0); +Texture2D texture_resolve_variance : register(t1); +Texture2D texture_surface_normal : register(t2); +Texture2D texture_surface_roughness : register(t3); + +RWTexture2D output : register(u0); + +static const float depthThreshold = 10000.0; +static const float normalThreshold = 1.0; +static const float varianceEstimateThreshold = 0.015; // Larger variance values use stronger blur +static const float varianceExitThreshold = 0.005; // Variance needs to be higher than this value to accept blur +static const uint2 bilateralMinMaxRadius = uint2(0, 2); // Chosen by variance + +#define BILATERAL_SIGMA 0.9 + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ +#if 0 // Debug + output[DTid.xy] = float4((texture_resolve_variance[DTid.xy] > varianceEstimateThreshold).rrr, 1.0); + return; +#endif + + const float depth = texture_depth[DTid.xy]; + const float roughness = texture_surface_roughness[DTid.xy]; + + if (!NeedReflection(roughness, depth)) + { + output[DTid.xy] = texture_temporal[DTid.xy]; + return; + } + + float2 direction = postprocess.params0.xy; + + const float linearDepth = texture_lineardepth[DTid.xy]; + const float3 N = texture_surface_normal[DTid.xy]; + + float4 outputColor = texture_temporal[DTid.xy]; + + + float variance = texture_resolve_variance[DTid.xy]; + bool strongBlur = variance > varianceEstimateThreshold; + + float radius = strongBlur ? bilateralMinMaxRadius.y : bilateralMinMaxRadius.x; + radius = lerp(0.0, radius, saturate(roughness * 8.0)); // roughness 0.125 is destination + + float sigma = radius * BILATERAL_SIGMA; + int effectiveRadius = min(sigma * 2.0, radius); + + if (variance > varianceExitThreshold && effectiveRadius > 0) + { + float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; + float3 P = reconstruct_position(uv, depth); + + float4 result = 0; + float weightSum = 0.0f; + + for (int r = -effectiveRadius; r <= effectiveRadius; r++) + { + const int2 sampleCoord = DTid.xy + (direction * r); // Left to right diameter directionally + + if (all(sampleCoord >= int2(0, 0) && sampleCoord < (int2) postprocess.resolution)) + { + const float sampleDepth = texture_depth[sampleCoord]; + const float4 sampleColor = texture_temporal[sampleCoord]; + + const float3 sampleN = texture_surface_normal[sampleCoord]; + const float sampleRoughness = texture_surface_roughness[sampleCoord]; + + float2 sampleUV = (sampleCoord + 0.5) * postprocess.resolution_rcp; + float3 sampleP = reconstruct_position(sampleUV, sampleDepth); + + // Don't let invalid roughness samples interfere + if (NeedReflection(sampleRoughness, sampleDepth)) + { + float3 dq = P - sampleP; + float planeError = max(abs(dot(dq, sampleN)), abs(dot(dq, N))); + float relativeDepthDifference = planeError / (linearDepth * GetCamera().z_far); + float bilateralDepthWeight = exp(-sqr(relativeDepthDifference) * depthThreshold); + + float normalError = pow(saturate(dot(sampleN, N)), 4.0); + float bilateralNormalWeight = saturate(1.0 - (1.0 - normalError) * normalThreshold); + + float bilateralWeight = bilateralDepthWeight * bilateralNormalWeight; + + float gaussian = exp(-sqr(r / sigma)); + float weight = (r == 0) ? 1.0 : gaussian * bilateralWeight; // Skip center gaussian peak + + result += sampleColor * weight; + weightSum += weight; + } + } + } + + result /= weightSum; + outputColor = result; + } + + output[DTid.xy] = outputColor; +} diff --git a/WickedEngine/shaders/ssr_depthHierarchyCS.hlsl b/WickedEngine/shaders/ssr_depthHierarchyCS.hlsl new file mode 100644 index 000000000..9111560a8 --- /dev/null +++ b/WickedEngine/shaders/ssr_depthHierarchyCS.hlsl @@ -0,0 +1,42 @@ +#include "globals.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2D input : register(t0); + +RWTexture2D output : register(u0); + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ + if (all(DTid.xy < postprocess.params0.xy)) + { + if (postprocess.params0.z == 1) + { + uint2 dim; + texture_depth.GetDimensions(dim.x, dim.y); + + float2 uv = (DTid.xy + 0.5) / dim * 2; // Account for half-res + + float4 depths = texture_depth.GatherRed(sampler_point_clamp, uv); + + float depthMax = max(max(depths.x, depths.y), max(depths.z, depths.w)); + float depthMin = min(min(depths.x, depths.y), min(depths.z, depths.w)); + + output[DTid.xy] = float2(depthMax, depthMin); + } + else + { + float2 uv = (DTid.xy + 0.5) / postprocess.params0.xy; + + float4 depthsRed = input.GatherRed(sampler_point_clamp, uv); + float4 depthsGreen = input.GatherGreen(sampler_point_clamp, uv); + + float depthMax = max(max(depthsRed.x, depthsRed.y), max(depthsRed.z, depthsRed.w)); + float depthMin = min(min(depthsGreen.x, depthsGreen.y), min(depthsGreen.z, depthsGreen.w)); + + output[DTid.xy] = float2(depthMax, depthMin); + } + } +} diff --git a/WickedEngine/shaders/ssr_kickjobsCS.hlsl b/WickedEngine/shaders/ssr_kickjobsCS.hlsl new file mode 100644 index 000000000..8888ac648 --- /dev/null +++ b/WickedEngine/shaders/ssr_kickjobsCS.hlsl @@ -0,0 +1,27 @@ +#include "globals.hlsli" +#include "ShaderInterop_Postprocess.h" + +RWByteAddressBuffer tile_tracing_statistics : register(u0); +RWStructuredBuffer tiles_tracing_earlyexit : register(u1); +RWStructuredBuffer tiles_tracing_cheap : register(u2); +RWStructuredBuffer tiles_tracing_expensive : register(u3); + +[numthreads(1, 1, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ + // Load statistics: + const uint tracing_earlyexit_count = tile_tracing_statistics.Load(TILE_STATISTICS_OFFSET_EARLYEXIT); + const uint tracing_cheap_count = tile_tracing_statistics.Load(TILE_STATISTICS_OFFSET_CHEAP); + const uint tracing_expensive_count = tile_tracing_statistics.Load(TILE_STATISTICS_OFFSET_EXPENSIVE); + + // Reset counters: + tile_tracing_statistics.Store(TILE_STATISTICS_OFFSET_EARLYEXIT, 0); + tile_tracing_statistics.Store(TILE_STATISTICS_OFFSET_CHEAP, 0); + tile_tracing_statistics.Store(TILE_STATISTICS_OFFSET_EXPENSIVE, 0); + + // Create indirect dispatch arguments: + const uint tile_tracing_replicate = sqr(SSR_TILESIZE / 2 / POSTPROCESS_BLOCKSIZE); + tile_tracing_statistics.Store3(INDIRECT_OFFSET_EARLYEXIT, uint3(tracing_earlyexit_count * tile_tracing_replicate, 1, 1)); + tile_tracing_statistics.Store3(INDIRECT_OFFSET_CHEAP, uint3(tracing_cheap_count * tile_tracing_replicate, 1, 1)); + tile_tracing_statistics.Store3(INDIRECT_OFFSET_EXPENSIVE, uint3(tracing_expensive_count * tile_tracing_replicate, 1, 1)); +} diff --git a/WickedEngine/shaders/ssr_medianCS.hlsl b/WickedEngine/shaders/ssr_medianCS.hlsl deleted file mode 100644 index 94f0c9ffc..000000000 --- a/WickedEngine/shaders/ssr_medianCS.hlsl +++ /dev/null @@ -1,66 +0,0 @@ -#include "globals.hlsli" -#include "ShaderInterop_Postprocess.h" - -PUSHCONSTANT(postprocess, PostProcess); - -Texture2D texture_temporal : register(t0); - -RWTexture2D output : register(u0); - -// A Fast, Small-Radius GPU Median Filter by Morgan McGuire -// https://casual-effects.com/research/McGuire2008Median/index.html - -#define s2(a, b) temp = a; a = min(a, b); b = max(temp, b); -#define t2(a, b) s2(v[a], v[b]); -#define t24(a, b, c, d, e, f, g, h) t2(a, b); t2(c, d); t2(e, f); t2(g, h); -#define t25(a, b, c, d, e, f, g, h, i, j) t24(a, b, c, d, e, f, g, h); t2(i, j); - -[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] -void main(uint3 DTid : SV_DispatchThreadID) -{ - if (texture_depth.Load(uint3(DTid.xy, 1)) == 0) - return; - - const float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; - - half4 v[25]; - - // Add the pixels which make up our window to the pixel array. - [unroll] - for (int dX = -2; dX <= 2; ++dX) - { - [unroll] - for (int dY = -2; dY <= 2; ++dY) - { - float2 offset = float2(float(dX), float(dY)); - - // If a pixel in the window is located at (x+dX, y+dY), put it at index (dX + R)(2R + 1) + (dY + R) of the - // pixel array. This will fill the pixel array, with the top left pixel of the window at pixel[0] and the - // bottom right pixel of the window at pixel[N-1]. - v[(dX + 2) * 5 + (dY + 2)] = texture_temporal.SampleLevel(sampler_linear_clamp, uv + offset * postprocess.resolution_rcp, 0); - } - } - - half4 temp; - t25(0, 1, 3, 4, 2, 4, 2, 3, 6, 7); - t25(5, 7, 5, 6, 9, 7, 1, 7, 1, 4); - t25(12, 13, 11, 13, 11, 12, 15, 16, 14, 16); - t25(14, 15, 18, 19, 17, 19, 17, 18, 21, 22); - t25(20, 22, 20, 21, 23, 24, 2, 5, 3, 6); - t25(0, 6, 0, 3, 4, 7, 1, 7, 1, 4); - t25(11, 14, 8, 14, 8, 11, 12, 15, 9, 15); - t25(9, 12, 13, 16, 10, 16, 10, 13, 20, 23); - t25(17, 23, 17, 20, 21, 24, 18, 24, 18, 21); - t25(19, 22, 8, 17, 9, 18, 0, 18, 0, 9); - t25(10, 19, 1, 19, 1, 10, 11, 20, 2, 20); - t25(2, 11, 12, 21, 3, 21, 3, 12, 13, 22); - t25(4, 22, 4, 13, 14, 23, 5, 23, 5, 14); - t25(15, 24, 6, 24, 6, 15, 7, 16, 7, 19); - t25(3, 11, 5, 17, 11, 17, 9, 17, 4, 10); - t25(6, 12, 7, 14, 4, 6, 4, 7, 12, 14); - t25(10, 14, 6, 7, 10, 12, 6, 10, 6, 17); - t25(12, 17, 7, 17, 7, 10, 12, 18, 7, 12); - t24(10, 18, 12, 20, 10, 20, 10, 12); - - output[DTid.xy] = v[12]; -} diff --git a/WickedEngine/shaders/ssr_raytraceCS.hlsl b/WickedEngine/shaders/ssr_raytraceCS.hlsl index 15db58d35..d72cf4abd 100644 --- a/WickedEngine/shaders/ssr_raytraceCS.hlsl +++ b/WickedEngine/shaders/ssr_raytraceCS.hlsl @@ -4,358 +4,390 @@ PUSHCONSTANT(postprocess, PostProcess); -Texture2D input : register(t0); +//#define DEBUG_TILING -RWTexture2D texture_raytrace : register(u0); -RWTexture2D texture_rayLengths : register(u1); +Texture2D texture_surface_normal : register(t0); +Texture2D texture_surface_roughness : register(t1); +Texture2D texture_depth_hierarchy : register(t2); +Texture2D input : register(t3); -static const float rayTraceStrideMin = 1.0f; // Step in horizontal or vertical pixels between samples. -static const float rayTraceStrideMax = 10.0f; // Define max stride between samples. Roughness will interpolate between it's min and max counterparts. -static const float rayTraceMaxStep = 512.0f; // Maximum number of iterations. Higher gives better images but may be slow. -static const float rayTraceThicknessOffset = 0.0f; // Increse or decrease thickness for each pixels in the depth buffer. [- / +] -static const float rayTraceThicknessBias = 1.0f; // Bias to control the growth of the thickness. -static const bool raytraceThicknessInfinite = false; // Use infinite thickness for maximum performance, but may not be suitable for most scenes. -static const float rayTraceMaxDistance = 1000.0f; // Maximum camera-space distance to trace before returning a miss. -static const float rayTraceStrideCutoff = 100.0f; // More distant pixels are smaller in screen space. This value tells at what point to - // start relaxing the stride to give higher quality reflections for objects far from the camera. -static const float raytraceHZBBias = 0.05f; // This value tells how fast the roughness increases the level. -static const float raytraceHZBStartLevel = 1.0f; -static const float raytraceHZBMinStep = 0.005f; // Minimum level increasement per iteration. - - -float DistanceSquared(float2 a, float2 b) -{ - a -= b; - return dot(a, a); -} - -bool IntersectsDepthBuffer(float sceneZMax, float rayZMin, float rayZMax) -{ - // Increase thickness along distance. - float thickness = max(sceneZMax * rayTraceThicknessBias + rayTraceThicknessOffset, 1.0); - -#if 0 // precision issues in DX12 - // Effectively remove line/tiny artifacts, mostly caused by Zbuffers precision. - float depthScale = min(1.0f, sceneZMax / rayTraceStrideCutoff); - sceneZMax += lerp(0.05f, 0.0f, depthScale); -#endif - - if (raytraceThicknessInfinite) - return (rayZMin >= sceneZMax); - else - return (rayZMin >= sceneZMax) && (rayZMax - thickness <= sceneZMax); -} - -// Heavily adapted from McGuire and Mara's original implementation -// http://casual-effects.blogspot.com/2014/08/screen-space-ray-tracing.html -bool ScreenSpaceRayTrace(float3 csOrig, float3 csDir, float jitter, float roughness, out float2 hitPixel, out float3 hitPoint, out float iterations) -{ - csOrig += csDir * 0.001; // precision issues in DX12 - float rayLength = ((csOrig.z + csDir.z * rayTraceMaxDistance) < GetCamera().z_near) ? - (GetCamera().z_near - csOrig.z) / csDir.z : rayTraceMaxDistance; - - float3 csRayEnd = csOrig + csDir * rayLength; - - // Project into homogeneous clip space - float4 clipRayOrigin = mul(GetCamera().projection, float4(csOrig, 1.0f)); - float4 clipRayEnd = mul(GetCamera().projection, float4(csRayEnd, 1.0f)); - - float k0 = 1.0f / clipRayOrigin.w; - float k1 = 1.0f / clipRayEnd.w; - - float3 Q0 = csOrig * k0; - float3 Q1 = csRayEnd * k1; - - // Screen-space endpoints - float2 P0 = clipRayOrigin.xy * k0; - float2 P1 = clipRayEnd.xy * k1; - - // Project to pixel - P0 = P0 * float2(0.5, -0.5) + float2(0.5, 0.5); - P1 = P1 * float2(0.5, -0.5) + float2(0.5, 0.5); - - P0.xy *= postprocess.resolution.xy; - P1.xy *= postprocess.resolution.xy; - -#if 0 - // Clip to the screen coordinates. Alternatively we could just modify rayTraceMaxStep instead - float2 yDelta = float2(postprocess.resolution.y + 2.0f, -2.0f); // - 0.5, 0.5 - float2 xDelta = float2(postprocess.resolution.x + 2.0f, -2.0f); // - 0.5, 0.5 - float alpha = 0.0; - - // P0 must be in bounds - if (P1.y > yDelta.x || P1.y < yDelta.y) - { - float yClip = (P1.y > yDelta.x) ? yDelta.x : yDelta.y; - float yAlpha = (P1.y - yClip) / (P1.y - P0.y); - alpha = yAlpha; - } - - // P1 must be in bounds - if (P1.x > xDelta.x || P1.x < xDelta.y) - { - float xClip = (P1.x > xDelta.x) ? xDelta.x : xDelta.y; - float xAlpha = (P1.x - xClip) / (P1.x - P0.x); - alpha = max(alpha, xAlpha); - } - - // These are all in homogeneous space, so they interpolate linearly - P1 = lerp(P1, P0, alpha); - k1 = lerp(k1, k0, alpha); - Q1 = lerp(Q1, Q0, alpha); -#endif - - // If the line is degenerate, make it cover at least one pixel to avoid handling zero-pixel extent as a special case later - P1 += (DistanceSquared(P0, P1) < 0.0001f) ? float2(0.01f, 0.01f) : 0.0f; - float2 screenOffset = P1 - P0; - - // Permute so that the primary iteration is in x to collapse all quadrant-specific DDA cases later - bool permute = false; - if (abs(screenOffset.x) < abs(screenOffset.y)) - { - permute = true; - screenOffset = screenOffset.yx; - P0 = P0.yx; - P1 = P1.yx; - } - - float stepDirection = sign(screenOffset.x); - float stepInterval = stepDirection / screenOffset.x; - - // Track the derivatives of Q and k - float3 dQ = (Q1 - Q0) * stepInterval; - float dk = (k1 - k0) * stepInterval; - - // Because we test 1/2 a texel forward along the ray, on the very last iteration - // the interpolation can go past the end of the ray. Use these bounds to clamp it. - float zMin = min(csRayEnd.z, csOrig.z); - float zMax = max(csRayEnd.z, csOrig.z); - - float2 dP = float2(stepDirection, screenOffset.y * stepInterval); - - // Scale derivatives by the desired pixel stride and then offset the starting values by the jitter fraction -#if 1 // Stride based on roughness. Matte materials will recieve higher stride - float alphaRoughness = roughness * roughness; - float alphaRoughnessSq = alphaRoughness * alphaRoughness; - - float strideScale = 1.0f - min(1.0f, csOrig.z / rayTraceStrideCutoff); - float strideRoughnessScale = lerp(rayTraceStrideMin, rayTraceStrideMax, min(alphaRoughnessSq, 1.0)); // Climb exponentially at extreme conditions - float stride = 1.0 + strideScale * strideRoughnessScale; +#if defined(SSR_EARLYEXIT) +StructuredBuffer tiles : register(t4); +#elif defined(SSR_CHEAP) +StructuredBuffer tiles : register(t5); #else - float strideScale = 1.0f - min(1.0f, csOrig.z / rayTraceStrideCutoff); - float stride = 1.0 + strideScale * rayTraceStrideMin; -#endif - - dP *= stride; - dQ *= stride; - dk *= stride; +StructuredBuffer tiles : register(t6); +#endif // SSR_EARLYEXIT - P0 += dP * jitter; - Q0 += dQ * jitter; - k0 += dk * jitter; +RWTexture2D output_rayIndirectSpecular : register(u0); +RWTexture2D output_rayDirectionPDF : register(u1); +RWTexture2D output_rayLengths : register(u2); - float4 PQk = float4(P0, Q0.z, k0); - float4 dPQk = float4(dP, dQ.z, dk); - float3 Q = Q0; +static const float traceThickness = 1.5; +static const float blendScreenEdgeFade = 5.0f; - // Adjust end condition for iteration direction - float end = P1.x * stepDirection; +static const float HiZTraceMostDetailedLevel = 0.0; +static const float HiZTraceIterationsMax = 64; - float stepCount = 0.0f; - float level = raytraceHZBStartLevel; +void InitialAdvanceRay(float3 origin, float3 direction, float2 currentMipResolution, float2 currentMipResolution_rcp, float2 floorOffset, float2 uvOffset, out float3 position, out float tCurrent) +{ + float2 currentMipPosition = currentMipResolution * origin.xy; - float prevZMaxEstimate = csOrig.z; - float rayZMin = prevZMaxEstimate; - float rayZMax = prevZMaxEstimate; - float sceneZMax = rayZMax + 100000.0f; - - [loop] - for (; ((PQk.x * stepDirection) <= end) && - (stepCount < rayTraceMaxStep) && - !IntersectsDepthBuffer(sceneZMax, rayZMin, rayZMax) && - (sceneZMax != 0.0f); - PQk += dPQk, stepCount++) - { - if (any(hitPixel < 0.0) || any(hitPixel > 1.0)) - { - return false; - } - - rayZMin = prevZMaxEstimate; - - // Compute the value at 1/2 step into the future - rayZMax = (dPQk.z * 0.5f + PQk.z) / (dPQk.w * 0.5f + PQk.w); - rayZMax = clamp(rayZMax, zMin, zMax); - - prevZMaxEstimate = rayZMax; - - if (rayZMin > rayZMax) - { - float t = rayZMin; - rayZMin = rayZMax; - rayZMax = t; - } + // Intersect ray with the half box that is pointing away from the ray origin. + float2 xyPlane = floor(currentMipPosition) + floorOffset; + xyPlane = xyPlane * currentMipResolution_rcp + uvOffset; - // A simple HZB approach based on roughness - level += max(raytraceHZBBias * roughness, raytraceHZBMinStep); - level = min(level, 6.0f); - - hitPixel = permute ? PQk.yx : PQk.xy; - hitPixel *= postprocess.resolution_rcp; - - sceneZMax = texture_lineardepth.SampleLevel(sampler_linear_clamp, hitPixel, level) * GetCamera().z_far; - } - - // Undo the last increment, which ran after the test variables were set up - //PQk -= dPQk; - //stepCount -= 1.0; - - // Advance Q based on the number of steps - Q.xy += dQ.xy * stepCount; - Q.z = PQk.z; - hitPoint = Q * (1.0f / PQk.w); - iterations = stepCount; - - return IntersectsDepthBuffer(sceneZMax, rayZMin, rayZMax); + // o + d * t = p' => t = (p' - o) / d + float2 t = (xyPlane - origin.xy) / direction.xy; + tCurrent = min(t.x, t.y); + position = origin + tCurrent * direction; } - -[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] -void main(uint3 DTid : SV_DispatchThreadID) +bool AdvanceRay(float3 origin, float3 direction, float2 currentMipPosition, float2 currentMipResolution_rcp, float2 floorOffset, float2 uvOffset, float surfaceZ, inout float3 position, inout float tCurrent) { - const float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; - const float depth = texture_depth.SampleLevel(sampler_linear_clamp, uv, 1); - if (depth == 0) - return; + // Create boundary planes + float2 xyPlane = floor(currentMipPosition) + floorOffset; + xyPlane = xyPlane * currentMipResolution_rcp + uvOffset; + float3 boundaryPlanes = float3(xyPlane, surfaceZ); - PrimitiveID prim; - prim.unpack(texture_gbuffer0[DTid.xy * 2]); + // Intersect ray with the half box that is pointing away from the ray origin. + // o + d * t = p' => t = (p' - o) / d + float3 t = (boundaryPlanes - origin) / direction; - Surface surface; - surface.init(); - if (!surface.load(prim, reconstruct_position(uv, depth))) + // Prevent using z plane when shooting out of the depth buffer. + t.z = direction.z < 0 ? t.z : FLT_MAX; + + // Choose nearest intersection with a boundary. + float tMin = min(min(t.x, t.y), t.z); + + // Larger z means closer to the camera. + bool aboveSurface = surfaceZ < position.z; + + // Decide whether we are able to advance the ray until we hit the xy boundaries or if we had to clamp it at the surface. + // We use the asuint comparison to avoid NaN / Inf logic, also we actually care about bitwise equality here to see if t_min is the t.z we fed into the min3 above. + bool skippedTile = asuint(tMin) != asuint(t.z) && aboveSurface; + + // Make sure to only advance the ray if we're still above the surface. + tCurrent = aboveSurface ? tMin : tCurrent; + + // Advance ray + position = origin + tCurrent * direction; + + return skippedTile; +} + +float2 GetMipResolution(float2 screenDimensions, int mipLevel) +{ + return screenDimensions * pow(0.5, mipLevel); +} + +// Based on: https://github.com/GPUOpen-Effects/FidelityFX-SSSR/tree/master +// Requires origin and direction of the ray to be in screen space [0, 1] x [0, 1] +float3 HierarchicalRaymarch(float3 origin, float3 direction, float2 screenSize, out bool validHit) +{ + // Start on mip with highest detail. + int currentMip = HiZTraceMostDetailedLevel; + + // Could recompute these every iteration, but it's faster to hoist them out and update them. + float2 currentMipResolution = GetMipResolution(screenSize, currentMip); + float2 currentMipResolution_rcp = rcp(currentMipResolution); + + // Offset to the bounding boxes uv space to intersect the ray with the center of the next pixel. + // This means we ever so slightly over shoot into the next region. + float2 uvOffset = 0.005 * exp2(HiZTraceMostDetailedLevel) / screenSize; + uvOffset = direction.xy < 0 ? -uvOffset : uvOffset; + + // Offset applied depending on current mip resolution to move the boundary to the left/right upper/lower border depending on ray direction. + float2 floorOffset = direction.xy < 0 ? 0 : 1; + + // Initially advance ray to avoid immediate self intersections. + float tCurrent; + float3 position; + InitialAdvanceRay(origin, direction, currentMipResolution, currentMipResolution_rcp, floorOffset, uvOffset, position, tCurrent); + + int i = 0; + while (i < HiZTraceIterationsMax && currentMip >= HiZTraceMostDetailedLevel) { - return; + if (any(position.xy < 0.0) || any(position.xy > 1.0)) + { + validHit = false; + return position; + } + + float2 currentMipPosition = currentMipResolution * position.xy; + float surfaceZ = texture_depth_hierarchy.Load(int3(currentMipPosition, currentMip)).r; + + bool skippedTile = AdvanceRay(origin, direction, currentMipPosition, currentMipResolution_rcp, floorOffset, uvOffset, surfaceZ, position, tCurrent); + + currentMip += skippedTile ? 1 : -1; + currentMipResolution *= skippedTile ? 0.5 : 2; + currentMipResolution_rcp *= skippedTile ? 2 : 0.5; + + i++; } - if (surface.roughness > 0.6) + + validHit = (i <= HiZTraceIterationsMax); + + return position; +} + +static const uint rayMarchIterationsMax = 60; // primary ray march step count (higher will find more in distance, but slower) +static const float rayMarchStepIncrease = 1.05f; // primary ray march step increase (higher will travel more distance, but can miss details) +static const uint rayMarchFineIterationsMax = 2; // binary step count (higher is nicer but slower) +static const float rayMarchTolerance = 0.000002; // early exit factor for binary search (smaller is nicer but slower) +static const float rayMarchLevelIncrement = 0.3; // level increment based on ray travel distance and roughness (higher values improves performance, but traces at lower resolution) + +// samplePos where ray march left of +float3 BinarySearch(float3 samplePos, float3 V, float level) +{ + for (uint i = 0; i < rayMarchFineIterationsMax; i++) { - texture_raytrace[DTid.xy] = 0; - texture_rayLengths[DTid.xy] = 0; + float sampleDepth = texture_depth_hierarchy.SampleLevel(sampler_point_clamp, samplePos.xy, level).g; + + if (abs(samplePos.z - sampleDepth) < rayMarchTolerance) + { + return samplePos; + } + + if (samplePos.z >= sampleDepth) + { + samplePos += V; + } + + V *= 0.5f; + samplePos -= V; + } + + return samplePos; +} + +// P and V in screen space [0, 1] x [0, 1] +float3 RayMarch(float3 P, float3 V, float roughness, float jitter, out bool validHit) +{ + float3 samplePos = P + V * jitter; + + float sampleDepth = 0; + float level = 1; + + uint iterations = 0; + while (iterations <= rayMarchIterationsMax) + { + if (any(samplePos.xy < 0.0) || any(samplePos.xy > 1.0)) + { + validHit = false; + return samplePos; + } + + samplePos += V; + + sampleDepth = texture_depth_hierarchy.SampleLevel(sampler_point_clamp, samplePos.xy, level).g; + + if (sampleDepth > samplePos.z) + { + samplePos = BinarySearch(samplePos, V, level); + break; + } + + V *= rayMarchStepIncrease; + level += rayMarchLevelIncrement * roughness; + + iterations++; + } + + validHit = (iterations <= rayMarchIterationsMax); + return float3(samplePos.xy, sampleDepth); +} + +float CalculateEdgeVignette(float2 hitPixel) +{ + float2 hitPixelNDC = hitPixel * 2.0 - 1.0; + + //float maxDimension = min(1.0, max(abs(hitPixelNDC.x), abs(hitPixelNDC.y))); + //float attenuation = 1.0 - max(0.0, maxDimension - blendScreenEdgeFade) / (1.0 - blendScreenEdgeFade); + + float2 vignette = saturate(abs(hitPixelNDC) * blendScreenEdgeFade - (blendScreenEdgeFade - 1.0f)); + float attenuation = saturate(1.0 - dot(vignette, vignette)); + + return attenuation; +} + +float ValidateHit(float3 hit, float hitDepth, float2 prevHitUV) +{ + float vignetteHit = CalculateEdgeVignette(hit.xy); + float vignetteHitPrev = CalculateEdgeVignette(prevHitUV); + float vignette = min(vignetteHit, vignetteHitPrev); + + float3 surfaceViewPosition = reconstruct_position(hit.xy, hitDepth, GetCamera().inverse_projection); + float3 hitViewPosition = reconstruct_position(hit.xy, hit.z, GetCamera().inverse_projection); + + float distance = length(surfaceViewPosition - hitViewPosition); + float confidence = 1.0 - smoothstep(0.0, traceThickness, distance); + + return vignette * confidence; +} + +[numthreads(POSTPROCESS_BLOCKSIZE * POSTPROCESS_BLOCKSIZE, 1, 1)] +void main(uint3 Gid : SV_GroupID, uint3 GTid : SV_GroupThreadID) +{ + // This pass is rendered at half-res + const uint downsampleFactor = 2; + + const uint2 pixel = GetReflectionIndirectDispatchCoord(Gid, GTid, tiles, downsampleFactor); + const float2 uv = (pixel + 0.5f) * postprocess.resolution_rcp; + +#ifdef SSR_EARLYEXIT + + output_rayIndirectSpecular[pixel] = 0; + output_rayDirectionPDF[pixel] = 0; + output_rayLengths[pixel] = 0; + +#else + + // This is necessary for accurate upscaling. This is so we don't reuse the same half-res pixels + uint2 screenJitter = floor(blue_noise(uint2(0, 0)).xy * downsampleFactor); + uint2 jitterPixel = screenJitter + pixel * downsampleFactor; + float2 jitterUV = (screenJitter + pixel + 0.5f) * postprocess.resolution_rcp; + + // Due to HiZ tracing, the tracing and the pass components must match depth. + float depth = texture_depth_hierarchy[screenJitter + pixel].r; + float roughness = texture_surface_roughness[jitterPixel]; + + if (!NeedReflection(roughness, depth)) + { + output_rayIndirectSpecular[pixel] = 0.0; + output_rayDirectionPDF[pixel] = 0.0; + output_rayLengths[pixel] = 0.0; return; } - // Everything in view space: - float3 N = normalize(mul((float3x3)GetCamera().view, surface.N)); - float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection); - float3 V = normalize(-P); - const float roughness = GetRoughness(surface.roughness); - - const float roughnessFade = GetRoughnessFade(roughness, SSRMaxRoughness); - if (roughnessFade <= 0) - { - texture_raytrace[DTid.xy] = 0; - return; - } - + float3 N = texture_surface_normal[jitterPixel]; + float3 P = reconstruct_position(jitterUV, depth); + float3 V = normalize(GetCamera().position - P); + float4 H; float3 L; - float jitter; if (roughness > 0.05f) { float3x3 tangentBasis = GetTangentBasis(N); float3 tangentV = mul(tangentBasis, V); #ifdef GGX_SAMPLE_VISIBLE - -#if 1 - const float2 bluenoise = blue_noise(DTid.xy).xy; + + const float2 bluenoise = blue_noise(pixel).xy; float2 Xi = bluenoise.xy; - + Xi.y = lerp(Xi.y, 0.0f, GGX_IMPORTANCE_SAMPLE_BIAS); H = ImportanceSampleVisibleGGX(SampleDisk(Xi), roughness, tangentV); - -#else // Old - - // Low-discrepancy sequence - uint2 Random = Rand_PCG16(int3((DTid.xy + 0.5f), GetFrame().frame_count)).xy; - - float2 Xi = HammersleyRandom16(1, Random); // SingleSPP - - Xi.y = lerp(Xi.y, 0.0f, GGX_IMPORTANCE_SAMPLE_BIAS); - - H = ImportanceSampleVisibleGGX(SampleDisk(Xi), roughness, tangentV); - -#endif - // Tangent to world + // Tangent to world H.xyz = mul(H.xyz, tangentBasis); - + #else - + const float surfaceMargin = 0.0f; const float maxRegenCount = 15.0f; - - uint2 Random = Rand_PCG16(int3((DTid.xy + 0.5f), GetFrame().frame_count)).xy; - - // By using an uniform importance sampling method, some rays go below the surface. - // We simply re-generate them at a negligible cost, to get some nice ones. - + + // By using an uniform importance sampling method, some rays go below the surface. + // We simply re-generate them at a negligible cost, to get some nice ones. + float RdotN = 0.0f; float regenCount = 0; - [loop] + [loop] for (; RdotN <= surfaceMargin && regenCount < maxRegenCount; regenCount++) { - // Low-discrepancy sequence - //float2 Xi = float2(Random) * rcp(65536.0); // equivalent to HammersleyRandom(0, 1, Random). - float2 Xi = HammersleyRandom16(regenCount, Random); // SingleSPP - + // Low-discrepancy sequence + const float2 bluenoise = blue_noise(pixel, regenCount).xy; + + float2 Xi = bluenoise.xy; + Xi.y = lerp(Xi.y, 0.0, GGX_IMPORTANCE_SAMPLE_BIAS); - + H = ImportanceSampleGGX(Xi, roughness); - - // Tangent to world + + // Tangent to world H.xyz = mul(H.xyz, tangentBasis); - + RdotN = dot(N, reflect(-V, H.xyz)); } - -#endif - + +#endif // GGX_SAMPLE_VISIBLE + L = reflect(-V, H.xyz); - jitter = InterleavedGradientNoise(DTid.xy, GetFrame().frame_count); } else { H = float4(N.xyz, 1.0f); L = reflect(-V, H.xyz); - jitter = 0; } - - float2 hitPixel = float2(0.0f, 0.0f); - float3 hitPoint = float3(0.0f, 0.0f, 0.0f); - float iterations = 0.0f; - bool hit = ScreenSpaceRayTrace(P, L, jitter, roughness, hitPixel, hitPoint, iterations); + float4 rayStartClip = mul(GetCamera().view_projection, float4(P, 1)); // World to Clip + float4 rayEndClip = mul(GetCamera().view_projection, float4(P + L, 1)); + float3 rayStartScreen = rayStartClip.xyz * rcp(rayStartClip.w); + float3 rayEndScreen = rayEndClip.xyz * rcp(rayEndClip.w); - float hitDepth = texture_depth.SampleLevel(sampler_linear_clamp, hitPixel, 1); + rayStartScreen.xy = rayStartScreen.xy * float2(0.5, -0.5) + float2(0.5, 0.5); + rayEndScreen.xy = rayEndScreen.xy * float2(0.5, -0.5) + float2(0.5, 0.5); - // Output: - // xy: hit pixel - // z: hit depth - // w: pdf - float4 raytrace = max(0, float4(hitPixel, hitDepth, H.w)); - texture_raytrace[DTid.xy] = raytrace; +#ifdef SSR_CHEAP - if (hit) + rayStartScreen.xy *= postprocess.params1.xy; // Ratio factor between hierarchy and pass + rayEndScreen.xy *= postprocess.params1.xy; + + float3 rayDirectionScreen = rayEndScreen - rayStartScreen; + + // The ray marching benefits from jittering to create a smoother transition between samples and LOD + float jitter = InterleavedGradientNoise(pixel, GetFrame().frame_count % 8u); // Temporally stabilize + + bool validHit = false; + float3 hit = RayMarch(rayStartScreen, rayDirectionScreen, roughness, jitter, validHit); + + hit.xy *= postprocess.params1.zw; // Undo ratio + +#else + + float3 rayDirectionScreen = rayEndScreen - rayStartScreen; + + bool validHit = false; + float3 hit = HierarchicalRaymarch(rayStartScreen, rayDirectionScreen, postprocess.resolution, validHit); + +#endif // SSR_CHEAP + + float2 prevHitUV = texture_gbuffer1.SampleLevel(sampler_point_clamp, hit.xy, 0).xy + hit.xy; + + float hitDepth = texture_depth.SampleLevel(sampler_point_clamp, hit.xy, 0); + float confidence = validHit ? ValidateHit(hit, hitDepth, prevHitUV) : 0; + + float4 indirectSpecular; + indirectSpecular.rgb = confidence > 0 ? input.SampleLevel(sampler_point_clamp, prevHitUV, 0).rgb : 0; + indirectSpecular.a = confidence; + + output_rayIndirectSpecular[pixel] = indirectSpecular; + + output_rayDirectionPDF[pixel] = float4(L, H.w); + + if (validHit) { - const float3 Phit = reconstruct_position(uv, hitDepth, GetCamera().inverse_projection); - texture_rayLengths[DTid.xy] = distance(P, Phit); + const float3 Phit = reconstruct_position(jitterUV, hit.z, GetCamera().inverse_projection); + output_rayLengths[pixel] = distance(P, Phit); } else { - texture_rayLengths[DTid.xy] = 0; + output_rayLengths[pixel] = 0; } + +#endif // SSR_EARLYEXIT + +#ifdef DEBUG_TILING + float3 color = input[pixel].rgb; +#if defined(SSR_EARLYEXIT) + output_rayIndirectSpecular[pixel] = float4(lerp(color, float3(0, 0, 1), 0.5f), 1.0); +#elif defined(SSR_CHEAP) + output_rayIndirectSpecular[pixel] = float4(lerp(color, float3(0, 1, 0), 0.5f), 1.0); +#else + output_rayIndirectSpecular[pixel] = float4(lerp(color, float3(1, 0, 0), 0.5f), 1.0); +#endif // SSR_EARLYEXIT +#endif // DEBUG_TILING } diff --git a/WickedEngine/shaders/ssr_raytraceCS_cheap.hlsl b/WickedEngine/shaders/ssr_raytraceCS_cheap.hlsl new file mode 100644 index 000000000..c3e5e0099 --- /dev/null +++ b/WickedEngine/shaders/ssr_raytraceCS_cheap.hlsl @@ -0,0 +1,2 @@ +#define SSR_CHEAP +#include "ssr_raytraceCS.hlsl" diff --git a/WickedEngine/shaders/ssr_raytraceCS_earlyexit.hlsl b/WickedEngine/shaders/ssr_raytraceCS_earlyexit.hlsl new file mode 100644 index 000000000..f486e9992 --- /dev/null +++ b/WickedEngine/shaders/ssr_raytraceCS_earlyexit.hlsl @@ -0,0 +1,2 @@ +#define SSR_EARLYEXIT +#include "ssr_raytraceCS.hlsl" diff --git a/WickedEngine/shaders/ssr_resolveCS.hlsl b/WickedEngine/shaders/ssr_resolveCS.hlsl index 39e5aa590..816da6a56 100644 --- a/WickedEngine/shaders/ssr_resolveCS.hlsl +++ b/WickedEngine/shaders/ssr_resolveCS.hlsl @@ -5,221 +5,170 @@ PUSHCONSTANT(postprocess, PostProcess); -Texture2D texture_raytrace : register(t0); -Texture2D texture_main : register(t1); +Texture2D texture_surface_normal : register(t0); +Texture2D texture_surface_roughness : register(t1); +Texture2D texture_rayIndirectSpecular : register(t2); +Texture2D texture_rayDirectionPDF : register(t3); +Texture2D texture_rayLength : register(t4); RWTexture2D texture_resolve : register(u0); +RWTexture2D texture_resolve_variance : register(u1); +RWTexture2D texture_reprojectionDepth : register(u2); +static const float2 resolveSpatialSizeMinMax = float2(2.0, 8.0); // Good to have a min size as downsample scale (2x in this case) +static const uint resolveSpatialReconstructionCount = 4.0f; -static const float2 spatialReuseOffsets3x3[9] = +float GetWeight(int2 neighborTracingCoord, float3 V, float3 N, float roughness, float NdotV) { - float2(0.0, 0.0), - float2(0.0, 1.0), - float2(1.0, -1.0), - float2(-1.0, -1.0), - float2(-1.0, 0.0), - float2(0.0, -1.0), - float2(1.0, 0.0), - float2(-1.0, 1.0), - float2(1.0, 1.0) -}; + // Sample local pixel information + float4 rayDirectionPDF = texture_rayDirectionPDF[neighborTracingCoord]; + float3 rayDirection = rayDirectionPDF.rgb; + float PDF = rayDirectionPDF.a; -// Not in use, but could perhaps be useful in the future. -/*float2 CalculateTailDirection(float3 viewNormal) -{ - float3 upVector = abs(viewNormal.z) < 0.999 ? float3(0.0, 0.0, 1.0) : float3(1.0, 0.0, 0.0); - float3 T = normalize(cross(upVector, viewNormal)); + float3 sampleL = normalize(rayDirection); + float3 sampleH = normalize(sampleL + V); - float tailDirection = T.x * -viewNormal.y; - - return lerp(float2(1.0, 0.1), float2(0.1, 1.0), tailDirection); -}*/ + float sampleNdotH = saturate(dot(N, sampleH)); + float sampleNdotL = saturate(dot(N, sampleL)); -float CalculateEdgeFade(float2 hitPixel) -{ - float2 hitPixelNDC = hitPixel * 2.0 - 1.0; - - //float maxDimension = min(1.0, max(abs(hitPixelNDC.x), abs(hitPixelNDC.y))); - //float attenuation = 1.0 - max(0.0, maxDimension - blendScreenEdgeFade) / (1.0 - blendScreenEdgeFade); + float roughnessBRDF = roughness * roughness; - float2 vignette = saturate(abs(hitPixelNDC) * SSRBlendScreenEdgeFade - (SSRBlendScreenEdgeFade - 1.0f)); - float attenuation = saturate(1.0 - dot(vignette, vignette)); - - return attenuation; + float Vis = V_SmithGGXCorrelated(roughnessBRDF, NdotV, sampleNdotL); + float D = D_GGX(roughnessBRDF, sampleNdotH, sampleH); + float localBRDF = Vis * D * sampleNdotL; + + float weight = localBRDF / max(PDF, 0.00001f); + + return weight; } -void GetSampleInfo(float2 velocity, float2 neighborUV, float2 uv, float3 P, float3 V, float3 N, float NdotV, float specularConeTangent, float roughness, out float4 sampleColor, out float weight) +// Weighted incremental variance +// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +void GetWeightedVariance(float4 sampleColor, float weight, float weightSum, inout float mean, inout float S) { - // Sample local pixel information - float4 raytraceSource = texture_raytrace.SampleLevel(sampler_point_clamp, neighborUV, 0); - - float2 hitPixel = raytraceSource.xy + velocity; - float hitDepth = raytraceSource.z; - float hitPDF = raytraceSource.w; + float luminance = Luminance(sampleColor.rgb); + float oldMean = mean; + mean += weight / weightSum * (luminance - oldMean); + S += weight * (luminance - oldMean) * (luminance - mean); +} - float intersectionCircleRadius = specularConeTangent * length(hitPixel - uv); - float sourceMip = clamp(log2(intersectionCircleRadius * ssr_input_resolution_max), 0.0, ssr_input_maxmip) * SSRResolveConeMip; - - sampleColor.rgb = texture_main.SampleLevel(sampler_linear_clamp, hitPixel, sourceMip).rgb; // Scene color - sampleColor.a = CalculateEdgeFade(raytraceSource.xy); // Opacity - Since this is used for masking, we can ignore velocity - - // BRDF Weight - - float3 hitViewPosition = reconstruct_position(hitPixel, hitDepth, GetCamera().inverse_projection); - - float3 L = normalize(hitViewPosition - P); - float3 H = normalize(L + V); +// modified from 'globals.hlsli' with random shift +// idx : iteration index +// num : number of iterations in total +// random : 16 bit random sequence +inline float2 hammersley2d_random(uint idx, uint num, uint2 random) +{ + uint bits = idx; + bits = (bits << 16u) | (bits >> 16u); + bits = ((bits & 0x55555555u) << 1u) | ((bits & 0xAAAAAAAAu) >> 1u); + bits = ((bits & 0x33333333u) << 2u) | ((bits & 0xCCCCCCCCu) >> 2u); + bits = ((bits & 0x0F0F0F0Fu) << 4u) | ((bits & 0xF0F0F0F0u) >> 4u); + bits = ((bits & 0x00FF00FFu) << 8u) | ((bits & 0xFF00FF00u) >> 8u); + const float radicalInverse_VdC = float(bits ^ random.y) * 2.3283064365386963e-10; // / 0x100000000 - float NdotH = saturate(dot(N, H)); - float NdotL = saturate(dot(N, L)); - - Surface surface; - surface.init(); - surface.roughnessBRDF = roughness * roughness; - surface.NdotV = NdotV; - - SurfaceToLight surfaceToLight; - surfaceToLight.NdotH = NdotH; - surfaceToLight.NdotL = NdotL; - - // Calculate BRDF where Fresnel = 1 - float Vis = V_SmithGGXCorrelated(surface.roughnessBRDF, surface.NdotV, surfaceToLight.NdotL); - float D = D_GGX(surface.roughnessBRDF, surfaceToLight.NdotH, surfaceToLight.H); - float specularLight = Vis * D * PI / 4.0; + // ... & 0xffff) / (1 << 16): limit to 65536 then range 0 - 1 + return float2(frac(float(idx) / float(num) + float(random.x & 0xffff) / (1 << 16)), radicalInverse_VdC); // frac since we only want range [0; 1[ +} - weight = specularLight / max(hitPDF, 0.00001f); +uint baseHash(uint3 p) +{ + p = 1103515245u * ((p.xyz >> 1u) ^ (p.yzx)); + uint h32 = 1103515245u * ((p.x ^ p.z) ^ (p.y >> 3u)); + return h32 ^ (h32 >> 16); +} + +// Great quality hash with 3D input +// based on: https://www.shadertoy.com/view/Xt3cDn +uint3 hash33(uint3 x) +{ + uint n = baseHash(x); + return uint3(n, n * 16807u, n * 48271u); //see: http://random.mat.sbg.ac.at/results/karl/server/node4.html +} + +// Computes post-projection depth from linear depth +float getInverseLinearDepth(float lin, float near, float far) +{ + float z_n = ((lin - 2 * far) * near + far * lin) / (lin * near - far * lin); + float z = (z_n + 1) / 2; + return z; } [numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] void main(uint3 DTid : SV_DispatchThreadID) { const float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; - const float depth = texture_depth.SampleLevel(sampler_linear_clamp, uv, 0); - if (depth == 0.0f) - return; + const uint2 tracingCoord = DTid.xy / 2; - // Everthing in view space: - const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection); - const float3 V = normalize(-P); + const float depth = texture_depth[DTid.xy]; + const float roughness = texture_surface_roughness[DTid.xy]; - PrimitiveID prim; - prim.unpack(texture_gbuffer0[DTid.xy * 2]); - - Surface surface; - surface.init(); - if (!surface.load(prim, P)) + if (!NeedReflection(roughness, depth)) { + texture_resolve[DTid.xy] = texture_rayIndirectSpecular[tracingCoord]; + texture_resolve_variance[DTid.xy] = 0.0; + texture_reprojectionDepth[DTid.xy] = 0.0; return; } - const float3 N = normalize(mul((float3x3)GetCamera().view, surface.N)); - const float roughness = GetRoughness(surface.roughness); - + // Everthing in world space: + const float3 P = reconstruct_position(uv, depth); + const float3 N = texture_surface_normal[DTid.xy]; + const float3 V = normalize(GetCamera().position - P); const float NdotV = saturate(dot(N, V)); - const float2 velocity = texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy; - const float2 prevUV = uv + velocity; + const float resolveSpatialScale = saturate(roughness * 5.0); // roughness 0.2 is destination + const float2 resolveSpatialSize = lerp(resolveSpatialSizeMinMax.x, resolveSpatialSizeMinMax.y, resolveSpatialScale); - // Early out, useless if the roughness is out of range - float roughnessFade = GetRoughnessFade(roughness, SSRMaxRoughness); - if (roughnessFade <= 0.0f) - { - texture_resolve[DTid.xy] = 0; - return; - } - - // Since we aren't importance sampling in this range, no need to resolve - if (roughness < 0.05f) - { - float4 raytraceSource = texture_raytrace.SampleLevel(sampler_point_clamp, uv, 0); - float2 hitPixel = raytraceSource.xy + velocity; - - float4 sampleColor; - sampleColor.rgb = texture_main.SampleLevel(sampler_linear_clamp, hitPixel, 0).rgb; // Scene color - sampleColor.a = CalculateEdgeFade(raytraceSource.xy); // Opacity - - texture_resolve[DTid.xy] = sampleColor; - return; - } - - - // Cone mip sampling - float specularConeTangent = lerp(0.0, roughness * (1.0 - GGX_IMPORTANCE_SAMPLE_BIAS), NdotV * sqrt(roughness)); - specularConeTangent *= lerp(saturate(NdotV * 2), 1.0f, sqrt(roughness)); - - -#if 1 // EAW spatial resolve - - float4 result = 0.0f; float weightSum = 0.0f; - -#define BLOCK_SAMPLE_RADIUS 1 - - [unroll] - for (int y = -BLOCK_SAMPLE_RADIUS; y <= BLOCK_SAMPLE_RADIUS; y++) + + float mean = 0.0f; + float S = 0.0f; + + float closestRayLength = 0.0f; + + const uint sampleCount = resolveSpatialReconstructionCount; + const uint2 random = hash33(uint3(DTid.xy, GetFrame().frame_count)).xy; + + for (int i = 0; i < sampleCount; i++) { - [loop] - for (int x = -BLOCK_SAMPLE_RADIUS; x <= BLOCK_SAMPLE_RADIUS; x++) + float2 offset = (hammersley2d_random(i, sampleCount, random) - 0.5) * resolveSpatialSize; + + int2 neighborTracingCoord = tracingCoord + offset; + int2 neighborCoord = DTid.xy + offset; + + float neighborDepth = texture_depth[neighborCoord]; + if (neighborDepth > 0.0) { - if (uint(abs(x) + abs(y)) % 2 == 0) - continue; - - float2 offsetUV = float2(x, y) * postprocess.resolution_rcp * SSRResolveSpatialSize; - float2 neighborUV = uv + offsetUV; - - float4 sampleColor; - float weight; - GetSampleInfo(velocity, neighborUV, uv, P, V, N, NdotV, specularConeTangent, roughness, sampleColor, weight); - + float weight = GetWeight(neighborTracingCoord, V, N, roughness, NdotV); + + float4 sampleColor = texture_rayIndirectSpecular[neighborTracingCoord]; sampleColor.rgb *= rcp(1 + Luminance(sampleColor.rgb)); - + result += sampleColor * weight; weightSum += weight; + + GetWeightedVariance(sampleColor, weight, weightSum, mean, S); + + if (weight > 0.001) + { + float neighborRayLength = texture_rayLength[neighborTracingCoord]; + closestRayLength = max(closestRayLength, neighborRayLength); + } } } - result /= weightSum; - - result.rgb *= rcp(1 - Luminance(result.rgb)); - -#undef BLOCK_SAMPLE_RADIUS - - -#else // Frostbite presentation, spatial resolve - - float4 result = 0.0f; - float weightSum = 0.0f; - -#define NUM_RESOLVE 4 // Four samples to achieve effective ray reuse patterns - - [unroll] - for (uint i = 0; i < NUM_RESOLVE; i++) - { - float2 offsetUV = spatialReuseOffsets3x3[i] * postprocess.resolution_rcp * SSRResolveSpatialSize; - float2 neighborUV = uv + offsetUV; - - float4 sampleColor; - float weight; - GetSampleInfo(velocity, neighborUV, uv, P, V, N, NdotV, specularConeTangent, roughness, sampleColor, weight); - - sampleColor.rgb *= rcp( 1 + Luminance(sampleColor.rgb) ); - - result += sampleColor * weight; - weightSum += weight; - } result /= weightSum; - - result.rgb *= rcp( 1 - Luminance(result.rgb) ); - -#undef NUM_RESOLVE - - -#endif - - - result *= roughnessFade; - result *= SSRIntensity; - + result.rgb *= rcp(1 - Luminance(result.rgb)); + + // Population variance + float resolveVariance = S / weightSum; + + // Convert to post-projection depth so we can construct dual source reprojection buffers later + const float lineardepth = texture_lineardepth[DTid.xy] * GetCamera().z_far; + float reprojectionDepth = getInverseLinearDepth(lineardepth + closestRayLength, GetCamera().z_near, GetCamera().z_far); + texture_resolve[DTid.xy] = max(result, 0.00001f); + texture_resolve_variance[DTid.xy] = resolveVariance; + texture_reprojectionDepth[DTid.xy] = reprojectionDepth; } diff --git a/WickedEngine/shaders/ssr_surfaceCS.hlsl b/WickedEngine/shaders/ssr_surfaceCS.hlsl new file mode 100644 index 000000000..fa5c3ab17 --- /dev/null +++ b/WickedEngine/shaders/ssr_surfaceCS.hlsl @@ -0,0 +1,51 @@ +#include "globals.hlsli" +#include "brdf.hlsli" +#include "lightingHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +RWTexture2D output_surface_normal : register(u0); +RWTexture2D output_surface_roughness : register(u1); +RWTexture2D output_surface_environment : register(u2); + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ + uint2 dim; + texture_depth.GetDimensions(dim.x, dim.y); + + float2 uv = (DTid.xy + 0.5f) / dim; + + float depth = texture_depth[DTid.xy]; + if (depth == 0.0) + { + output_surface_normal[DTid.xy] = 0.0; + output_surface_roughness[DTid.xy] = 0.0; + output_surface_environment[DTid.xy] = 0.0; + return; + } + + uint2 primitiveID = texture_gbuffer0[DTid.xy]; // Map to resolution + + PrimitiveID prim; + prim.unpack(primitiveID); + + Surface surface; + surface.init(); + if (!surface.load(prim, reconstruct_position(uv, depth))) + { + output_surface_normal[DTid.xy] = 0.0; + output_surface_roughness[DTid.xy] = 0.0; + output_surface_environment[DTid.xy] = 0.0; + return; + } + + float3 N = surface.N; + float roughness = surface.roughness; + float3 environmentReflection = EnvironmentReflection_Global(surface); + + output_surface_normal[DTid.xy] = N; + output_surface_roughness[DTid.xy] = roughness; + output_surface_environment[DTid.xy] = environmentReflection; +} diff --git a/WickedEngine/shaders/ssr_temporalCS.hlsl b/WickedEngine/shaders/ssr_temporalCS.hlsl index 1a55190f6..2b362b9ff 100644 --- a/WickedEngine/shaders/ssr_temporalCS.hlsl +++ b/WickedEngine/shaders/ssr_temporalCS.hlsl @@ -4,177 +4,236 @@ PUSHCONSTANT(postprocess, PostProcess); -Texture2D resolve_current : register(t0); -Texture2D resolve_history : register(t1); -Texture2D rayLengths : register(t3); +Texture2D texture_surface_roughness : register(t0); +Texture2D texture_color_current : register(t1); +Texture2D texture_color_history : register(t2); +Texture2D texture_variance_current : register(t3); +Texture2D texture_variance_history : register(t4); +Texture2D texture_reprojectionDepth : register(t5); -RWTexture2D output : register(u0); +RWTexture2D output_color : register(u0); +RWTexture2D output_variance : register(u1); -static const float temporalResponseMin = 0.75; -static const float temporalResponseMax = 0.95f; -static const float temporalScale = 3.0; -static const float temporalExposure = 10.0f; +static const float temporalResponse = 0.95; +static const float temporalScale = 2.0; +static const float disocclusionDepthWeight = 1.0f; +static const float disocclusionThreshold = 0.9f; +static const float varianceTemporalResponse = 0.9f; -inline float Luma4(float3 color) +float2 CalculateReprojectionBuffer(float2 uv, float depth) { - return (color.g * 2) + (color.r + color.b); + float x = uv.x * 2 - 1; + float y = (1 - uv.y) * 2 - 1; + float2 screenPosition = float2(x, y); + + float4 thisClip = float4(screenPosition, depth, 1); + + float4 prevClip = mul(GetCamera().inverse_view_projection, thisClip); + prevClip = mul(GetCamera().previous_view_projection, prevClip); + + float2 prevScreen = prevClip.xy / prevClip.w; + + float2 screenVelocity = screenPosition - prevScreen; + float2 prevScreenPosition = screenPosition - screenVelocity; + + return prevScreenPosition * float2(0.5, -0.5) + 0.5; } -inline float HdrWeight4(float3 color, float exposure) +float GetDisocclusion(float depth, float depthHistory) { - return rcp(Luma4(color) * exposure + 4.0f); + float lineardepthCurrent = compute_lineardepth(depth); + float lineardepthHistory = compute_lineardepth(depthHistory); + + float disocclusion = 1.0 + //* exp(-abs(1.0 - max(0.0, dot(normal, normalHistory))) * disocclusionNormalWeight) // Potential normal check if necessary + * exp(-abs(lineardepthHistory - lineardepthCurrent) / lineardepthCurrent * disocclusionDepthWeight); + + return disocclusion; } -float4 clip_aabb(float3 aabb_min, float3 aabb_max, float4 p, float4 q) +float4 SamplePreviousColor(float2 prevUV, float2 size, float depth, out float disocclusion, out float2 prevUVSample) { - float3 p_clip = 0.5 * (aabb_max + aabb_min); - float3 e_clip = 0.5 * (aabb_max - aabb_min) + 0.00000001f; + prevUVSample = prevUV; - float4 v_clip = q - float4(p_clip, p.w); - float3 v_unit = v_clip.xyz / e_clip; - float3 a_unit = abs(v_unit); - float ma_unit = max(a_unit.x, max(a_unit.y, a_unit.z)); + float4 previousColor = texture_color_history.SampleLevel(sampler_linear_clamp, prevUVSample, 0); + float previousDepth = texture_depth_history.SampleLevel(sampler_point_clamp, prevUVSample, 0); - if (ma_unit > 1.0) - return float4(p_clip, p.w) + v_clip / ma_unit; - else - return q; // point inside aabb -} + disocclusion = GetDisocclusion(depth, previousDepth); + if (disocclusion > disocclusionThreshold) // Good enough + { + return previousColor; + } -inline void ResolverAABB(Texture2D currentColor, SamplerState currentSampler, float sharpness, float exposureScale, float AABBScale, float2 uv, float2 texelSize, inout float4 currentMin, inout float4 currentMax, inout float4 currentAverage, inout float4 currentOutput) -{ - const int2 SampleOffset[9] = { int2(-1.0, -1.0), int2(0.0, -1.0), int2(1.0, -1.0), int2(-1.0, 0.0), int2(0.0, 0.0), int2(1.0, 0.0), int2(-1.0, 1.0), int2(0.0, 1.0), int2(1.0, 1.0) }; - - // Modulate Luma HDR - - float4 sampleColors[9]; - [unroll] - for (uint i = 0; i < 9; i++) - { - sampleColors[i] = currentColor.SampleLevel(currentSampler, uv + (SampleOffset[i] / texelSize), 0.0f); - } + // Try to find the closest sample in the vicinity if we are not convinced of a disocclusion + if (disocclusion < disocclusionThreshold) + { + float2 closestUV = prevUVSample; + float2 dudv = rcp(size); - float sampleWeights[9]; - [unroll] - for (uint j = 0; j < 9; j++) - { - sampleWeights[j] = HdrWeight4(sampleColors[j].rgb, exposureScale); - } + const int searchRadius = 1; + for (int y = -searchRadius; y <= searchRadius; y++) + { + for (int x = -searchRadius; x <= searchRadius; x++) + { + int2 offset = int2(x, y); + float2 sampleUV = prevUVSample + offset * dudv; - float totalWeight = 0; - [unroll] - for (uint k = 0; k < 9; k++) - { - totalWeight += sampleWeights[k]; - } - sampleColors[4] = (sampleColors[0] * sampleWeights[0] + sampleColors[1] * sampleWeights[1] + sampleColors[2] * sampleWeights[2] + sampleColors[3] * sampleWeights[3] + sampleColors[4] * sampleWeights[4] + - sampleColors[5] * sampleWeights[5] + sampleColors[6] * sampleWeights[6] + sampleColors[7] * sampleWeights[7] + sampleColors[8] * sampleWeights[8]) / totalWeight; + float samplePreviousDepth = texture_depth_history.SampleLevel(sampler_point_clamp, sampleUV, 0); - // Variance Clipping (AABB) - - float4 m1 = 0.0; - float4 m2 = 0.0; - [unroll] - for (uint x = 0; x < 9; x++) - { - m1 += sampleColors[x]; - m2 += sampleColors[x] * sampleColors[x]; - } + float weight = GetDisocclusion(depth, samplePreviousDepth); + if (weight > disocclusion) + { + disocclusion = weight; + closestUV = sampleUV; + prevUVSample = closestUV; + } + } + } - float4 mean = m1 / 9.0; - float4 stddev = sqrt((m2 / 9.0) - sqr(mean)); - - currentMin = mean - AABBScale * stddev; - currentMax = mean + AABBScale * stddev; + previousColor = texture_color_history.SampleLevel(sampler_linear_clamp, prevUVSample, 0); + } - currentOutput = sampleColors[4]; - currentMin = min(currentMin, currentOutput); - currentMax = max(currentMax, currentOutput); - currentAverage = mean; + // Bilinear interpolation on fallback - near edges + if (disocclusion < disocclusionThreshold) + { + float2 weight = frac(prevUVSample * size + 0.5); + + // Bilinear weights + float weights[4] = + { + (1 - weight.x) * (1 - weight.y), + weight.x * (1 - weight.y), + (1 - weight.x) * weight.y, + weight.x * weight.y + }; + + float4 previousColorResult = 0; + float previousDepthResult = 0; + float weightSum = 0; + + uint2 prevCoord = uint2(size * prevUVSample - 0.5); + uint2 offsets[4] = { uint2(0, 0), uint2(1, 0), uint2(0, 1), uint2(1, 1) }; + + for (uint i = 0; i < 4; i++) + { + uint2 sampleCoord = prevCoord + offsets[i]; + + previousColorResult += weights[i] * texture_color_history[sampleCoord]; + previousDepthResult += weights[i] * texture_depth_history[sampleCoord]; + + weightSum += weights[i]; + } + + previousColorResult /= max(weightSum, 0.00001); + previousDepthResult /= max(weightSum, 0.00001); + + previousColor = previousColorResult; + disocclusion = GetDisocclusion(depth, previousDepthResult); + } + + disocclusion = disocclusion < disocclusionThreshold ? 0.0 : disocclusion; + return previousColor; } [numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] -void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) +void main(uint3 Gid : SV_GroupID, uint3 GTid : SV_GroupThreadID, uint3 DTid : SV_DispatchThreadID) { - if ((uint)ssr_frame == 0) + if ((uint) ssr_frame == 0) { - output[DTid.xy] = resolve_current[DTid.xy]; + output_color[DTid.xy] = texture_color_current[DTid.xy]; return; } - const float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; - const float depth = texture_depth.SampleLevel(sampler_linear_clamp, uv, 0); - if (depth == 0) - return; + const float depth = texture_depth[DTid.xy]; + const float roughness = texture_surface_roughness[DTid.xy]; - const float2 velocity = texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy; - float2 prevUV = uv + velocity; - if (!is_saturated(prevUV)) + if (!NeedReflection(roughness, depth)) { - output[DTid.xy] = resolve_current[DTid.xy]; + output_color[DTid.xy] = texture_color_current[DTid.xy]; + output_variance[DTid.xy] = 0.0; return; } - const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection); + // Welford's online algorithm: + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - PrimitiveID prim; - prim.unpack(texture_gbuffer0[DTid.xy * 2]); - - Surface surface; - surface.init(); - if (!surface.load(prim, P)) - return; - - const float roughness = surface.roughness; - - if (roughness < 0.01) + float4 m1 = 0.0; + float4 m2 = 0.0; + for (int x = -1; x <= 1; x++) { - output[DTid.xy] = resolve_current[DTid.xy]; - //return; - } - - // Secondary reprojection based on ray lengths: - // https://www.ea.com/seed/news/seed-dd18-presentation-slides-raytracing (Slide 45) - if (roughness < 0.5) - { - float rayLength = rayLengths[DTid.xy]; - if (rayLength > 0) + for (int y = -1; y <= 1; y++) { - const float3 P = reconstruct_position(uv, depth); - const float3 V = normalize(GetCamera().position - P); - const float3 rayEnd = P - V * rayLength; - float4 rayEndPrev = mul(GetCamera().previous_view_projection, float4(rayEnd, 1)); - rayEndPrev.xy /= rayEndPrev.w; - prevUV = rayEndPrev.xy * float2(0.5, -0.5) + 0.5; + int2 offset = int2(x, y); + int2 coord = DTid.xy + offset; + + float4 sampleColor = texture_color_current[coord]; + + m1 += sampleColor; + m2 += sampleColor * sampleColor; } } - // Disocclusion fallback: - float depth_current = compute_lineardepth(depth); - float depth_history = compute_lineardepth(texture_depth_history.SampleLevel(sampler_point_clamp, prevUV, 1)); - if (abs(depth_current - depth_history) > 1) + float4 mean = m1 / 9.0; + float4 variance = (m2 / 9.0) - (mean * mean); + float4 stddev = sqrt(max(variance, 0.0f)); + + // Secondary reprojection based on ray lengths: + // https://www.ea.com/seed/news/seed-dd18-presentation-slides-raytracing (Slide 45) + + float2 velocity = texture_gbuffer1[DTid.xy]; + float reprojectionDepth = texture_reprojectionDepth[DTid.xy]; + + float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; + + float2 prevUVVelocity = uv + velocity; + float2 prevUVReflectionHit = CalculateReprojectionBuffer(uv, reprojectionDepth); + + float4 previousColorVelocity = texture_color_history.SampleLevel(sampler_linear_clamp, prevUVVelocity, 0); + float4 previousColorReflectionHit = texture_color_history.SampleLevel(sampler_linear_clamp, prevUVReflectionHit, 0); + + float previousDistanceVelocity = abs(Luminance(previousColorVelocity.rgb) - Luminance(mean.rgb)); + float previousDistanceReflectionHit = abs(Luminance(previousColorReflectionHit.rgb) - Luminance(mean.rgb)); + + float2 prevUV = previousDistanceVelocity < previousDistanceReflectionHit ? prevUVVelocity : prevUVReflectionHit; + + float disocclusion = 0.0; + float2 prevUVSample = 0.0; + float4 previousColor = SamplePreviousColor(prevUV, postprocess.resolution, depth, disocclusion, prevUVSample); + + float4 currentColor = texture_color_current[DTid.xy]; + float4 resultColor = currentColor; + + // Disocclusion fallback: color + if (disocclusion > disocclusionThreshold && is_saturated(prevUVSample)) { - output[DTid.xy] = resolve_current[DTid.xy]; - //output[DTid.xy] = float4(1, 0, 0, 1); - return; + // Color box clamp + float4 colorMin = mean - temporalScale * stddev; + float4 colorMax = mean + temporalScale * stddev; + previousColor = clamp(previousColor, colorMin, colorMax); + + resultColor = lerp(currentColor, previousColor, temporalResponse); } - - float4 previous = resolve_history.SampleLevel(sampler_linear_clamp, prevUV, 0); +#if 0 // Debug + else + { + resultColor = float4(1, 0, 0, 1); + } +#endif - // Luma HDR and AABB minmax - - float4 current = 0; - float4 currentMin, currentMax, currentAverage; - ResolverAABB(resolve_current, sampler_linear_clamp, 0, temporalExposure, temporalScale, uv, postprocess.resolution, currentMin, currentMax, currentAverage, current); + float currentVariance = texture_variance_current[DTid.xy]; + float varianceResponse = varianceTemporalResponse; - previous.xyz = clip_aabb(currentMin.xyz, currentMax.xyz, clamp(currentAverage, currentMin, currentMax), previous).xyz; - previous.a = clamp(previous.a, currentMin.a, currentMax.a); - - // Blend color & history - - float blendFinal = lerp(temporalResponseMin, temporalResponseMax, saturate(1.0 - length(velocity) * 100)); - - float4 result = lerp(current, previous, blendFinal); - - output[DTid.xy] = max(0, result); + // Disocclusion fallback: variance + if (disocclusion < disocclusionThreshold || !is_saturated(prevUVSample)) + { + // Apply white for variance on occlusion. This helps to hide artifacts from temporal + varianceResponse = 0.0f; + currentVariance = 1.0f; + } + + float previousVariance = texture_variance_history.SampleLevel(sampler_linear_clamp, prevUVSample, 0); + float resultVariance = lerp(currentVariance, previousVariance, varianceResponse); + + output_color[DTid.xy] = max(0, resultColor); + output_variance[DTid.xy] = max(0, resultVariance); } diff --git a/WickedEngine/shaders/ssr_tileMaxRoughness_horizontalCS.hlsl b/WickedEngine/shaders/ssr_tileMaxRoughness_horizontalCS.hlsl new file mode 100644 index 000000000..d62c41c12 --- /dev/null +++ b/WickedEngine/shaders/ssr_tileMaxRoughness_horizontalCS.hlsl @@ -0,0 +1,43 @@ +#include "globals.hlsli" +#include "brdf.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2D texture_surface_roughness : register(t0); + +RWTexture2D tile_minmax_roughness_horizontal : register(u0); + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ + const uint2 tile_upperleft = uint2(DTid.x * SSR_TILESIZE, DTid.y); + float minRoughness = 1.0; + float maxRoughness = 0.0; + + uint2 dim; + texture_depth.GetDimensions(dim.x, dim.y); + + [loop] + for (uint i = 0; i < SSR_TILESIZE; ++i) + { + const uint2 pixel = uint2(tile_upperleft.x + i, tile_upperleft.y); + if (pixel.x >= 0 && pixel.y >= 0 && pixel.x < dim.x && pixel.y < dim.y) + { + float depth = texture_depth[pixel]; + if (depth == 0.0) + { + maxRoughness = max(maxRoughness, 1.0); + minRoughness = min(minRoughness, 1.0); + } + else + { + float roughness = texture_surface_roughness[pixel]; + maxRoughness = max(maxRoughness, roughness); + minRoughness = min(minRoughness, roughness); + } + } + } + + tile_minmax_roughness_horizontal[DTid.xy] = float2(minRoughness, maxRoughness); +} diff --git a/WickedEngine/shaders/ssr_tileMaxRoughness_verticalCS.hlsl b/WickedEngine/shaders/ssr_tileMaxRoughness_verticalCS.hlsl new file mode 100644 index 000000000..c8eccaa66 --- /dev/null +++ b/WickedEngine/shaders/ssr_tileMaxRoughness_verticalCS.hlsl @@ -0,0 +1,58 @@ +#include "globals.hlsli" +#include "brdf.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +Texture2D tile_minmax_roughness_horizontal : register(t0); + +RWByteAddressBuffer tile_tracing_statistics : register(u0); +RWStructuredBuffer tiles_tracing_earlyexit : register(u1); +RWStructuredBuffer tiles_tracing_cheap : register(u2); +RWStructuredBuffer tiles_tracing_expensive : register(u3); +RWTexture2D tile_minmax_roughness : register(u4); + +static const float SSRRoughnessCheap = 0.35; + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ + const uint2 tile_upperleft = uint2(DTid.x, DTid.y * SSR_TILESIZE); + float minRoughness = 1.0; + float maxRoughness = 0.0; + + int2 dim; + tile_minmax_roughness_horizontal.GetDimensions(dim.x, dim.y); + + [loop] + for (uint i = 0; i < SSR_TILESIZE; ++i) + { + const uint2 pixel = uint2(tile_upperleft.x, tile_upperleft.y + i); + if (pixel.x >= 0 && pixel.y >= 0 && pixel.x < dim.x && pixel.y < dim.y) + { + float2 minmax_roughness = tile_minmax_roughness_horizontal[pixel]; + minRoughness = min(minRoughness, minmax_roughness.r); + maxRoughness = max(maxRoughness, minmax_roughness.g); + } + } + + const uint tile = (DTid.x & 0xFFFF) | ((DTid.y & 0xFFFF) << 16); + + uint prevCount; + if (minRoughness < SSRRoughnessCheap) + { + tile_tracing_statistics.InterlockedAdd(TILE_STATISTICS_OFFSET_EXPENSIVE, 1, prevCount); + tiles_tracing_expensive[prevCount] = tile; + } + else if (maxRoughness > SSRRoughnessCheap && minRoughness < ReflectionMaxRoughness) + { + tile_tracing_statistics.InterlockedAdd(TILE_STATISTICS_OFFSET_CHEAP, 1, prevCount); + tiles_tracing_cheap[prevCount] = tile; + } + else + { + tile_tracing_statistics.InterlockedAdd(TILE_STATISTICS_OFFSET_EARLYEXIT, 1, prevCount); + tiles_tracing_earlyexit[prevCount] = tile; + } + + tile_minmax_roughness[DTid.xy] = float2(minRoughness, maxRoughness); +} diff --git a/WickedEngine/shaders/stochasticSSRHF.hlsli b/WickedEngine/shaders/stochasticSSRHF.hlsli index 0655d9072..f369a95a7 100644 --- a/WickedEngine/shaders/stochasticSSRHF.hlsli +++ b/WickedEngine/shaders/stochasticSSRHF.hlsli @@ -1,162 +1,70 @@ #ifndef WI_STOCHASTICSSR_HF #define WI_STOCHASTICSSR_HF #include "brdf.hlsli" - -// Stochastic Screen Space Reflections reference: -// https://www.ea.com/frostbite/news/stochastic-screen-space-reflections - +#include "ShaderInterop_Postprocess.h" #define GGX_SAMPLE_VISIBLE // Bias used on GGX importance sample when denoising, to remove part of the tale that create a lot more noise. #define GGX_IMPORTANCE_SAMPLE_BIAS 0.1 -// Shared SSR settings: -static const float SSRMaxRoughness = 1.0f; // Specify max roughness, this can improve performance in complex scenes. -static const float SSRIntensity = 1.0f; -static const float SSRResolveConeMip = 1.0f; // Control overall filtering of the importance sampling. -static const float SSRResolveSpatialSize = 3.0f; // Seems to work best with the temporal pass in the [-3;3] range -static const float SSRBlendScreenEdgeFade = 5.0f; +// Shared Reflection settings: +static const float ReflectionMaxRoughness = 0.6f; -// Temporary -static const float BRDFBias = 0.7f; - - -float ComputeRoughnessMaskScale(in float maxRoughness) +uint2 GetReflectionIndirectDispatchCoord(uint3 Gid, uint3 GTid, StructuredBuffer tiles, uint downsample) { - float MaxRoughness = clamp(maxRoughness, 0.01f, 1.0f); - - float roughnessMaskScale = -2.0f / MaxRoughness; - return roughnessMaskScale * 1.0f; // 2.0f & 1.0f + uint tile_replicate = sqr(SSR_TILESIZE / downsample / POSTPROCESS_BLOCKSIZE); + uint tile_idx = Gid.x / tile_replicate; + uint tile_packed = tiles[tile_idx]; + uint2 tile = uint2(tile_packed & 0xFFFF, (tile_packed >> 16) & 0xFFFF); + uint subtile_idx = Gid.x % tile_replicate; + uint2 subtile = unflatten2D(subtile_idx, SSR_TILESIZE / downsample / POSTPROCESS_BLOCKSIZE); + uint2 subtile_upperleft = tile * SSR_TILESIZE / downsample + subtile * POSTPROCESS_BLOCKSIZE; + return subtile_upperleft + unflatten2D(GTid.x, POSTPROCESS_BLOCKSIZE); } -float GetRoughnessFade(in float roughness, in float maxRoughness) +bool NeedReflection(float roughness, float depth) { - float roughnessMaskScale = ComputeRoughnessMaskScale(maxRoughness); - return min(roughness * roughnessMaskScale + 2, 1.0f); -} - -float GetRoughness(float roughness) -{ - return max(roughness, 0.02f); -} - -float Luminance(float3 color) -{ - return dot(color, float3(0.2126, 0.7152, 0.0722)); -} - -// Fast RNG inspired by PCG (Permuted Congruential Generator) - Based on Epic Games (Unreal Engine) -// Returns three elements with 16 random bits each (0-0xffff (65535)). -uint3 Rand_PCG16(int3 i) -{ - // Epic Games had good results by interpreting signed values as unsigned. - uint3 r = uint3(i); - - // Linear congruential generator - // A simple but very fast pseudorandom number generator - // see: https://en.wikipedia.org/wiki/Linear_congruential_generator - r = r * 1664525u + 1013904223u; // LCG set from 'Numerical Recipes' - - // Final shuffle - // In the original PCG code, they used xorshift for their final shuffle. - // According to Epic Games, they would do simple Feistel steps instead since xorshift is expensive. - // They would then use r.x, r.y and r.z as parts to create something persistence with few instructions. - r.x += r.y * r.z; - r.y += r.z * r.x; - r.z += r.x * r.y; - - r.x += r.y * r.z; - r.y += r.z * r.x; - r.z += r.x * r.y; - - // PCG would then shuffle the top 16 bits thoroughly. - return r >> 16u; -} - -// Hammersley sequence manipulated by a random value and returns top 16 bits -float2 HammersleyRandom16(uint idx, uint num, uint2 random) -{ - // Reverse Bits 32 - uint bits = idx; - bits = (bits << 16u) | (bits >> 16u); - bits = ((bits & 0x55555555u) << 1u) | ((bits & 0xAAAAAAAAu) >> 1u); - bits = ((bits & 0x33333333u) << 2u) | ((bits & 0xCCCCCCCCu) >> 2u); - bits = ((bits & 0x0F0F0F0Fu) << 4u) | ((bits & 0xF0F0F0F0u) >> 4u); - bits = ((bits & 0x00FF00FFu) << 8u) | ((bits & 0xFF00FF00u) >> 8u); - - float E1 = frac(float(idx / num) + float(random.x) * 1.52587890625e-5); // / 0xffff (rcp(65536) ) - float E2 = float((bits >> 16) ^ random.y) * 1.52587890625e-5; // Shift reverse bits by 16 and compare bits with random - return float2(E1, E2); -} - -float2 HammersleyRandom16(uint idx, uint2 random) -{ - uint bits = idx; - bits = (bits << 16u) | (bits >> 16u); - bits = ((bits & 0x55555555u) << 1u) | ((bits & 0xAAAAAAAAu) >> 1u); - bits = ((bits & 0x33333333u) << 2u) | ((bits & 0xCCCCCCCCu) >> 2u); - bits = ((bits & 0x0F0F0F0Fu) << 4u) | ((bits & 0xF0F0F0F0u) >> 4u); - bits = ((bits & 0x00FF00FFu) << 8u) | ((bits & 0xFF00FF00u) >> 8u); - - float E1 = frac(float(random.x) * 1.52587890625e-5); // / 0xffff (rcp(65536) ) - float E2 = float((bits >> 16) ^ random.y) * 1.52587890625e-5; // Shift reverse bits by 16 and compare bits with random - return float2(E1, E2); + return (roughness < ReflectionMaxRoughness) && (depth > 0.0); } // Brian Karis, Epic Games "Real Shading in Unreal Engine 4" float4 ImportanceSampleGGX(float2 Xi, float Roughness) { - float m = Roughness * Roughness; - float m2 = m * m; - - float Phi = 2 * PI * Xi.x; - - float CosTheta = sqrt((1.0 - Xi.y) / (1.0 + (m2 - 1.0) * Xi.y)); - float SinTheta = sqrt(max(1e-5, 1.0 - CosTheta * CosTheta)); - - float3 H; - H.x = SinTheta * cos(Phi); - H.y = SinTheta * sin(Phi); - H.z = CosTheta; - - float d = (CosTheta * m2 - CosTheta) * CosTheta + 1; - float D = m2 / (PI * d * d); - float pdf = D * CosTheta; + float m = Roughness * Roughness; + float m2 = m * m; - return float4(H, pdf); + float Phi = 2 * PI * Xi.x; + + float CosTheta = sqrt((1.0 - Xi.y) / (1.0 + (m2 - 1.0) * Xi.y)); + float SinTheta = sqrt(max(1e-5, 1.0 - CosTheta * CosTheta)); + + float3 H; + H.x = SinTheta * cos(Phi); + H.y = SinTheta * sin(Phi); + H.z = CosTheta; + + float d = (CosTheta * m2 - CosTheta) * CosTheta + 1; + float D = m2 / (PI * d * d); + float pdf = D * CosTheta; + + return float4(H, pdf); } // [ Duff et al. 2017, "Building an Orthonormal Basis, Revisited" ] // http://jcgt.org/published/0006/01/01/ float3x3 GetTangentBasis(float3 TangentZ) { - const float Sign = TangentZ.z >= 0 ? 1 : -1; - const float a = -rcp(Sign + TangentZ.z); - const float b = TangentZ.x * TangentZ.y * a; - - float3 TangentX = { 1 + Sign * a * pow(TangentZ.x, 2), Sign * b, -Sign * TangentZ.x }; - float3 TangentY = { b, Sign + a * pow(TangentZ.y, 2), -TangentZ.y }; + const float Sign = TangentZ.z >= 0 ? 1 : -1; + const float a = -rcp(Sign + TangentZ.z); + const float b = TangentZ.x * TangentZ.y * a; - return float3x3(TangentX, TangentY, TangentZ); + float3 TangentX = { 1 + Sign * a * pow(TangentZ.x, 2), Sign * b, -Sign * TangentZ.x }; + float3 TangentY = { b, Sign + a * pow(TangentZ.y, 2), -TangentZ.y }; + + return float3x3(TangentX, TangentY, TangentZ); } -float3 TangentToWorld(float3 vec, float3 tangentZ) -{ - return mul(vec, GetTangentBasis(tangentZ)); -} - -float4 TangentToWorld(float4 H, float3 tangentZ) -{ - return float4(mul(H.xyz, GetTangentBasis(tangentZ)), H.w); -} - -float3 WorldToTangent(float3 vec, float3 tangentZ) -{ - return mul(GetTangentBasis(tangentZ), vec); -} - - float2 SampleDisk(float2 Xi) { float theta = 2 * PI * Xi.x; @@ -209,5 +117,9 @@ float4 ImportanceSampleVisibleGGX(float2 diskXi, float roughness, float3 V) return float4(H, PDF); } +float Luminance(float3 color) +{ + return dot(color, float3(0.2126, 0.7152, 0.0722)); +} #endif // WI_STOCHASTICSSR_HF diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h index 6d7482201..54581505a 100644 --- a/WickedEngine/wiEnums.h +++ b/WickedEngine/wiEnums.h @@ -284,10 +284,17 @@ namespace wi::enums CSTYPE_POSTPROCESS_MSAO_BLURUPSAMPLE_PREMIN, CSTYPE_POSTPROCESS_MSAO_BLURUPSAMPLE_PREMIN_BLENDOUT, CSTYPE_POSTPROCESS_RTREFLECTION, + CSTYPE_POSTPROCESS_SSR_SURFACE, + CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_HORIZONTAL, + CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_VERTICAL, + CSTYPE_POSTPROCESS_SSR_KICKJOBS, + CSTYPE_POSTPROCESS_SSR_DEPTHHIERARCHY, CSTYPE_POSTPROCESS_SSR_RAYTRACE, + CSTYPE_POSTPROCESS_SSR_RAYTRACE_EARLYEXIT, + CSTYPE_POSTPROCESS_SSR_RAYTRACE_CHEAP, CSTYPE_POSTPROCESS_SSR_RESOLVE, CSTYPE_POSTPROCESS_SSR_TEMPORAL, - CSTYPE_POSTPROCESS_SSR_MEDIAN, + CSTYPE_POSTPROCESS_SSR_BILATERAL, CSTYPE_POSTPROCESS_LIGHTSHAFTS, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL, diff --git a/WickedEngine/wiRenderPath3D.cpp b/WickedEngine/wiRenderPath3D.cpp index 21e82e4cd..64a0f5106 100644 --- a/WickedEngine/wiRenderPath3D.cpp +++ b/WickedEngine/wiRenderPath3D.cpp @@ -1599,8 +1599,8 @@ void RenderPath3D::setSSREnabled(bool value) TextureDesc desc; desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; desc.format = Format::R16G16B16A16_FLOAT; - desc.width = internalResolution.x / 2; - desc.height = internalResolution.y / 2; + desc.width = internalResolution.x; + desc.height = internalResolution.y; desc.layout = ResourceState::SHADER_RESOURCE_COMPUTE; device->CreateTexture(&desc, nullptr, &rtSSR); device->SetName(&rtSSR, "rtSSR"); @@ -1625,8 +1625,8 @@ void RenderPath3D::setRaytracedReflectionsEnabled(bool value) TextureDesc desc; desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; desc.format = Format::R11G11B10_FLOAT; - desc.width = internalResolution.x / 2; - desc.height = internalResolution.y / 2; + desc.width = internalResolution.x; + desc.height = internalResolution.y; device->CreateTexture(&desc, nullptr, &rtSSR); device->SetName(&rtSSR, "rtSSR"); diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index 4331b11ff..57053caa4 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -934,10 +934,17 @@ void LoadShaders() wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_MSAO_BLURUPSAMPLE_BLENDOUT], "msao_blurupsampleCS_blendout.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_MSAO_BLURUPSAMPLE_PREMIN], "msao_blurupsampleCS_premin.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_MSAO_BLURUPSAMPLE_PREMIN_BLENDOUT], "msao_blurupsampleCS_premin_blendout.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_SURFACE], "ssr_surfaceCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_HORIZONTAL], "ssr_tileMaxRoughness_horizontalCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_VERTICAL], "ssr_tileMaxRoughness_verticalCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_KICKJOBS], "ssr_kickjobsCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_DEPTHHIERARCHY], "ssr_depthHierarchyCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE], "ssr_raytraceCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE_EARLYEXIT], "ssr_raytraceCS_earlyexit.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE_CHEAP], "ssr_raytraceCS_cheap.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_RESOLVE], "ssr_resolveCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_TEMPORAL], "ssr_temporalCS.cso"); }); - wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_MEDIAN], "ssr_medianCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_BILATERAL], "ssr_bilateralCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_LIGHTSHAFTS], "lightShaftsCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL], "depthoffield_tileMaxCOC_horizontalCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL], "depthoffield_tileMaxCOC_verticalCS.cso"); }); @@ -9382,21 +9389,44 @@ void CreateRTReflectionResources(RTReflectionResources& res, XMUINT2 resolution) { res.frame = 0; + TextureDesc surface_desc; + surface_desc.type = TextureDesc::Type::TEXTURE_2D; + surface_desc.width = resolution.x; + surface_desc.height = resolution.y; + surface_desc.format = Format::R8G8B8A8_SNORM; + surface_desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; + device->CreateTexture(&surface_desc, nullptr, &res.texture_surface_normal); + surface_desc.format = Format::R8_UNORM; + device->CreateTexture(&surface_desc, nullptr, &res.texture_surface_roughness); + surface_desc.format = Format::R11G11B10_FLOAT; + device->CreateTexture(&surface_desc, nullptr, &res.texture_surface_environment); + TextureDesc desc; + desc.type = TextureDesc::Type::TEXTURE_2D; desc.width = resolution.x / 2; desc.height = resolution.y / 2; desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; desc.layout = ResourceState::SHADER_RESOURCE_COMPUTE; - desc.format = Format::R11G11B10_FLOAT; - device->CreateTexture(&desc, nullptr, &res.temporal[0]); - device->SetName(&res.temporal[0], "rtreflection_temporal[0]"); - device->CreateTexture(&desc, nullptr, &res.temporal[1]); - device->SetName(&res.temporal[1], "rtreflection_temporal[1]"); - + desc.format = Format::R16G16B16A16_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_rayIndirectSpecular); + device->CreateTexture(&desc, nullptr, &res.texture_rayDirectionPDF); desc.format = Format::R16_FLOAT; - device->CreateTexture(&desc, nullptr, &res.rayLengths); - device->SetName(&res.rayLengths, "rtreflection_rayLengths"); + device->CreateTexture(&desc, nullptr, &res.texture_rayLengths); + device->SetName(&res.texture_rayLengths, "ssr_rayLengths"); + + desc.width = resolution.x; + desc.height = resolution.y; + desc.format = Format::R16G16B16A16_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_resolve); + device->CreateTexture(&desc, nullptr, &res.texture_temporal[0]); + device->CreateTexture(&desc, nullptr, &res.texture_temporal[1]); + device->CreateTexture(&desc, nullptr, &res.texture_bilateral_temp); + desc.format = Format::R16_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_resolve_variance); + device->CreateTexture(&desc, nullptr, &res.texture_resolve_reprojectionDepth); + device->CreateTexture(&desc, nullptr, &res.texture_temporal_variance[0]); + device->CreateTexture(&desc, nullptr, &res.texture_temporal_variance[1]); } void Postprocess_RTReflection( const RTReflectionResources& res, @@ -9414,90 +9444,206 @@ void Postprocess_RTReflection( return; device->EventBegin("Postprocess_RTReflection", cmd); - auto prof_range = wi::profiler::BeginRangeGPU("RTReflection", cmd); - - const TextureDesc& desc = output.desc; - -#ifdef RTREFLECTION_WITH_RAYTRACING_PIPELINE - device->BindRaytracingPipelineState(&RTPSO_reflection, cmd); -#else - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_RTREFLECTION], cmd); -#endif // RTREFLECTION_WITH_RAYTRACING_PIPELINE + auto profilerRange = wi::profiler::BeginRangeGPU("RTReflection", cmd); BindCommonResources(cmd); + // Compute common Raytraced surface properties: + { + device->EventBegin("RTReflection Surface", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_SURFACE], cmd); + + const GPUResource* uavs[] = { + &res.texture_surface_normal, + &res.texture_surface_roughness, + &res.texture_surface_environment, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_surface_normal, res.texture_surface_normal.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_surface_roughness, res.texture_surface_roughness.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_surface_environment, res.texture_surface_environment.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (res.texture_surface_normal.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_surface_normal.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_surface_normal, ResourceState::UNORDERED_ACCESS, res.texture_surface_normal.desc.layout), + GPUBarrier::Image(&res.texture_surface_roughness, ResourceState::UNORDERED_ACCESS, res.texture_surface_roughness.desc.layout), + GPUBarrier::Image(&res.texture_surface_environment, ResourceState::UNORDERED_ACCESS, res.texture_surface_environment.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->EventEnd(cmd); + } + + const TextureDesc& desc = output.desc; + + // Render half-res: PostProcess postprocess; - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; + postprocess.resolution.x = desc.width / 2; + postprocess.resolution.y = desc.height / 2; postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; rtreflection_range = range; rtreflection_frame = (float)res.frame; std::memcpy(&postprocess.params1.x, &instanceInclusionMask, sizeof(instanceInclusionMask)); - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - const GPUResource* uavs[] = { - &output, - &res.rayLengths - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); { - GPUBarrier barriers[] = { - GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.rayLengths, res.rayLengths.desc.layout, ResourceState::UNORDERED_ACCESS), - }; - device->Barrier(barriers, arraysize(barriers), cmd); - } + //device->EventBegin("RTReflection Raytrace pass", cmd); #ifdef RTREFLECTION_WITH_RAYTRACING_PIPELINE - size_t shaderIdentifierSize = device->GetShaderIdentifierSize(); - GraphicsDevice::GPUAllocation shadertable_raygen = device->AllocateGPU(shaderIdentifierSize, cmd); - GraphicsDevice::GPUAllocation shadertable_miss = device->AllocateGPU(shaderIdentifierSize, cmd); - GraphicsDevice::GPUAllocation shadertable_hitgroup = device->AllocateGPU(shaderIdentifierSize, cmd); + device->BindRaytracingPipelineState(&RTPSO_reflection, cmd); +#else + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_RTREFLECTION], cmd); +#endif // RTREFLECTION_WITH_RAYTRACING_PIPELINE - device->WriteShaderIdentifier(&RTPSO_reflection, 0, shadertable_raygen.data); - device->WriteShaderIdentifier(&RTPSO_reflection, 1, shadertable_miss.data); - device->WriteShaderIdentifier(&RTPSO_reflection, 2, shadertable_hitgroup.data); + device->PushConstants(&postprocess, sizeof(postprocess), cmd); - DispatchRaysDesc dispatchraysdesc; - dispatchraysdesc.ray_generation.buffer = &shadertable_raygen.buffer; - dispatchraysdesc.ray_generation.offset = shadertable_raygen.offset; - dispatchraysdesc.ray_generation.size = shaderIdentifierSize; + const GPUResource* resarray[] = { + &res.texture_surface_normal, + &res.texture_surface_roughness, + &res.texture_surface_environment, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); - dispatchraysdesc.miss.buffer = &shadertable_miss.buffer; - dispatchraysdesc.miss.offset = shadertable_miss.offset; - dispatchraysdesc.miss.size = shaderIdentifierSize; - dispatchraysdesc.miss.stride = shaderIdentifierSize; + const GPUResource* uavs[] = { + &res.texture_rayIndirectSpecular, + &res.texture_rayDirectionPDF, + &res.texture_rayLengths + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); - dispatchraysdesc.hit_group.buffer = &shadertable_hitgroup.buffer; - dispatchraysdesc.hit_group.offset = shadertable_hitgroup.offset; - dispatchraysdesc.hit_group.size = shaderIdentifierSize; - dispatchraysdesc.hit_group.stride = shaderIdentifierSize; + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_rayIndirectSpecular, res.texture_rayIndirectSpecular.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_rayDirectionPDF, res.texture_rayDirectionPDF.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_rayLengths, res.texture_rayLengths.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } - dispatchraysdesc.width = desc.width; - dispatchraysdesc.height = desc.height; +#ifdef RTREFLECTION_WITH_RAYTRACING_PIPELINE + size_t shaderIdentifierSize = device->GetShaderIdentifierSize(); + GraphicsDevice::GPUAllocation shadertable_raygen = device->AllocateGPU(shaderIdentifierSize, cmd); + GraphicsDevice::GPUAllocation shadertable_miss = device->AllocateGPU(shaderIdentifierSize, cmd); + GraphicsDevice::GPUAllocation shadertable_hitgroup = device->AllocateGPU(shaderIdentifierSize, cmd); - device->DispatchRays(&dispatchraysdesc, cmd); + device->WriteShaderIdentifier(&RTPSO_reflection, 0, shadertable_raygen.data); + device->WriteShaderIdentifier(&RTPSO_reflection, 1, shadertable_miss.data); + device->WriteShaderIdentifier(&RTPSO_reflection, 2, shadertable_hitgroup.data); + + DispatchRaysDesc dispatchraysdesc; + dispatchraysdesc.ray_generation.buffer = &shadertable_raygen.buffer; + dispatchraysdesc.ray_generation.offset = shadertable_raygen.offset; + dispatchraysdesc.ray_generation.size = shaderIdentifierSize; + + dispatchraysdesc.miss.buffer = &shadertable_miss.buffer; + dispatchraysdesc.miss.offset = shadertable_miss.offset; + dispatchraysdesc.miss.size = shaderIdentifierSize; + dispatchraysdesc.miss.stride = shaderIdentifierSize; + + dispatchraysdesc.hit_group.buffer = &shadertable_hitgroup.buffer; + dispatchraysdesc.hit_group.offset = shadertable_hitgroup.offset; + dispatchraysdesc.hit_group.size = shaderIdentifierSize; + dispatchraysdesc.hit_group.stride = shaderIdentifierSize; + + dispatchraysdesc.width = desc.width / 2; + dispatchraysdesc.height = desc.height / 2; + + device->DispatchRays(&dispatchraysdesc, cmd); #else - device->Dispatch( - (desc.width + 7) / 8, - (desc.height + 3) / 4, - 1, - cmd - ); + device->Dispatch( + (res.texture_rayIndirectSpecular.GetDesc().width + 7) / 8, + (res.texture_rayIndirectSpecular.GetDesc().height + 3) / 4, + 1, + cmd + ); #endif // RTREFLECTION_WITH_RAYTRACING_PIPELINE + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_rayIndirectSpecular, ResourceState::UNORDERED_ACCESS, res.texture_rayIndirectSpecular.desc.layout), + GPUBarrier::Image(&res.texture_rayDirectionPDF, ResourceState::UNORDERED_ACCESS, res.texture_rayDirectionPDF.desc.layout), + GPUBarrier::Image(&res.texture_rayLengths, ResourceState::UNORDERED_ACCESS, res.texture_rayLengths.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + //device->EventEnd(cmd); + } + + // Upscale to full-res: + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + // Resolve pass: { - GPUBarrier barriers[] = { - GPUBarrier::Memory(), - GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), - GPUBarrier::Image(&res.rayLengths, ResourceState::UNORDERED_ACCESS, res.rayLengths.desc.layout), + device->EventBegin("RTReflection Resolve pass", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RESOLVE], cmd); + + const GPUResource* resarray[] = { + &res.texture_surface_normal, + &res.texture_surface_roughness, + &res.texture_rayIndirectSpecular, + &res.texture_rayDirectionPDF, + &res.texture_rayLengths, }; - device->Barrier(barriers, arraysize(barriers), cmd); + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &res.texture_resolve, + &res.texture_resolve_variance, + &res.texture_resolve_reprojectionDepth, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_resolve, res.texture_resolve.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_resolve_variance, res.texture_resolve_variance.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_resolve_reprojectionDepth, res.texture_resolve_reprojectionDepth.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (res.texture_resolve.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_resolve.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_resolve, ResourceState::UNORDERED_ACCESS, res.texture_resolve.desc.layout), + GPUBarrier::Image(&res.texture_resolve_variance, ResourceState::UNORDERED_ACCESS, res.texture_resolve_variance.desc.layout), + GPUBarrier::Image(&res.texture_resolve_reprojectionDepth, ResourceState::UNORDERED_ACCESS, res.texture_resolve_reprojectionDepth.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->EventEnd(cmd); } int temporal_output = device->GetFrameCount() % 2; @@ -9505,30 +9651,36 @@ void Postprocess_RTReflection( // Temporal pass: { - device->EventBegin("Temporal pass", cmd); + device->EventBegin("RTReflection Temporal pass", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TEMPORAL], cmd); - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - device->BindResource(&output, 0, cmd); - device->BindResource(&res.temporal[temporal_history], 1, cmd); - device->BindResource(&res.rayLengths, 3, cmd); + const GPUResource* resarray[] = { + &res.texture_surface_roughness, + &res.texture_resolve, + &res.texture_temporal[temporal_history], + &res.texture_resolve_variance, + &res.texture_temporal_variance[temporal_history], + &res.texture_resolve_reprojectionDepth, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); const GPUResource* uavs[] = { - &res.temporal[temporal_output], + &res.texture_temporal[temporal_output], + &res.texture_temporal_variance[temporal_output], }; device->BindUAVs(uavs, 0, arraysize(uavs), cmd); { GPUBarrier barriers[] = { - GPUBarrier::Image(&res.temporal[temporal_output], res.temporal[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_temporal[temporal_output], res.texture_temporal[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], res.texture_temporal_variance[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), }; device->Barrier(barriers, arraysize(barriers), cmd); } device->Dispatch( - (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_temporal[temporal_output].GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_temporal[temporal_output].GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, 1, cmd ); @@ -9536,7 +9688,8 @@ void Postprocess_RTReflection( { GPUBarrier barriers[] = { GPUBarrier::Memory(), - GPUBarrier::Image(&res.temporal[temporal_output], ResourceState::UNORDERED_ACCESS, res.temporal[temporal_output].desc.layout), + GPUBarrier::Image(&res.texture_temporal[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal[temporal_output].desc.layout), + GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal_variance[temporal_output].desc.layout), }; device->Barrier(barriers, arraysize(barriers), cmd); } @@ -9544,40 +9697,93 @@ void Postprocess_RTReflection( device->EventEnd(cmd); } - // Median blur pass: + // Bilateral blur pass: { - device->EventBegin("Median blur pass", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_MEDIAN], cmd); - - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - device->BindResource(&res.temporal[temporal_output], 0, cmd); - - const GPUResource* uavs[] = { - &output, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + device->EventBegin("RTReflection Bilateral blur pass", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_BILATERAL], cmd); + // Horizontal: { - GPUBarrier barriers[] = { - GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), + postprocess.params0.x = 1; + postprocess.params0.y = 0; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + const GPUResource* resarray[] = { + &res.texture_temporal[temporal_output], + &res.texture_temporal_variance[temporal_output], + &res.texture_surface_normal, + &res.texture_surface_roughness, }; - device->Barrier(barriers, arraysize(barriers), cmd); + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &res.texture_bilateral_temp, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_bilateral_temp, res.texture_bilateral_temp.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (res.texture_bilateral_temp.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_bilateral_temp.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_bilateral_temp, ResourceState::UNORDERED_ACCESS, res.texture_bilateral_temp.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } } - device->Dispatch( - (output.desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (output.desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - 1, - cmd - ); - + // Vertical: { - GPUBarrier barriers[] = { - GPUBarrier::Memory(), - GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), + postprocess.params0.x = 0; + postprocess.params0.y = 1; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + const GPUResource* resarray[] = { + &res.texture_bilateral_temp, + &res.texture_temporal_variance[temporal_output], + &res.texture_surface_normal, + &res.texture_surface_roughness, }; - device->Barrier(barriers, arraysize(barriers), cmd); + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &output, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } } device->EventEnd(cmd); @@ -9585,13 +9791,48 @@ void Postprocess_RTReflection( res.frame++; - wi::profiler::EndRange(prof_range); + wi::profiler::EndRange(profilerRange); device->EventEnd(cmd); } void CreateSSRResources(SSRResources& res, XMUINT2 resolution) { res.frame = 0; + TextureDesc surface_desc; + surface_desc.type = TextureDesc::Type::TEXTURE_2D; + surface_desc.width = resolution.x; + surface_desc.height = resolution.y; + surface_desc.format = Format::R8G8B8A8_SNORM; + surface_desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; + device->CreateTexture(&surface_desc, nullptr, &res.texture_surface_normal); + surface_desc.format = Format::R8_UNORM; + device->CreateTexture(&surface_desc, nullptr, &res.texture_surface_roughness); + + TextureDesc tile_desc; + tile_desc.type = TextureDesc::Type::TEXTURE_2D; + tile_desc.width = (resolution.x + SSR_TILESIZE - 1) / SSR_TILESIZE; + tile_desc.height = (resolution.y + SSR_TILESIZE - 1) / SSR_TILESIZE; + tile_desc.format = Format::R16G16_FLOAT; + tile_desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; + device->CreateTexture(&tile_desc, nullptr, &res.texture_tile_minmax_roughness); + + tile_desc.height = resolution.y; + device->CreateTexture(&tile_desc, nullptr, &res.texture_tile_minmax_roughness_horizontal); + + GPUBufferDesc bufferdesc; + bufferdesc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; + + bufferdesc.size = TILE_STATISTICS_CAPACITY * sizeof(uint); + bufferdesc.misc_flags = ResourceMiscFlag::BUFFER_RAW | ResourceMiscFlag::INDIRECT_ARGS; + device->CreateBuffer(&bufferdesc, nullptr, &res.buffer_tile_tracing_statistics); + + bufferdesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; + bufferdesc.stride = sizeof(uint); + bufferdesc.size = tile_desc.width * tile_desc.height * bufferdesc.stride; + device->CreateBuffer(&bufferdesc, nullptr, &res.buffer_tiles_tracing_earlyexit); + device->CreateBuffer(&bufferdesc, nullptr, &res.buffer_tiles_tracing_cheap); + device->CreateBuffer(&bufferdesc, nullptr, &res.buffer_tiles_tracing_expensive); + TextureDesc desc; desc.type = TextureDesc::Type::TEXTURE_2D; desc.width = resolution.x / 2; @@ -9599,13 +9840,39 @@ void CreateSSRResources(SSRResources& res, XMUINT2 resolution) desc.format = Format::R16G16B16A16_FLOAT; desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; desc.layout = ResourceState::SHADER_RESOURCE_COMPUTE; - device->CreateTexture(&desc, nullptr, &res.texture_raytrace); + device->CreateTexture(&desc, nullptr, &res.texture_rayIndirectSpecular); + device->CreateTexture(&desc, nullptr, &res.texture_rayDirectionPDF); + desc.format = Format::R16_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_rayLengths); + device->SetName(&res.texture_rayLengths, "ssr_rayLengths"); + + desc.width = resolution.x; + desc.height = resolution.y; + desc.format = Format::R16G16B16A16_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_resolve); device->CreateTexture(&desc, nullptr, &res.texture_temporal[0]); device->CreateTexture(&desc, nullptr, &res.texture_temporal[1]); - + device->CreateTexture(&desc, nullptr, &res.texture_bilateral_temp); desc.format = Format::R16_FLOAT; - device->CreateTexture(&desc, nullptr, &res.rayLengths); - device->SetName(&res.rayLengths, "ssr_rayLengths"); + device->CreateTexture(&desc, nullptr, &res.texture_resolve_variance); + device->CreateTexture(&desc, nullptr, &res.texture_resolve_reprojectionDepth); + device->CreateTexture(&desc, nullptr, &res.texture_temporal_variance[0]); + device->CreateTexture(&desc, nullptr, &res.texture_temporal_variance[1]); + + desc.width = (uint32_t)std::pow(2.0f, 1.0f + std::floor(std::log2((float)resolution.x / 2))); + desc.height = (uint32_t)std::pow(2.0f, 1.0f + std::floor(std::log2((float)resolution.y / 2))); + desc.format = Format::R32G32_FLOAT; + desc.mip_levels = 1 + (uint32_t)std::floor(std::log2f(std::max((float)desc.width, (float)desc.height))); + device->CreateTexture(&desc, nullptr, &res.texture_depth_hierarchy); + + for (uint32_t i = 0; i < desc.mip_levels; ++i) + { + int subresource_index; + subresource_index = device->CreateSubresource(&res.texture_depth_hierarchy, SubresourceType::SRV, 0, 1, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_depth_hierarchy, SubresourceType::UAV, 0, 1, i, 1); + assert(subresource_index == i); + } } void Postprocess_SSR( const SSRResources& res, @@ -9615,47 +9882,33 @@ void Postprocess_SSR( ) { device->EventBegin("Postprocess_SSR", cmd); - auto range = wi::profiler::BeginRangeGPU("SSR", cmd); + auto range = wi::profiler::BeginRangeGPU("Screen Space Reflections", cmd); BindCommonResources(cmd); - const TextureDesc& input_desc = input.GetDesc(); - const TextureDesc& desc = output.GetDesc(); - - PostProcess postprocess; - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; - postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; - postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; - ssr_input_maxmip = float(input_desc.mip_levels - 1); - ssr_input_resolution_max = (float)std::max(input_desc.width, input_desc.height); - ssr_frame = (float)res.frame; - - // Raytrace pass: + // Compute common SSR surface properties: { - device->EventBegin("Stochastic Raytrace pass", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE], cmd); - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - device->BindResource(&input, 0, cmd); + device->EventBegin("SSR Surface", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_SURFACE], cmd); const GPUResource* uavs[] = { - &res.texture_raytrace, - &res.rayLengths + &res.texture_surface_normal, + &res.texture_surface_roughness, }; device->BindUAVs(uavs, 0, arraysize(uavs), cmd); { GPUBarrier barriers[] = { - GPUBarrier::Image(&res.texture_raytrace, res.texture_raytrace.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_surface_normal, res.texture_surface_normal.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_surface_roughness, res.texture_surface_roughness.desc.layout, ResourceState::UNORDERED_ACCESS), }; device->Barrier(barriers, arraysize(barriers), cmd); } device->Dispatch( - (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_surface_normal.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_surface_normal.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, 1, cmd ); @@ -9663,7 +9916,8 @@ void Postprocess_SSR( { GPUBarrier barriers[] = { GPUBarrier::Memory(), - GPUBarrier::Image(&res.texture_raytrace, ResourceState::UNORDERED_ACCESS, res.texture_raytrace.desc.layout), + GPUBarrier::Image(&res.texture_surface_normal, ResourceState::UNORDERED_ACCESS, res.texture_surface_normal.desc.layout), + GPUBarrier::Image(&res.texture_surface_roughness, ResourceState::UNORDERED_ACCESS, res.texture_surface_roughness.desc.layout), }; device->Barrier(barriers, arraysize(barriers), cmd); } @@ -9671,30 +9925,31 @@ void Postprocess_SSR( device->EventEnd(cmd); } - // Resolve pass: + // Compute tile classification (horizontal): { - device->EventBegin("Resolve pass", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RESOLVE], cmd); - device->PushConstants(&postprocess, sizeof(postprocess), cmd); + device->EventBegin("SSR Tile Classification - Horizontal", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_HORIZONTAL], cmd); - device->BindResource(&res.texture_raytrace, 0, cmd); - device->BindResource(&input, 1, cmd); + const GPUResource* resarray[] = { + &res.texture_surface_roughness, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); const GPUResource* uavs[] = { - &output, + &res.texture_tile_minmax_roughness_horizontal, }; device->BindUAVs(uavs, 0, arraysize(uavs), cmd); { GPUBarrier barriers[] = { - GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_tile_minmax_roughness_horizontal, res.texture_tile_minmax_roughness_horizontal.desc.layout, ResourceState::UNORDERED_ACCESS), }; device->Barrier(barriers, arraysize(barriers), cmd); } device->Dispatch( - (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_tile_minmax_roughness_horizontal.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_tile_minmax_roughness_horizontal.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, 1, cmd ); @@ -9702,7 +9957,286 @@ void Postprocess_SSR( { GPUBarrier barriers[] = { GPUBarrier::Memory(), - GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), + GPUBarrier::Image(&res.texture_tile_minmax_roughness_horizontal, ResourceState::UNORDERED_ACCESS, res.texture_tile_minmax_roughness_horizontal.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->EventEnd(cmd); + } + + // Compute tile classification (vertical): + { + device->EventBegin("SSR Tile Classification - Vertical", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_VERTICAL], cmd); + + const GPUResource* resarray[] = { + &res.texture_tile_minmax_roughness_horizontal, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &res.buffer_tile_tracing_statistics, + &res.buffer_tiles_tracing_earlyexit, + &res.buffer_tiles_tracing_cheap, + &res.buffer_tiles_tracing_expensive, + &res.texture_tile_minmax_roughness, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_tile_minmax_roughness, res.texture_tile_minmax_roughness.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (res.texture_tile_minmax_roughness.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_tile_minmax_roughness.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_tile_minmax_roughness, ResourceState::UNORDERED_ACCESS, res.texture_tile_minmax_roughness.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->EventEnd(cmd); + } + + // Kick indirect tile jobs: + { + device->EventBegin("SSR Kickjobs", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_KICKJOBS], cmd); + + const GPUResource* uavs[] = { + &res.buffer_tile_tracing_statistics, + &res.buffer_tiles_tracing_earlyexit, + &res.buffer_tiles_tracing_cheap, + &res.buffer_tiles_tracing_expensive, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + device->Dispatch(1, 1, 1, cmd); + + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Buffer(&res.buffer_tile_tracing_statistics, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + + device->EventEnd(cmd); + } + + PostProcess postprocess; + + // Depth hierarchy: + { + device->EventBegin("SSR Depth hierarchy pass", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_DEPTHHIERARCHY], cmd); + + TextureDesc hierarchyDesc = res.texture_depth_hierarchy.GetDesc(); + + { + device->BindUAV(&res.texture_depth_hierarchy, 0, cmd, 0); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_depth_hierarchy, res.texture_depth_hierarchy.desc.layout, ResourceState::UNORDERED_ACCESS, 0), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + postprocess.params0.x = (float)hierarchyDesc.width; + postprocess.params0.y = (float)hierarchyDesc.height; + postprocess.params0.z = 1.0f; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + std::max(1u, hierarchyDesc.width / POSTPROCESS_BLOCKSIZE), + std::max(1u, hierarchyDesc.height / POSTPROCESS_BLOCKSIZE), + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_depth_hierarchy, ResourceState::UNORDERED_ACCESS, res.texture_depth_hierarchy.desc.layout, 0), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + } + + for (uint32_t i = 1; i < hierarchyDesc.mip_levels; i++) + { + device->BindResource(&res.texture_depth_hierarchy, 0, cmd, i - 1); + device->BindUAV(&res.texture_depth_hierarchy, 0, cmd, i); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_depth_hierarchy, res.texture_depth_hierarchy.desc.layout, ResourceState::UNORDERED_ACCESS, i), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + hierarchyDesc.width /= 2; + hierarchyDesc.height /= 2; + + hierarchyDesc.width = std::max(1u, hierarchyDesc.width); + hierarchyDesc.height = std::max(1u, hierarchyDesc.height); + + postprocess.params0.x = (float)hierarchyDesc.width; + postprocess.params0.y = (float)hierarchyDesc.height; + postprocess.params0.z = 0.0f; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + std::max(1u, hierarchyDesc.width / POSTPROCESS_BLOCKSIZE), + std::max(1u, hierarchyDesc.height / POSTPROCESS_BLOCKSIZE), + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_depth_hierarchy, ResourceState::UNORDERED_ACCESS, res.texture_depth_hierarchy.desc.layout, i), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + } + + device->EventEnd(cmd); + } + + const TextureDesc& desc = output.GetDesc(); + + // Render half-res: + postprocess.resolution.x = desc.width / 2; + postprocess.resolution.y = desc.height / 2; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + + // Factor to scale ratio between hierarchy and trace pass + postprocess.params1.x = (float)postprocess.resolution.x / (float)res.texture_depth_hierarchy.GetDesc().width; + postprocess.params1.y = (float)postprocess.resolution.y / (float)res.texture_depth_hierarchy.GetDesc().height; + postprocess.params1.z = 1.0f / postprocess.params1.x; + postprocess.params1.w = 1.0f / postprocess.params1.y; + ssr_frame = (float)res.frame; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + // Raytrace pass: + { + device->EventBegin("SSR Raytrace pass", cmd); + + const GPUResource* resarray[] = { + &res.texture_surface_normal, + &res.texture_surface_roughness, + &res.texture_depth_hierarchy, + &input, + &res.buffer_tiles_tracing_earlyexit, + &res.buffer_tiles_tracing_cheap, + &res.buffer_tiles_tracing_expensive + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &res.texture_rayIndirectSpecular, + &res.texture_rayDirectionPDF, + &res.texture_rayLengths + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Buffer(&res.buffer_tiles_tracing_earlyexit, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + GPUBarrier::Buffer(&res.buffer_tiles_tracing_cheap, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + GPUBarrier::Buffer(&res.buffer_tiles_tracing_expensive, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + GPUBarrier::Image(&res.texture_rayIndirectSpecular, res.texture_rayIndirectSpecular.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_rayDirectionPDF, res.texture_rayDirectionPDF.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_rayLengths, res.texture_rayLengths.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE_EARLYEXIT], cmd); + device->DispatchIndirect(&res.buffer_tile_tracing_statistics, INDIRECT_OFFSET_EARLYEXIT, cmd); + + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE_CHEAP], cmd); + device->DispatchIndirect(&res.buffer_tile_tracing_statistics, INDIRECT_OFFSET_CHEAP, cmd); + + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE], cmd); + device->DispatchIndirect(&res.buffer_tile_tracing_statistics, INDIRECT_OFFSET_EXPENSIVE, cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_rayIndirectSpecular, ResourceState::UNORDERED_ACCESS, res.texture_rayIndirectSpecular.desc.layout), + GPUBarrier::Image(&res.texture_rayDirectionPDF, ResourceState::UNORDERED_ACCESS, res.texture_rayDirectionPDF.desc.layout), + GPUBarrier::Image(&res.texture_rayLengths, ResourceState::UNORDERED_ACCESS, res.texture_rayLengths.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->EventEnd(cmd); + } + + // Upscale to full-res: + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + // Resolve pass: + { + device->EventBegin("SSR Resolve pass", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RESOLVE], cmd); + + const GPUResource* resarray[] = { + &res.texture_surface_normal, + &res.texture_surface_roughness, + &res.texture_rayIndirectSpecular, + &res.texture_rayDirectionPDF, + &res.texture_rayLengths, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &res.texture_resolve, + &res.texture_resolve_variance, + &res.texture_resolve_reprojectionDepth, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_resolve, res.texture_resolve.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_resolve_variance, res.texture_resolve_variance.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_resolve_reprojectionDepth, res.texture_resolve_reprojectionDepth.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (res.texture_resolve.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_resolve.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_resolve, ResourceState::UNORDERED_ACCESS, res.texture_resolve.desc.layout), + GPUBarrier::Image(&res.texture_resolve_variance, ResourceState::UNORDERED_ACCESS, res.texture_resolve_variance.desc.layout), + GPUBarrier::Image(&res.texture_resolve_reprojectionDepth, ResourceState::UNORDERED_ACCESS, res.texture_resolve_reprojectionDepth.desc.layout), }; device->Barrier(barriers, arraysize(barriers), cmd); } @@ -9715,29 +10249,36 @@ void Postprocess_SSR( // Temporal pass: { - device->EventBegin("Temporal pass", cmd); + device->EventBegin("SSR Temporal pass", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TEMPORAL], cmd); - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - device->BindResource(&output, 0, cmd); - device->BindResource(&res.texture_temporal[temporal_history], 1, cmd); - device->BindResource(&res.rayLengths, 3, cmd); + const GPUResource* resarray[] = { + &res.texture_surface_roughness, + &res.texture_resolve, + &res.texture_temporal[temporal_history], + &res.texture_resolve_variance, + &res.texture_temporal_variance[temporal_history], + &res.texture_resolve_reprojectionDepth, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); const GPUResource* uavs[] = { &res.texture_temporal[temporal_output], + &res.texture_temporal_variance[temporal_output], }; device->BindUAVs(uavs, 0, arraysize(uavs), cmd); { GPUBarrier barriers[] = { GPUBarrier::Image(&res.texture_temporal[temporal_output], res.texture_temporal[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], res.texture_temporal_variance[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), }; device->Barrier(barriers, arraysize(barriers), cmd); } device->Dispatch( - (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_temporal[temporal_output].GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_temporal[temporal_output].GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, 1, cmd ); @@ -9746,6 +10287,7 @@ void Postprocess_SSR( GPUBarrier barriers[] = { GPUBarrier::Memory(), GPUBarrier::Image(&res.texture_temporal[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal[temporal_output].desc.layout), + GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal_variance[temporal_output].desc.layout), }; device->Barrier(barriers, arraysize(barriers), cmd); } @@ -9753,39 +10295,93 @@ void Postprocess_SSR( device->EventEnd(cmd); } - // Median blur pass: + // Bilateral blur pass: { - device->EventBegin("Median blur pass", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_MEDIAN], cmd); - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - device->BindResource(&res.texture_temporal[temporal_output], 0, cmd); - - const GPUResource* uavs[] = { - &output, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + device->EventBegin("SSR Bilateral blur pass", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_BILATERAL], cmd); + // Horizontal: { - GPUBarrier barriers[] = { - GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), + postprocess.params0.x = 1; + postprocess.params0.y = 0; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + const GPUResource* resarray[] = { + &res.texture_temporal[temporal_output], + &res.texture_temporal_variance[temporal_output], + &res.texture_surface_normal, + &res.texture_surface_roughness, }; - device->Barrier(barriers, arraysize(barriers), cmd); + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &res.texture_bilateral_temp, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_bilateral_temp, res.texture_bilateral_temp.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (res.texture_bilateral_temp.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (res.texture_bilateral_temp.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_bilateral_temp, ResourceState::UNORDERED_ACCESS, res.texture_bilateral_temp.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } } - device->Dispatch( - (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - 1, - cmd - ); - + // Vertical: { - GPUBarrier barriers[] = { - GPUBarrier::Memory(), - GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), + postprocess.params0.x = 0; + postprocess.params0.y = 1; + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + const GPUResource* resarray[] = { + &res.texture_bilateral_temp, + &res.texture_temporal_variance[temporal_output], + &res.texture_surface_normal, + &res.texture_surface_roughness, }; - device->Barrier(barriers, arraysize(barriers), cmd); + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &output, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->Dispatch( + (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } } device->EventEnd(cmd); diff --git a/WickedEngine/wiRenderer.h b/WickedEngine/wiRenderer.h index b0a14c69e..6e0c96bc4 100644 --- a/WickedEngine/wiRenderer.h +++ b/WickedEngine/wiRenderer.h @@ -444,8 +444,18 @@ namespace wi::renderer struct RTReflectionResources { mutable int frame = 0; - wi::graphics::Texture temporal[2]; - wi::graphics::Texture rayLengths; + wi::graphics::Texture texture_surface_normal; + wi::graphics::Texture texture_surface_roughness; + wi::graphics::Texture texture_surface_environment; + wi::graphics::Texture texture_rayIndirectSpecular; + wi::graphics::Texture texture_rayDirectionPDF; + wi::graphics::Texture texture_rayLengths; + wi::graphics::Texture texture_resolve; + wi::graphics::Texture texture_resolve_variance; + wi::graphics::Texture texture_resolve_reprojectionDepth; + wi::graphics::Texture texture_temporal[2]; + wi::graphics::Texture texture_temporal_variance[2]; + wi::graphics::Texture texture_bilateral_temp; }; void CreateRTReflectionResources(RTReflectionResources& res, XMUINT2 resolution); void Postprocess_RTReflection( @@ -459,9 +469,24 @@ namespace wi::renderer struct SSRResources { mutable int frame = 0; - wi::graphics::Texture texture_raytrace; - wi::graphics::Texture rayLengths; + wi::graphics::Texture texture_surface_normal; + wi::graphics::Texture texture_surface_roughness; + wi::graphics::Texture texture_tile_minmax_roughness_horizontal; + wi::graphics::Texture texture_tile_minmax_roughness; + wi::graphics::Texture texture_depth_hierarchy; + wi::graphics::Texture texture_rayIndirectSpecular; + wi::graphics::Texture texture_rayDirectionPDF; + wi::graphics::Texture texture_rayLengths; + wi::graphics::Texture texture_resolve; + wi::graphics::Texture texture_resolve_variance; + wi::graphics::Texture texture_resolve_reprojectionDepth; wi::graphics::Texture texture_temporal[2]; + wi::graphics::Texture texture_temporal_variance[2]; + wi::graphics::Texture texture_bilateral_temp; + wi::graphics::GPUBuffer buffer_tile_tracing_statistics; + wi::graphics::GPUBuffer buffer_tiles_tracing_earlyexit; + wi::graphics::GPUBuffer buffer_tiles_tracing_cheap; + wi::graphics::GPUBuffer buffer_tiles_tracing_expensive; }; void CreateSSRResources(SSRResources& res, XMUINT2 resolution); void Postprocess_SSR( diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index 406079d0b..e50de2a7a 100644 --- a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wi::version // minor features, major updates, breaking compatibility changes const int minor = 60; // minor bug fixes, alterations, refactors, updates - const int revision = 27; + const int revision = 28; const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);