diff --git a/Content/Documentation/ScriptingAPI-Documentation.md b/Content/Documentation/ScriptingAPI-Documentation.md index b478ac6bc..fdd9370c3 100644 --- a/Content/Documentation/ScriptingAPI-Documentation.md +++ b/Content/Documentation/ScriptingAPI-Documentation.md @@ -1485,6 +1485,7 @@ It inherits functions from RenderPath2D, so it can render a 2D overlay. - AO_MSAO : int -- enable multi scale screen space ambient occlusion (use in SetAO() function) - SetAOPower(float value) -- applies AO power value if any AO is enabled - SetSSREnabled(bool value) +- SetSSGIEnabled(bool value) - SetRaytracedDiffuseEnabled(bool value) - SetRaytracedReflectionsEnabled(bool value) - SetShadowsEnabled(bool value) diff --git a/Editor/GraphicsWindow.cpp b/Editor/GraphicsWindow.cpp index 6eec425f7..0af03fb53 100644 --- a/Editor/GraphicsWindow.cpp +++ b/Editor/GraphicsWindow.cpp @@ -13,7 +13,7 @@ void GraphicsWindow::Create(EditorComponent* _editor) wi::renderer::SetToDrawGridHelper(true); wi::renderer::SetToDrawDebugCameras(true); - SetSize(XMFLOAT2(580, 1640)); + SetSize(XMFLOAT2(580, 1660)); float step = 21; float itemheight = 18; @@ -940,6 +940,22 @@ void GraphicsWindow::Create(EditorComponent* _editor) AddWidget(&raytracedDiffuseCheckBox); raytracedDiffuseCheckBox.SetEnabled(wi::graphics::GetDevice()->CheckCapability(GraphicsDeviceCapability::RAYTRACING)); + ssgiCheckBox.Create("SSGI: "); + ssgiCheckBox.SetTooltip("Enable Screen Space Global Illumination, this can add a light bounce effect coming from objects on the screen."); + ssgiCheckBox.SetScriptTip("RenderPath3D::SetSSGIEnabled(bool value)"); + ssgiCheckBox.SetSize(XMFLOAT2(hei, hei)); + ssgiCheckBox.SetPos(XMFLOAT2(x + 140, y)); + if (editor->main->config.GetSection("graphics").Has("ssgi")) + { + editor->renderPath->setSSGIEnabled(editor->main->config.GetSection("graphics").GetBool("ssgi")); + } + ssgiCheckBox.OnClick([=](wi::gui::EventArgs args) { + editor->renderPath->setSSGIEnabled(args.bValue); + 
editor->main->config.GetSection("graphics").Set("ssgi", args.bValue); + editor->main->config.Commit(); + }); + AddWidget(&ssgiCheckBox); + raytracedDiffuseRangeSlider.Create(1.0f, 100.0f, 1, 1000, "RTDiffuse.Range: "); raytracedDiffuseRangeSlider.SetText("Range: "); raytracedDiffuseRangeSlider.SetTooltip("Set Reflection ray length for Ray traced diffuse."); @@ -1525,6 +1541,7 @@ void GraphicsWindow::Update() raytracedReflectionsCheckBox.SetCheck(editor->renderPath->getRaytracedReflectionEnabled()); raytracedReflectionsRangeSlider.SetValue(editor->renderPath->getRaytracedReflectionsRange()); raytracedDiffuseCheckBox.SetCheck(editor->renderPath->getRaytracedDiffuseEnabled()); + ssgiCheckBox.SetCheck(editor->renderPath->getSSGIEnabled()); raytracedDiffuseRangeSlider.SetValue(editor->renderPath->getRaytracedDiffuseRange()); screenSpaceShadowsCheckBox.SetCheck(wi::renderer::GetScreenSpaceShadowsEnabled()); screenSpaceShadowsRangeSlider.SetValue((float)editor->renderPath->getScreenSpaceShadowRange()); @@ -1787,6 +1804,8 @@ void GraphicsWindow::ResizeLayout() ssrCheckBox.SetPos(XMFLOAT2(reflectionsRoughnessCutoffSlider.GetPos().x - ssrCheckBox.GetSize().x - 80, reflectionsRoughnessCutoffSlider.GetPos().y)); add_right(raytracedReflectionsRangeSlider); raytracedReflectionsCheckBox.SetPos(XMFLOAT2(raytracedReflectionsRangeSlider.GetPos().x - raytracedReflectionsCheckBox.GetSize().x - 80, raytracedReflectionsRangeSlider.GetPos().y)); + add_right(ssgiCheckBox); + ssgiCheckBox.SetPos(XMFLOAT2(raytracedReflectionsCheckBox.GetPos().x, ssgiCheckBox.GetPos().y)); add_right(raytracedDiffuseRangeSlider); raytracedDiffuseCheckBox.SetPos(XMFLOAT2(raytracedDiffuseRangeSlider.GetPos().x - raytracedDiffuseCheckBox.GetSize().x - 80, raytracedDiffuseRangeSlider.GetPos().y)); add_right(screenSpaceShadowsStepCountSlider); diff --git a/Editor/GraphicsWindow.h b/Editor/GraphicsWindow.h index 2f3dcfe3a..34406d946 100644 --- a/Editor/GraphicsWindow.h +++ b/Editor/GraphicsWindow.h @@ -63,6 +63,7 
@@ public: wi::gui::Slider raytracedReflectionsRangeSlider; wi::gui::CheckBox raytracedDiffuseCheckBox; wi::gui::Slider raytracedDiffuseRangeSlider; + wi::gui::CheckBox ssgiCheckBox; wi::gui::CheckBox screenSpaceShadowsCheckBox; wi::gui::Slider screenSpaceShadowsStepCountSlider; wi::gui::Slider screenSpaceShadowsRangeSlider; diff --git a/Editor/MaterialWindow.cpp b/Editor/MaterialWindow.cpp index 201aca902..af18d2808 100644 --- a/Editor/MaterialWindow.cpp +++ b/Editor/MaterialWindow.cpp @@ -280,7 +280,7 @@ void MaterialWindow::Create(EditorComponent* _editor) }); AddWidget(&alphaRefSlider); - emissiveSlider.Create(0, 1, 0.0f, 1000, "Emissive: "); + emissiveSlider.Create(0, 10, 0.0f, 1000, "Emissive: "); emissiveSlider.SetTooltip("Adjust the light emission of the surface. The color of the light emitted is that of the color of the material."); emissiveSlider.SetSize(XMFLOAT2(wid, hei)); emissiveSlider.SetPos(XMFLOAT2(x, y += step)); diff --git a/WickedEngine/offlineshadercompiler.cpp b/WickedEngine/offlineshadercompiler.cpp index 6499b380b..a61d44cd8 100644 --- a/WickedEngine/offlineshadercompiler.cpp +++ b/WickedEngine/offlineshadercompiler.cpp @@ -67,17 +67,20 @@ wi::vector shaders = { {"ffx-fsr2/ffx_fsr2_accumulate_pass", wi::graphics::ShaderStage::CS}, {"ffx-fsr2/ffx_fsr2_rcas_pass", wi::graphics::ShaderStage::CS}, {"ssaoCS", wi::graphics::ShaderStage::CS}, + {"ssgi_deinterleaveCS", wi::graphics::ShaderStage::CS}, + {"ssgiCS", wi::graphics::ShaderStage::CS}, + {"ssgi_upsampleCS", wi::graphics::ShaderStage::CS}, {"rtdiffuseCS", wi::graphics::ShaderStage::CS, wi::graphics::ShaderModel::SM_6_5}, {"rtdiffuse_spatialCS", wi::graphics::ShaderStage::CS}, {"rtdiffuse_temporalCS", wi::graphics::ShaderStage::CS}, - {"rtdiffuse_bilateralCS", wi::graphics::ShaderStage::CS}, + {"rtdiffuse_upsampleCS", wi::graphics::ShaderStage::CS}, {"rtreflectionCS", wi::graphics::ShaderStage::CS, wi::graphics::ShaderModel::SM_6_5}, {"ssr_tileMaxRoughness_horizontalCS", 
wi::graphics::ShaderStage::CS}, {"ssr_tileMaxRoughness_verticalCS", wi::graphics::ShaderStage::CS}, {"ssr_depthHierarchyCS", wi::graphics::ShaderStage::CS}, {"ssr_resolveCS", wi::graphics::ShaderStage::CS}, {"ssr_temporalCS", wi::graphics::ShaderStage::CS}, - {"ssr_bilateralCS", wi::graphics::ShaderStage::CS}, + {"ssr_upsampleCS", wi::graphics::ShaderStage::CS}, {"ssr_raytraceCS", wi::graphics::ShaderStage::CS}, {"ssr_raytraceCS_cheap", wi::graphics::ShaderStage::CS}, {"ssr_raytraceCS_earlyexit", wi::graphics::ShaderStage::CS}, @@ -484,6 +487,10 @@ int main(int argc, char* argv[]) shaders.back().permutations.emplace_back().defines = x; } + // permutations for ssgiCS: + shaders.push_back({ "ssgiCS", wi::graphics::ShaderStage::CS }); + shaders.back().permutations.emplace_back().defines = {"WIDE"}; + wi::jobsystem::Initialize(); wi::jobsystem::context ctx; diff --git a/WickedEngine/shaders/ShaderInterop_Postprocess.h b/WickedEngine/shaders/ShaderInterop_Postprocess.h index 78b0ef249..6ad3825dd 100644 --- a/WickedEngine/shaders/ShaderInterop_Postprocess.h +++ b/WickedEngine/shaders/ShaderInterop_Postprocess.h @@ -55,6 +55,7 @@ static const uint SSR_TILESIZE = 32; #define rtdiffuse_range ssao_range #define rtdiffuse_frame ssr_frame +#define ssgi_frame ssr_frame #define rtreflection_range ssao_range #define rtreflection_roughness_cutoff ssr_roughness_cutoff diff --git a/WickedEngine/shaders/ShaderInterop_Renderer.h b/WickedEngine/shaders/ShaderInterop_Renderer.h index 7a42561a7..e86d4679d 100644 --- a/WickedEngine/shaders/ShaderInterop_Renderer.h +++ b/WickedEngine/shaders/ShaderInterop_Renderer.h @@ -1047,14 +1047,14 @@ struct ShaderCamera int texture_roughness_index; int buffer_entitytiles_index; - int padding; int texture_reflection_index; int texture_reflection_depth_index; int texture_refraction_index; - int texture_waterriples_index; + int texture_ao_index; int texture_ssr_index; + int texture_ssgi_index; int texture_rtshadow_index; int texture_surfelgi_index; @@ 
-1120,6 +1120,7 @@ struct ShaderCamera texture_waterriples_index = -1; texture_ao_index = -1; texture_ssr_index = -1; + texture_ssgi_index = -1; texture_rtshadow_index = -1; texture_surfelgi_index = -1; texture_depth_index_prev = -1; diff --git a/WickedEngine/shaders/Shaders_SOURCE.vcxitems b/WickedEngine/shaders/Shaders_SOURCE.vcxitems index 4a61c0512..f3e69370d 100644 --- a/WickedEngine/shaders/Shaders_SOURCE.vcxitems +++ b/WickedEngine/shaders/Shaders_SOURCE.vcxitems @@ -882,7 +882,7 @@ Compute 4.0 - + Compute 4.0 @@ -939,7 +939,19 @@ Compute Compute - + + Compute + 4.0 + + + Compute + 4.0 + + + Compute + 4.0 + + Compute 4.0 diff --git a/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters b/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters index 16826db97..1386b4b3a 100644 --- a/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters +++ b/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters @@ -944,7 +944,7 @@ CS - + CS @@ -1010,7 +1010,7 @@ CS - + CS @@ -1145,6 +1145,15 @@ PS + + CS + + + CS + + + CS + diff --git a/WickedEngine/shaders/depthoffield_postfilterCS.hlsl b/WickedEngine/shaders/depthoffield_postfilterCS.hlsl index 071dd98c3..24fcce22e 100644 --- a/WickedEngine/shaders/depthoffield_postfilterCS.hlsl +++ b/WickedEngine/shaders/depthoffield_postfilterCS.hlsl @@ -9,66 +9,6 @@ Texture2D texture_alpha : register(t1); RWTexture2D output_postfilter : register(u0); RWTexture2D output_alpha : register(u1); -#ifndef __PSSL__ -float min3(float a, float b, float c) -{ - return min(min(a, b), c); -} -float max3(float a, float b, float c) -{ - return max(max(a, b), c); -} -float2 min3(float2 a, float2 b, float2 c) -{ - return float2(min3(a.x, b.x, c.x), min3(a.y, b.y, c.y)); -} -float3 min3(float3 a, float3 b, float3 c) -{ - return float3(min3(a.x, b.x, c.x), min3(a.y, b.y, c.y), min3(a.z, b.z, c.z)); -} -float4 min3(float4 a, float4 b, float4 c) -{ - return float4(min3(a.x, b.x, c.x), min3(a.y, b.y, c.y), min3(a.z, b.z, c.z), min3(a.w, b.w, c.w)); -} -float2 
max3(float2 a, float2 b, float2 c) -{ - return float2(max3(a.x, b.x, c.x), max3(a.y, b.y, c.y)); -} -float3 max3(float3 a, float3 b, float3 c) -{ - return float3(max3(a.x, b.x, c.x), max3(a.y, b.y, c.y), max3(a.z, b.z, c.z)); -} -float4 max3(float4 a, float4 b, float4 c) -{ - return float4(max3(a.x, b.x, c.x), max3(a.y, b.y, c.y), max3(a.z, b.z, c.z), max3(a.w, b.w, c.w)); -} -float min4(float4 values) -{ - return min(min3(values.x, values.y, values.z), values.w); -} -float max4(float4 values) -{ - return max(max3(values.x, values.y, values.z), values.w); -} - -float med3(float a, float b, float c) -{ - return max(min(a, b), min(max(a, b), c)); -} -float2 med3(float2 a, float2 b, float2 c) -{ - return float2(med3(a.x, b.x, c.x), med3(a.y, b.y, c.y)); -} -float3 med3(float3 a, float3 b, float3 c) -{ - return float3(med3(a.x, b.x, c.x), med3(a.y, b.y, c.y), med3(a.z, b.z, c.z)); -} -float4 med3(float4 a, float4 b, float4 c) -{ - return float4(med3(a.x, b.x, c.x), med3(a.y, b.y, c.y), med3(a.z, b.z, c.z), med3(a.w, b.w, c.w)); -} -#endif // __PSSL__ - static const float2 squareOffsets[] = { { -1.0, -1.0 }, diff --git a/WickedEngine/shaders/globals.hlsli b/WickedEngine/shaders/globals.hlsli index b46e6de94..9bce0519e 100644 --- a/WickedEngine/shaders/globals.hlsli +++ b/WickedEngine/shaders/globals.hlsli @@ -375,6 +375,77 @@ static const float SKY_UNIT_TO_M = rcp(M_TO_SKY_UNIT); #define sqr(a) ((a)*(a)) #define pow5(x) pow(x, 5) +template +float max3(T v) +{ + return max(max(v.x, v.y), v.z); +} +template +float min3(T v) +{ + return min(min(v.x, v.y), v.z); +} + +#ifndef __PSSL__ +float min3(float a, float b, float c) +{ + return min(min(a, b), c); +} +float max3(float a, float b, float c) +{ + return max(max(a, b), c); +} +float2 min3(float2 a, float2 b, float2 c) +{ + return float2(min3(a.x, b.x, c.x), min3(a.y, b.y, c.y)); +} +float3 min3(float3 a, float3 b, float3 c) +{ + return float3(min3(a.x, b.x, c.x), min3(a.y, b.y, c.y), min3(a.z, b.z, c.z)); +} +float4 
min3(float4 a, float4 b, float4 c) +{ + return float4(min3(a.x, b.x, c.x), min3(a.y, b.y, c.y), min3(a.z, b.z, c.z), min3(a.w, b.w, c.w)); +} +float2 max3(float2 a, float2 b, float2 c) +{ + return float2(max3(a.x, b.x, c.x), max3(a.y, b.y, c.y)); +} +float3 max3(float3 a, float3 b, float3 c) +{ + return float3(max3(a.x, b.x, c.x), max3(a.y, b.y, c.y), max3(a.z, b.z, c.z)); +} +float4 max3(float4 a, float4 b, float4 c) +{ + return float4(max3(a.x, b.x, c.x), max3(a.y, b.y, c.y), max3(a.z, b.z, c.z), max3(a.w, b.w, c.w)); +} +float min4(float4 values) +{ + return min(min3(values.x, values.y, values.z), values.w); +} +float max4(float4 values) +{ + return max(max3(values.x, values.y, values.z), values.w); +} + +float med3(float a, float b, float c) +{ + return max(min(a, b), min(max(a, b), c)); +} +float2 med3(float2 a, float2 b, float2 c) +{ + return float2(med3(a.x, b.x, c.x), med3(a.y, b.y, c.y)); +} +float3 med3(float3 a, float3 b, float3 c) +{ + return float3(med3(a.x, b.x, c.x), med3(a.y, b.y, c.y), med3(a.z, b.z, c.z)); +} +float4 med3(float4 a, float4 b, float4 c) +{ + return float4(med3(a.x, b.x, c.x), med3(a.y, b.y, c.y), med3(a.z, b.z, c.z), med3(a.w, b.w, c.w)); +} +#endif // __PSSL__ + // attribute computation with barycentric interpolation // a0 : attribute at triangle corner 0 // a1 : attribute at triangle corner 1 diff --git a/WickedEngine/shaders/lightingHF.hlsli b/WickedEngine/shaders/lightingHF.hlsli index af0e4e817..2a6c4d870 100644 --- a/WickedEngine/shaders/lightingHF.hlsli +++ b/WickedEngine/shaders/lightingHF.hlsli @@ -36,7 +36,7 @@ struct Lighting inline void ApplyLighting(in Surface surface, in Lighting lighting, inout float4 color) { - float3 diffuse = lighting.direct.diffuse / PI + lighting.indirect.diffuse * GetFrame().gi_boost * (1 - surface.F) * surface.occlusion; + float3 diffuse = lighting.direct.diffuse / PI + lighting.indirect.diffuse * GetFrame().gi_boost * (1 - surface.F) * surface.occlusion + surface.ssgi; float3 specular = 
lighting.direct.specular + lighting.indirect.specular * surface.occlusion; // reminder: cannot apply surface.F for whole indirect specular, because multiple layers have separate fresnels (sheen, clearcoat) color.rgb = lerp(surface.albedo * diffuse, surface.refraction.rgb, surface.refraction.a); color.rgb += specular; diff --git a/WickedEngine/shaders/objectHF.hlsli b/WickedEngine/shaders/objectHF.hlsli index 7488cc78e..813f8720d 100644 --- a/WickedEngine/shaders/objectHF.hlsli +++ b/WickedEngine/shaders/objectHF.hlsli @@ -892,6 +892,11 @@ float4 main(PixelInput input, in bool is_frontface : SV_IsFrontFace) : SV_Target float4 ssr = bindless_textures[GetCamera().texture_ssr_index].SampleLevel(sampler_linear_clamp, ScreenCoord, 0); lighting.indirect.specular = lerp(lighting.indirect.specular, ssr.rgb * surface.F, ssr.a); } + [branch] + if (GetCamera().texture_ssgi_index >= 0) + { + surface.ssgi = bindless_textures[GetCamera().texture_ssgi_index].SampleLevel(sampler_linear_clamp, ScreenCoord, 0).rgb; + } #endif // CARTOON #endif // TRANSPARENT #endif // ENVMAPRENDERING diff --git a/WickedEngine/shaders/rtdiffuse_bilateralCS.hlsl b/WickedEngine/shaders/rtdiffuse_upsampleCS.hlsl similarity index 54% rename from WickedEngine/shaders/rtdiffuse_bilateralCS.hlsl rename to WickedEngine/shaders/rtdiffuse_upsampleCS.hlsl index 8ef7cc5d5..f95efd68e 100644 --- a/WickedEngine/shaders/rtdiffuse_bilateralCS.hlsl +++ b/WickedEngine/shaders/rtdiffuse_upsampleCS.hlsl @@ -13,7 +13,7 @@ static const float depthThreshold = 10000.0; static const float normalThreshold = 1.0; static const float varianceEstimateThreshold = 0.015; // Larger variance values use stronger blur static const float varianceExitThreshold = 0.0025; // Variance needs to be higher than this value to accept blur -static const uint2 bilateralMinMaxRadius = uint2(4, 8); // Chosen by variance +static const uint2 bilateralMinMaxRadius = uint2(0, 8); // Chosen by variance #define BILATERAL_SIGMA 0.9 @@ -22,8 +22,6 @@ void 
main(uint3 DTid : SV_DispatchThreadID) { const float depth = texture_depth[DTid.xy]; - float2 direction = postprocess.params0.xy; - const float linearDepth = texture_lineardepth[DTid.xy]; const float3 N = decode_oct(texture_normal[DTid.xy]); @@ -49,37 +47,41 @@ void main(uint3 DTid : SV_DispatchThreadID) float4 result = 0; float weightSum = 0.0f; - for (int r = -effectiveRadius; r <= effectiveRadius; r++) + for(uint d = 0; d < 2; ++d) { - const int2 sampleCoord = DTid.xy + (direction * r); // Left to right diameter directionally - - if (all(and(sampleCoord >= int2(0, 0), sampleCoord < (int2) postprocess.resolution))) + const int2 direction = d < 1 ? int2(1, 0) : int2(0, 1); + for (int r = -effectiveRadius; r <= effectiveRadius; r++) { - const float sampleDepth = texture_depth[sampleCoord]; - - float2 sampleUV = (sampleCoord + 0.5) * postprocess.resolution_rcp; - const float4 sampleColor = texture_temporal.SampleLevel(sampler_linear_clamp, sampleUV, 0); - - const float3 sampleN = decode_oct(texture_normal[sampleCoord]); - - float3 sampleP = reconstruct_position(sampleUV, sampleDepth); + const int2 sampleCoord = DTid.xy + (direction * r); // Left to right diameter directionally + if (all(and(sampleCoord >= int2(0, 0), sampleCoord < (int2) postprocess.resolution))) { - float3 dq = P - sampleP; - float planeError = max(abs(dot(dq, sampleN)), abs(dot(dq, N))); - float relativeDepthDifference = planeError / (linearDepth * GetCamera().z_far); - float bilateralDepthWeight = exp(-sqr(relativeDepthDifference) * depthThreshold); + const float sampleDepth = texture_depth[sampleCoord]; - float normalError = pow(saturate(dot(sampleN, N)), 4.0); - float bilateralNormalWeight = saturate(1.0 - (1.0 - normalError) * normalThreshold); + float2 sampleUV = (sampleCoord + 0.5) * postprocess.resolution_rcp; + const float4 sampleColor = texture_temporal.SampleLevel(sampler_linear_clamp, sampleUV, 0); - float bilateralWeight = bilateralDepthWeight * bilateralNormalWeight; + const float3 
sampleN = decode_oct(texture_normal[sampleCoord]); - float gaussian = exp(-sqr(r / sigma)); - float weight = (r == 0) ? 1.0 : gaussian * bilateralWeight; // Skip center gaussian peak + float3 sampleP = reconstruct_position(sampleUV, sampleDepth); - result += sampleColor * weight; - weightSum += weight; + { + float3 dq = P - sampleP; + float planeError = max(abs(dot(dq, sampleN)), abs(dot(dq, N))); + float relativeDepthDifference = planeError / (linearDepth * GetCamera().z_far); + float bilateralDepthWeight = exp(-sqr(relativeDepthDifference) * depthThreshold); + + float normalError = pow(saturate(dot(sampleN, N)), 4.0); + float bilateralNormalWeight = saturate(1.0 - (1.0 - normalError) * normalThreshold); + + float bilateralWeight = bilateralDepthWeight * bilateralNormalWeight; + + float gaussian = exp(-sqr(r / sigma)); + float weight = (r == 0) ? 1.0 : gaussian * bilateralWeight; // Skip center gaussian peak + + result += sampleColor * weight; + weightSum += weight; + } } } } diff --git a/WickedEngine/shaders/ssgiCS.hlsl b/WickedEngine/shaders/ssgiCS.hlsl new file mode 100644 index 000000000..c658f7fc7 --- /dev/null +++ b/WickedEngine/shaders/ssgiCS.hlsl @@ -0,0 +1,167 @@ +#include "globals.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2DArray input_depth : register(t0); +Texture2DArray input_color : register(t1); +Texture2D input_normal : register(t2); + +RWTexture2D output_diffuse : register(u0); + +#ifdef WIDE +static const uint THREADCOUNT = 16; +static const int TILE_BORDER = 18; +#else +static const uint THREADCOUNT = 8; +static const int TILE_BORDER = 4; +#endif // WIDE +static const int TILE_SIZE = TILE_BORDER + THREADCOUNT + TILE_BORDER; +groupshared uint cache_xy[TILE_SIZE * TILE_SIZE]; +groupshared float cache_z[TILE_SIZE * TILE_SIZE]; +groupshared uint cache_rgb[TILE_SIZE * TILE_SIZE]; +groupshared uint group_valid; + +inline uint coord_to_cache(int2 coord) +{ + 
return flatten2D(clamp(TILE_BORDER + coord, 0, TILE_SIZE - 1), TILE_SIZE); +} + +static const float depthRejection = 8; +static const float depthRejection_rcp = rcp(depthRejection); + +float3 compute_diffuse( + float3 origin_position, + float3 origin_normal, + int2 GTid, + int2 offset +) +{ + const int2 sampleLoc = GTid + offset; + const uint t = coord_to_cache(sampleLoc); + uint c = cache_rgb[t]; + if(c == 0) + return 0; // early exit if pixel doesn't have lighting + float3 sample_position; + sample_position.z = cache_z[t]; + sample_position.xy = unpack_half2(cache_xy[t]); + const float3 origin_to_sample = sample_position - origin_position; + float occlusion = saturate(dot(origin_normal, origin_to_sample)); // normal falloff + occlusion *= saturate(1 + origin_to_sample.z * depthRejection_rcp); // depth falloff + + if(occlusion > 0) + { + const float origin_z = origin_position.z; + const float sample_z = sample_position.z; + + // DDA occlusion: + const int2 start = GTid; + const int2 goal = sampleLoc; + + const int dx = int(goal.x) - int(start.x); + const int dy = int(goal.y) - int(start.y); + + int step = max(abs(dx), abs(dy)); + step = (step + 1) / 2; // reduce steps + const float step_rcp = rcp(step); + + const float x_incr = float(dx) * step_rcp; + const float y_incr = float(dy) * step_rcp; + + float x = float(start.x); + float y = float(start.y); + + for (int i = 0; i < step - 1; i++) + { + x += x_incr; + y += y_incr; + + const int2 loc = int2(round(x), round(y)); + const uint tt = coord_to_cache(loc); + + const float dt = float(i) / float(step); + const float z = lerp(origin_z, sample_z, dt); + + const float sz = cache_z[tt]; + if(sz < z - 0.1) + { + return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]); + } + } + } + + return occlusion * Unpack_R11G11B10_FLOAT(c); +} + +[numthreads(THREADCOUNT, THREADCOUNT, 1)] +void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : SV_GroupThreadID, uint groupIndex : SV_GroupIndex) +{ + const 
uint layer = DTid.z; + const uint2 interleaved_pixel = DTid.xy << 2 | uint2(DTid.z & 3, DTid.z >> 2); + + if(groupIndex == 0) + { + group_valid = 0; + } + GroupMemoryBarrierWithGroupSync(); + + const int2 tile_upperleft = Gid.xy * THREADCOUNT - TILE_BORDER; + for(uint t = groupIndex; t < TILE_SIZE * TILE_SIZE; t += THREADCOUNT * THREADCOUNT) + { + const int2 pixel = tile_upperleft + unflatten2D(t, TILE_SIZE); + const float depth = input_depth[uint3(pixel, layer)]; + const float2 uv = (pixel + 0.5f) * postprocess.resolution_rcp; + const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection); + const float3 color = input_color[uint3(pixel, layer)]; + const uint pkcolor = Pack_R11G11B10_FLOAT(color.rgb); + cache_xy[t] = pack_half2(P.xy); + cache_z[t] = P.z; + cache_rgb[t] = pkcolor; + if(pkcolor) + InterlockedOr(group_valid, 1u); + } + GroupMemoryBarrierWithGroupSync(); + + [branch] + if (group_valid == 0) + return; // if no valid color was cached, whole group can exit early + + const uint t = coord_to_cache(GTid.xy); + float3 P; + P.z = cache_z[t]; + + [branch] + if(P.z > GetCamera().z_far - 1) + return; // if pixel depth is not valid, it can exit early + + P.xy = unpack_half2(cache_xy[t]); + + const uint2 pixel = DTid.xy; + const float3 N = mul((float3x3)GetCamera().view, decode_oct(input_normal[interleaved_pixel].rg)); + + float3 diffuse = 0; + float sum = 0; + const int range = int(postprocess.params0.x); + const float spread = postprocess.params0.y + dither(pixel); + const float rangespread_rcp2 = postprocess.params0.z; + + for(int x = -range; x <= range; ++x) + { + for(int y = -range; y <= range; ++y) + { + const float2 foffset = float2(x, y) * spread; + const int2 offset = round(foffset); + const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2); + diffuse += compute_diffuse(P, N, GTid, offset) * weight; + sum += weight; + } + } + if(sum > 0) + { + diffuse = diffuse / sum; + } + + // interleave result: + 
output_diffuse[interleaved_pixel] = float4(diffuse, 1); +} diff --git a/WickedEngine/shaders/ssgi_deinterleaveCS.hlsl b/WickedEngine/shaders/ssgi_deinterleaveCS.hlsl new file mode 100644 index 000000000..b392ce3d7 --- /dev/null +++ b/WickedEngine/shaders/ssgi_deinterleaveCS.hlsl @@ -0,0 +1,112 @@ +#include "globals.hlsli" +#include "ShaderInterop_Postprocess.h" + +Texture2D texture_input : register(t0); + +RWTexture2DArray atlas2x_depth : register(u0); +RWTexture2DArray atlas4x_depth : register(u1); +RWTexture2DArray atlas8x_depth : register(u2); +RWTexture2DArray atlas16x_depth : register(u3); +RWTexture2DArray atlas2x_color : register(u4); +RWTexture2DArray atlas4x_color : register(u5); +RWTexture2DArray atlas8x_color : register(u6); +RWTexture2DArray atlas16x_color : register(u7); +RWTexture2D regular2x_depth : register(u8); +RWTexture2D regular2x_normal : register(u9); +RWTexture2D regular4x_depth : register(u10); +RWTexture2D regular4x_normal : register(u11); +RWTexture2D regular8x_depth : register(u12); +RWTexture2D regular8x_normal : register(u13); +RWTexture2D regular16x_depth : register(u14); +RWTexture2D regular16x_normal : register(u15); + +groupshared float shared_depths[256]; +groupshared float2 shared_normals[256]; +groupshared float3 shared_colors[256]; + +[numthreads(8, 8, 1)] +void main(uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 GTid : SV_GroupThreadID, uint3 DTid : SV_DispatchThreadID) +{ + uint2 dim; + texture_depth.GetDimensions(dim.x, dim.y); + const float2 dim_rcp = rcp(dim); + + uint2 startST = Gid.xy << 4 | GTid.xy; + uint destIdx = GTid.y << 4 | GTid.x; + shared_depths[destIdx + 0] = texture_depth[min(startST | uint2(0, 0), dim - 1)]; + shared_depths[destIdx + 8] = texture_depth[min(startST | uint2(8, 0), dim - 1)]; + shared_depths[destIdx + 128] = texture_depth[min(startST | uint2(0, 8), dim - 1)]; + shared_depths[destIdx + 136] = texture_depth[min(startST | uint2(8, 8), dim - 1)]; + + shared_normals[destIdx + 0] = 
texture_normal[min(startST | uint2(0, 0), dim - 1)]; + shared_normals[destIdx + 8] = texture_normal[min(startST | uint2(8, 0), dim - 1)]; + shared_normals[destIdx + 128] = texture_normal[min(startST | uint2(0, 8), dim - 1)]; + shared_normals[destIdx + 136] = texture_normal[min(startST | uint2(8, 8), dim - 1)]; + + const float2 uv0 = float2(startST | uint2(0, 0)) * dim_rcp; + const float2 uv1 = float2(startST | uint2(8, 0)) * dim_rcp; + const float2 uv2 = float2(startST | uint2(0, 8)) * dim_rcp; + const float2 uv3 = float2(startST | uint2(8, 8)) * dim_rcp; + const float2 velocity0 = texture_velocity[min(startST | uint2(0, 0), dim - 1)]; + const float2 velocity1 = texture_velocity[min(startST | uint2(8, 0), dim - 1)]; + const float2 velocity2 = texture_velocity[min(startST | uint2(0, 8), dim - 1)]; + const float2 velocity3 = texture_velocity[min(startST | uint2(8, 8), dim - 1)]; + const float2 prevUV0 = uv0 + velocity0; + const float2 prevUV1 = uv1 + velocity1; + const float2 prevUV2 = uv2 + velocity2; + const float2 prevUV3 = uv3 + velocity3; + shared_colors[destIdx + 0] = texture_input.SampleLevel(sampler_linear_clamp, prevUV0, 0); + shared_colors[destIdx + 8] = texture_input.SampleLevel(sampler_linear_clamp, prevUV1, 0); + shared_colors[destIdx + 128] = texture_input.SampleLevel(sampler_linear_clamp, prevUV2, 0); + shared_colors[destIdx + 136] = texture_input.SampleLevel(sampler_linear_clamp, prevUV3, 0); + + GroupMemoryBarrierWithGroupSync(); + + uint ldsIndex = (GTid.x << 1) | (GTid.y << 5); + + float depth = shared_depths[ldsIndex]; + float2 normal = shared_normals[ldsIndex]; + float3 color = shared_colors[ldsIndex]; + + color = color - 0.2; // cut out pixels that shouldn't act as lights + color *= 0.9; // accumulation energy loss + color = max(0, color); + + uint2 st = DTid.xy; + uint slice = flatten2D(st % 4, 4); + atlas2x_depth[uint3(st >> 2, slice)] = depth; + atlas2x_color[uint3(st >> 2, slice)] = color; + regular2x_depth[st] = depth; + 
regular2x_normal[st] = normal; + + if (all((GTid.xy % 2) == 0)) // per-component test: only the thread whose group-local x and y are both even writes the 4x targets, so each destination texel has exactly one writer + { + st = DTid.xy >> 1; + slice = flatten2D(st % 4, 4); + atlas4x_depth[uint3(st >> 2, slice)] = depth; + atlas4x_color[uint3(st >> 2, slice)] = color; + regular4x_depth[st] = depth; + regular4x_normal[st] = normal; + + if (all((GTid.xy % 4) == 0)) // per-component test: only the thread whose group-local x and y are both multiples of 4 writes the 8x targets + { + st = DTid.xy >> 2; + slice = flatten2D(st % 4, 4); + atlas8x_depth[uint3(st >> 2, slice)] = depth; + atlas8x_color[uint3(st >> 2, slice)] = color; + regular8x_depth[st] = depth; + regular8x_normal[st] = normal; + + if (groupIndex == 0) + { + st = DTid.xy >> 3; + slice = flatten2D(st % 4, 4); + atlas16x_depth[uint3(st >> 2, slice)] = depth; + atlas16x_color[uint3(st >> 2, slice)] = color; + regular16x_depth[st] = depth; + regular16x_normal[st] = normal; + } + } + } + +} diff --git a/WickedEngine/shaders/ssgi_upsampleCS.hlsl b/WickedEngine/shaders/ssgi_upsampleCS.hlsl new file mode 100644 index 000000000..8d0c9b44d --- /dev/null +++ b/WickedEngine/shaders/ssgi_upsampleCS.hlsl @@ -0,0 +1,82 @@ +#include "globals.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2D input_depth_low : register(t0); +Texture2D input_normal_low : register(t1); +Texture2D input_diffuse_low : register(t2); +Texture2D input_depth_high : register(t3); +Texture2D input_normal_high : register(t4); + +RWTexture2D output : register(u0); + +static const float depthThreshold = 0.1; +static const float normalThreshold = 64; + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) +{ + uint2 GTid = remap_lane_8x8(groupIndex); + uint2 pixel = Gid * POSTPROCESS_BLOCKSIZE + GTid; + const float2 uv = (pixel + 0.5) * postprocess.resolution_rcp; + + const float depth = input_depth_high[pixel]; + const float linearDepth = compute_lineardepth(depth); + const float3 N = decode_oct(input_normal_high[pixel].rg); + +#if 1 + const float3 P = 
reconstruct_position(uv, depth); + const float3 ddxP = P - QuadReadAcrossX(P); + const float3 ddyP = P - QuadReadAcrossY(P); + const float curve = saturate(1 - pow(1 - max(dot(ddxP, ddxP), dot(ddyP, ddyP)), 32)); + const float normalPow = lerp(normalThreshold, 1, curve); +#else + const float normalPow = normalThreshold; +#endif + + float3 result = 0; + float sum = 0; +#if 1 + const int range = int(postprocess.params0.x); + const float spread = postprocess.params0.y; +#else + const int range = 1; + const float spread = 8; +#endif + for(int x = -range; x <= range; ++x) + { + for(int y = -range; y <= range; ++y) + { + const float2 offset = float2(x, y) * spread * postprocess.resolution_rcp; + const float2 sample_uv = uv + offset; + + const float3 sampleDiffuse = input_diffuse_low.SampleLevel(sampler_linear_clamp, sample_uv, 0).rgb; + + const float sampleDepth = input_depth_low.SampleLevel(sampler_point_clamp, sample_uv, 0); + const float sampleLinearDepth = compute_lineardepth(sampleDepth); + float bilateralDepthWeight = 1 - saturate(abs(sampleLinearDepth - linearDepth) * depthThreshold); + + const float3 sampleN = decode_oct(input_normal_low.SampleLevel(sampler_linear_clamp, sample_uv, 0)); + float normalError = pow(saturate(dot(sampleN, N)), normalPow); + float bilateralNormalWeight = normalError; + + float weight = bilateralDepthWeight * bilateralNormalWeight; + + //weight = 1; + result += sampleDiffuse * weight; + sum += weight; + } + } + + if(sum > 0) + { + result /= sum; + } + + result = max(0, result); + + output[pixel] = output[pixel] + float4(result, 1); + //output[pixel] = float4(curve.xxx, 1); +} diff --git a/WickedEngine/shaders/ssr_bilateralCS.hlsl b/WickedEngine/shaders/ssr_bilateralCS.hlsl deleted file mode 100644 index b51100e2b..000000000 --- a/WickedEngine/shaders/ssr_bilateralCS.hlsl +++ /dev/null @@ -1,104 +0,0 @@ -#include "globals.hlsli" -#include "stochasticSSRHF.hlsli" -#include "ShaderInterop_Postprocess.h" - -PUSHCONSTANT(postprocess, 
PostProcess); - -Texture2D texture_temporal : register(t0); -Texture2D texture_resolve_variance : register(t1); - -RWTexture2D output : register(u0); - -static const float depthThreshold = 10000.0; -static const float normalThreshold = 1.0; -static const float varianceEstimateThreshold = 0.015; // Larger variance values use stronger blur -static const float varianceExitThreshold = 0.005; // Variance needs to be higher than this value to accept blur -static const uint2 bilateralMinMaxRadius = uint2(0, 2); // Chosen by variance - -#define BILATERAL_SIGMA 0.9 - -[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] -void main(uint3 DTid : SV_DispatchThreadID) -{ -#if 0 // Debug - output[DTid.xy] = float4((texture_resolve_variance[DTid.xy] > varianceEstimateThreshold).rrr, 1.0); - return; -#endif - - const float depth = texture_depth[DTid.xy]; - const float roughness = texture_roughness[DTid.xy]; - - if (!NeedReflection(roughness, depth, ssr_roughness_cutoff)) - { - output[DTid.xy] = texture_temporal[DTid.xy]; - return; - } - - float2 direction = postprocess.params0.xy; - - const float linearDepth = texture_lineardepth[DTid.xy]; - const float3 N = decode_oct(texture_normal[DTid.xy]); - - float4 outputColor = texture_temporal[DTid.xy]; - - - float variance = texture_resolve_variance[DTid.xy]; - bool strongBlur = variance > varianceEstimateThreshold; - - float radius = strongBlur ? 
bilateralMinMaxRadius.y : bilateralMinMaxRadius.x; - radius = lerp(0.0, radius, saturate(roughness * 8.0)); // roughness 0.125 is destination - - float sigma = radius * BILATERAL_SIGMA; - int effectiveRadius = min(sigma * 2.0, radius); - - if (variance > varianceExitThreshold && effectiveRadius > 0) - { - float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; - float3 P = reconstruct_position(uv, depth); - - float4 result = 0; - float weightSum = 0.0f; - - for (int r = -effectiveRadius; r <= effectiveRadius; r++) - { - const int2 sampleCoord = DTid.xy + (direction * r); // Left to right diameter directionally - - if (all(and(sampleCoord >= int2(0, 0), sampleCoord < (int2) postprocess.resolution))) - { - const float sampleDepth = texture_depth[sampleCoord]; - const float4 sampleColor = texture_temporal[sampleCoord]; - - const float3 sampleN = decode_oct(texture_normal[sampleCoord]); - const float sampleRoughness = texture_roughness[sampleCoord]; - - float2 sampleUV = (sampleCoord + 0.5) * postprocess.resolution_rcp; - float3 sampleP = reconstruct_position(sampleUV, sampleDepth); - - // Don't let invalid roughness samples interfere - if (NeedReflection(sampleRoughness, sampleDepth, ssr_roughness_cutoff)) - { - float3 dq = P - sampleP; - float planeError = max(abs(dot(dq, sampleN)), abs(dot(dq, N))); - float relativeDepthDifference = planeError / (linearDepth * GetCamera().z_far); - float bilateralDepthWeight = exp(-sqr(relativeDepthDifference) * depthThreshold); - - float normalError = pow(saturate(dot(sampleN, N)), 4.0); - float bilateralNormalWeight = saturate(1.0 - (1.0 - normalError) * normalThreshold); - - float bilateralWeight = bilateralDepthWeight * bilateralNormalWeight; - - float gaussian = exp(-sqr(r / sigma)); - float weight = (r == 0) ? 
1.0 : gaussian * bilateralWeight; // Skip center gaussian peak - - result += sampleColor * weight; - weightSum += weight; - } - } - } - - result /= weightSum; - outputColor = result; - } - - output[DTid.xy] = outputColor; -} diff --git a/WickedEngine/shaders/ssr_resolveCS.hlsl b/WickedEngine/shaders/ssr_resolveCS.hlsl index 9c4a88543..fec2d54b2 100644 --- a/WickedEngine/shaders/ssr_resolveCS.hlsl +++ b/WickedEngine/shaders/ssr_resolveCS.hlsl @@ -87,14 +87,13 @@ uint3 hash33(uint3 x) void main(uint3 DTid : SV_DispatchThreadID) { const float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; - const uint2 tracingCoord = DTid.xy / 2; - const float depth = texture_depth[DTid.xy]; - const float roughness = texture_roughness[DTid.xy]; + const float depth = texture_depth[DTid.xy * 2]; + const float roughness = texture_roughness[DTid.xy * 2]; if (!NeedReflection(roughness, depth, ssr_roughness_cutoff)) { - texture_resolve[DTid.xy] = texture_rayIndirectSpecular[tracingCoord]; + texture_resolve[DTid.xy] = texture_rayIndirectSpecular[DTid.xy]; texture_resolve_variance[DTid.xy] = 0.0; texture_reprojectionDepth[DTid.xy] = 0.0; return; @@ -102,7 +101,7 @@ void main(uint3 DTid : SV_DispatchThreadID) // Everthing in world space: const float3 P = reconstruct_position(uv, depth); - const float3 N = decode_oct(texture_normal[DTid.xy]); + const float3 N = decode_oct(texture_normal[DTid.xy * 2]); const float3 V = normalize(GetCamera().position - P); const float NdotV = saturate(dot(N, V)); @@ -123,16 +122,15 @@ void main(uint3 DTid : SV_DispatchThreadID) for (int i = 0; i < sampleCount; i++) { float2 offset = (hammersley2d_random(i, sampleCount, random) - 0.5) * resolveSpatialSize; - - int2 neighborTracingCoord = tracingCoord + offset; + int2 neighborCoord = DTid.xy + offset; - float neighborDepth = texture_depth[neighborCoord]; + float neighborDepth = texture_depth[neighborCoord * 2]; if (neighborDepth > 0.0) { - float weight = GetWeight(neighborTracingCoord, V, N, roughness, 
NdotV); + float weight = GetWeight(neighborCoord, V, N, roughness, NdotV); - float4 sampleColor = texture_rayIndirectSpecular[neighborTracingCoord]; + float4 sampleColor = texture_rayIndirectSpecular[neighborCoord]; sampleColor.rgb *= rcp(1 + Luminance(sampleColor.rgb)); result += sampleColor * weight; @@ -142,7 +140,7 @@ void main(uint3 DTid : SV_DispatchThreadID) if (weight > 0.001) { - float neighborRayLength = texture_rayLength[neighborTracingCoord]; + float neighborRayLength = texture_rayLength[neighborCoord]; closestRayLength = max(closestRayLength, neighborRayLength); } } @@ -155,7 +153,7 @@ void main(uint3 DTid : SV_DispatchThreadID) float resolveVariance = S / weightSum; // Convert to post-projection depth so we can construct dual source reprojection buffers later - const float lineardepth = texture_lineardepth[DTid.xy] * GetCamera().z_far; + const float lineardepth = texture_lineardepth[DTid.xy * 2] * GetCamera().z_far; float reprojectionDepth = compute_inverse_lineardepth(lineardepth + closestRayLength, GetCamera().z_near, GetCamera().z_far); texture_resolve[DTid.xy] = max(result, 0.00001f); diff --git a/WickedEngine/shaders/ssr_temporalCS.hlsl b/WickedEngine/shaders/ssr_temporalCS.hlsl index 3da41e2b9..e968e99a5 100644 --- a/WickedEngine/shaders/ssr_temporalCS.hlsl +++ b/WickedEngine/shaders/ssr_temporalCS.hlsl @@ -141,8 +141,8 @@ void main(uint3 Gid : SV_GroupID, uint3 GTid : SV_GroupThreadID, uint3 DTid : SV return; } - const float depth = texture_depth[DTid.xy]; - const float roughness = texture_roughness[DTid.xy]; + const float depth = texture_depth[DTid.xy * 2]; + const float roughness = texture_roughness[DTid.xy * 2]; if (!NeedReflection(roughness, depth, ssr_roughness_cutoff)) { @@ -177,7 +177,7 @@ void main(uint3 Gid : SV_GroupID, uint3 GTid : SV_GroupThreadID, uint3 DTid : SV // Secondary reprojection based on ray lengths: // https://www.ea.com/seed/news/seed-dd18-presentation-slides-raytracing (Slide 45) - float2 velocity = 
texture_velocity[DTid.xy]; + float2 velocity = texture_velocity[DTid.xy * 2]; float reprojectionDepth = texture_reprojectionDepth[DTid.xy]; float2 uv = (DTid.xy + 0.5f) * postprocess.resolution_rcp; diff --git a/WickedEngine/shaders/ssr_upsampleCS.hlsl b/WickedEngine/shaders/ssr_upsampleCS.hlsl new file mode 100644 index 000000000..addabdd6f --- /dev/null +++ b/WickedEngine/shaders/ssr_upsampleCS.hlsl @@ -0,0 +1,101 @@ +#include "globals.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2D texture_temporal : register(t0); +Texture2D texture_resolve_variance : register(t1); + +RWTexture2D output : register(u0); + +static const float depthThreshold = 10000.0; +static const float normalThreshold = 1.0; +static const float varianceEstimateThreshold = 0.015; // Larger variance values use stronger blur +static const float varianceExitThreshold = 0.005; // Variance needs to be higher than this value to accept blur +static const uint2 bilateralMinMaxRadius = uint2(0, 2); // Chosen by variance + +#define BILATERAL_SIGMA 0.9 + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ + const float depth = texture_depth[DTid.xy]; + const float roughness = texture_roughness[DTid.xy]; + + if (!NeedReflection(roughness, depth, ssr_roughness_cutoff)) + { + output[DTid.xy] = 0; + return; + } + + const float linearDepth = texture_lineardepth[DTid.xy]; + const float3 N = decode_oct(texture_normal[DTid.xy]); + + const float2 uv = (DTid.xy + 0.5) * postprocess.resolution_rcp; + float4 outputColor = texture_temporal.SampleLevel(sampler_linear_clamp, uv, 0); + + float variance = texture_resolve_variance.SampleLevel(sampler_linear_clamp, uv, 0); + bool strongBlur = variance > varianceEstimateThreshold; + + float radius = strongBlur ? 
bilateralMinMaxRadius.y : bilateralMinMaxRadius.x; + radius = lerp(0.0, radius, saturate(roughness * 8.0)); // roughness 0.125 is destination + + float sigma = radius * BILATERAL_SIGMA; + int effectiveRadius = min(sigma * 2.0, radius); + + if (variance > varianceExitThreshold && effectiveRadius > 0) + { + float3 P = reconstruct_position(uv, depth); + + float4 result = 0; + float weightSum = 0.0f; + + for(uint d = 0; d < 2; ++d) + { + const int2 direction = d < 1 ? int2(1, 0) : int2(0, 1); + for (int r = -effectiveRadius; r <= effectiveRadius; r++) + { + const int2 sampleCoord = DTid.xy + (direction * r); // Left to right diameter directionally + + if (all(and(sampleCoord >= int2(0, 0), sampleCoord < (int2) postprocess.resolution))) + { + const float sampleDepth = texture_depth[sampleCoord]; + + float2 sampleUV = (sampleCoord + 0.5) * postprocess.resolution_rcp; + const float4 sampleColor = texture_temporal.SampleLevel(sampler_linear_clamp, sampleUV, 0); + + const float3 sampleN = decode_oct(texture_normal[sampleCoord]); + const float sampleRoughness = texture_roughness[sampleCoord]; + + float3 sampleP = reconstruct_position(sampleUV, sampleDepth); + + // Don't let invalid roughness samples interfere + if (NeedReflection(sampleRoughness, sampleDepth, ssr_roughness_cutoff)) + { + float3 dq = P - sampleP; + float planeError = max(abs(dot(dq, sampleN)), abs(dot(dq, N))); + float relativeDepthDifference = planeError / (linearDepth * GetCamera().z_far); + float bilateralDepthWeight = exp(-sqr(relativeDepthDifference) * depthThreshold); + + float normalError = pow(saturate(dot(sampleN, N)), 4.0); + float bilateralNormalWeight = saturate(1.0 - (1.0 - normalError) * normalThreshold); + + float bilateralWeight = bilateralDepthWeight * bilateralNormalWeight; + + float gaussian = exp(-sqr(r / sigma)); + float weight = (r == 0) ? 
1.0 : gaussian * bilateralWeight; // Skip center gaussian peak + + result += sampleColor * weight; + weightSum += weight; + } + } + } + } + + result /= weightSum; + outputColor = result; + } + + output[DTid.xy] = outputColor; +} diff --git a/WickedEngine/shaders/surfaceHF.hlsli b/WickedEngine/shaders/surfaceHF.hlsli index 18a68de1f..a84c1888f 100644 --- a/WickedEngine/shaders/surfaceHF.hlsli +++ b/WickedEngine/shaders/surfaceHF.hlsli @@ -2,8 +2,6 @@ #define WI_SURFACE_HF #include "globals.hlsli" -#define max3(v) max(max(v.x, v.y), v.z) - // hard coded value for surfaces with simplified lighting: // occlusion = 1 // roughness = 1 @@ -100,6 +98,7 @@ struct Surface float hit_depth; float3 gi; float3 bumpColor; + float3 ssgi; // These will be computed when calling Update(): float NdotV; // cos(angle between normal and view vector) @@ -147,6 +146,7 @@ struct Surface flags = 0; gi = 0; bumpColor = 0; + ssgi = 0; uid_validate = 0; hit_depth = 0; diff --git a/WickedEngine/shaders/visibility_shadeCS.hlsl b/WickedEngine/shaders/visibility_shadeCS.hlsl index 97e296992..c8aa017b5 100644 --- a/WickedEngine/shaders/visibility_shadeCS.hlsl +++ b/WickedEngine/shaders/visibility_shadeCS.hlsl @@ -113,6 +113,11 @@ void main(uint Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) lighting.indirect.specular = lerp(lighting.indirect.specular, ssr.rgb * surface.F, ssr.a); } [branch] + if (GetCamera().texture_ssgi_index >= 0) + { + surface.ssgi = bindless_textures[GetCamera().texture_ssgi_index].SampleLevel(sampler_linear_clamp, surface.screenUV, 0).rgb; + } + [branch] if (GetCamera().texture_ao_index >= 0) { surface.occlusion *= bindless_textures_float[GetCamera().texture_ao_index].SampleLevel(sampler_linear_clamp, surface.screenUV, 0).r; diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h index baa4bce50..6e74bd700 100644 --- a/WickedEngine/wiEnums.h +++ b/WickedEngine/wiEnums.h @@ -296,10 +296,14 @@ namespace wi::enums CSTYPE_POSTPROCESS_MSAO_BLURUPSAMPLE_PREMIN, 
CSTYPE_POSTPROCESS_MSAO_BLURUPSAMPLE_PREMIN_BLENDOUT, CSTYPE_POSTPROCESS_RTREFLECTION, + CSTYPE_POSTPROCESS_SSGI_DEINTERLEAVE, + CSTYPE_POSTPROCESS_SSGI, + CSTYPE_POSTPROCESS_SSGI_WIDE, + CSTYPE_POSTPROCESS_SSGI_UPSAMPLE, CSTYPE_POSTPROCESS_RTDIFFUSE, CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL, CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL, - CSTYPE_POSTPROCESS_RTDIFFUSE_BILATERAL, + CSTYPE_POSTPROCESS_RTDIFFUSE_UPSAMPLE, CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_HORIZONTAL, CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_VERTICAL, CSTYPE_POSTPROCESS_SSR_DEPTHHIERARCHY, @@ -308,7 +312,7 @@ namespace wi::enums CSTYPE_POSTPROCESS_SSR_RAYTRACE_CHEAP, CSTYPE_POSTPROCESS_SSR_RESOLVE, CSTYPE_POSTPROCESS_SSR_TEMPORAL, - CSTYPE_POSTPROCESS_SSR_BILATERAL, + CSTYPE_POSTPROCESS_SSR_UPSAMPLE, CSTYPE_POSTPROCESS_LIGHTSHAFTS, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL, diff --git a/WickedEngine/wiGraphicsDevice_DX12.cpp b/WickedEngine/wiGraphicsDevice_DX12.cpp index 780bd2844..3313d57a3 100644 --- a/WickedEngine/wiGraphicsDevice_DX12.cpp +++ b/WickedEngine/wiGraphicsDevice_DX12.cpp @@ -7379,24 +7379,68 @@ using namespace dx12_internal; } void GraphicsDevice_DX12::ClearUAV(const GPUResource* resource, uint32_t value, CommandList cmd) { - auto internal_state = to_internal(resource); - // We cannot clear eg. a StructuredBuffer, so in those cases we must clear the RAW view with uav_raw - const SingleDescriptor& descriptor = internal_state->uav_raw.IsValid() ? 
internal_state->uav_raw : internal_state->uav; - D3D12_GPU_DESCRIPTOR_HANDLE gpu_handle = descriptorheap_res.start_gpu; - gpu_handle.ptr += descriptor.index * resource_descriptor_size; - D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle = descriptor.handle; - const UINT values[4] = { value,value,value,value }; - CommandList_DX12& commandlist = GetCommandList(cmd); - commandlist.GetGraphicsCommandList()->ClearUnorderedAccessViewUint( - gpu_handle, - cpu_handle, - internal_state->resource.Get(), - values, - 0, - nullptr - ); + auto internal_state = to_internal(resource); + if (internal_state->uav_raw.IsValid()) + { + // We cannot clear eg. a StructuredBuffer, so in those cases we must clear the RAW view with uav_raw + const SingleDescriptor& descriptor = internal_state->uav_raw; + D3D12_GPU_DESCRIPTOR_HANDLE gpu_handle = descriptorheap_res.start_gpu; + gpu_handle.ptr += descriptor.index * resource_descriptor_size; + D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle = descriptor.handle; + + CommandList_DX12& commandlist = GetCommandList(cmd); + commandlist.GetGraphicsCommandList()->ClearUnorderedAccessViewUint( + gpu_handle, + cpu_handle, + internal_state->resource.Get(), + values, + 0, + nullptr + ); + } + else + { + if (internal_state->subresources_uav.empty()) + { + const SingleDescriptor& descriptor = internal_state->uav; + D3D12_GPU_DESCRIPTOR_HANDLE gpu_handle = descriptorheap_res.start_gpu; + gpu_handle.ptr += descriptor.index * resource_descriptor_size; + D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle = descriptor.handle; + + CommandList_DX12& commandlist = GetCommandList(cmd); + commandlist.GetGraphicsCommandList()->ClearUnorderedAccessViewUint( + gpu_handle, + cpu_handle, + internal_state->resource.Get(), + values, + 0, + nullptr + ); + } + else + { + // This is clearing every subresource (for example every mip since they can't be referenced by single UAV) + for (auto& uav : internal_state->subresources_uav) + { + const SingleDescriptor& descriptor = uav; + D3D12_GPU_DESCRIPTOR_HANDLE 
gpu_handle = descriptorheap_res.start_gpu; + gpu_handle.ptr += descriptor.index * resource_descriptor_size; + D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle = descriptor.handle; + + CommandList_DX12& commandlist = GetCommandList(cmd); + commandlist.GetGraphicsCommandList()->ClearUnorderedAccessViewUint( + gpu_handle, + cpu_handle, + internal_state->resource.Get(), + values, + 0, + nullptr + ); + } + } + } } void GraphicsDevice_DX12::VideoDecode(const VideoDecoder* video_decoder, const VideoDecodeOperation* op, CommandList cmd) { diff --git a/WickedEngine/wiRenderPath3D.cpp b/WickedEngine/wiRenderPath3D.cpp index ccee5404d..be2ad8ba3 100644 --- a/WickedEngine/wiRenderPath3D.cpp +++ b/WickedEngine/wiRenderPath3D.cpp @@ -192,7 +192,7 @@ namespace wi assert(subresource_index == i); } - clearableTextures.push_back(&rtSceneCopy); // because this is used by SSR before it gets a chance to be normally rendered, it MUST be cleared! + clearableTextures.push_back(&rtSceneCopy); // because this is used by SSR and SSGI before it gets a chance to be normally rendered, it MUST be cleared! 
} { TextureDesc desc; @@ -316,6 +316,7 @@ namespace wi // These can trigger resource creations if needed: setAO(ao); setSSREnabled(ssrEnabled); + setSSGIEnabled(ssgiEnabled); setRaytracedReflectionsEnabled(raytracedReflectionsEnabled); setRaytracedDiffuseEnabled(raytracedDiffuseEnabled); setFSREnabled(fsrEnabled); @@ -446,6 +447,10 @@ namespace wi { rtSSR = {}; } + if (!getSSGIEnabled()) + { + rtSSGI = {}; + } if (!getRaytracedDiffuseEnabled()) { rtRaytracedDiffuse = {}; @@ -557,6 +562,7 @@ namespace wi getMotionBlurEnabled() || wi::renderer::GetTemporalAAEnabled() || getSSREnabled() || + getSSGIEnabled() || getRaytracedReflectionEnabled() || getRaytracedDiffuseEnabled() || wi::renderer::GetRaytracedShadowsEnabled() || @@ -616,6 +622,7 @@ namespace wi if ( visibility_shading_in_compute || getSSREnabled() || + getSSGIEnabled() || getRaytracedReflectionEnabled() || getRaytracedDiffuseEnabled() || wi::renderer::GetScreenSpaceShadowsEnabled() || @@ -704,6 +711,7 @@ namespace wi camera->texture_waterriples_index = device->GetDescriptorIndex(&rtWaterRipple, SubresourceType::SRV); camera->texture_ao_index = device->GetDescriptorIndex(&rtAO, SubresourceType::SRV); camera->texture_ssr_index = device->GetDescriptorIndex(&rtSSR, SubresourceType::SRV); + camera->texture_ssgi_index = device->GetDescriptorIndex(&rtSSGI, SubresourceType::SRV); camera->texture_rtshadow_index = device->GetDescriptorIndex(&rtShadow, SubresourceType::SRV); camera->texture_rtdiffuse_index = device->GetDescriptorIndex(&rtRaytracedDiffuse, SubresourceType::SRV); camera->texture_surfelgi_index = device->GetDescriptorIndex(&surfelGIResources.result, SubresourceType::SRV); @@ -738,6 +746,7 @@ namespace wi camera_reflection.texture_waterriples_index = -1; camera_reflection.texture_ao_index = -1; camera_reflection.texture_ssr_index = -1; + camera_reflection.texture_ssgi_index = -1; camera_reflection.texture_rtshadow_index = -1; camera_reflection.texture_rtdiffuse_index = -1; 
camera_reflection.texture_surfelgi_index = -1; @@ -1023,6 +1032,7 @@ namespace wi } else if ( getSSREnabled() || + getSSGIEnabled() || getRaytracedReflectionEnabled() || getRaytracedDiffuseEnabled() || wi::renderer::GetScreenSpaceShadowsEnabled() || @@ -1068,6 +1078,8 @@ namespace wi RenderSSR(cmd); + RenderSSGI(cmd); + if (wi::renderer::GetScreenSpaceShadowsEnabled()) { wi::renderer::Postprocess_ScreenSpaceShadow( @@ -1727,6 +1739,20 @@ namespace wi ); } } + void RenderPath3D::RenderSSGI(CommandList cmd) const + { + if (getSSGIEnabled()) + { + wi::renderer::Postprocess_SSGI( + ssgiResources, + rtSceneCopy, + depthBuffer_Copy, + visibilityResources.texture_normals, + rtSSGI, + cmd + ); + } + } void RenderPath3D::RenderOutline(CommandList cmd) const { if (getOutlineEnabled()) @@ -2386,6 +2412,33 @@ namespace wi ssrResources = {}; } } + void RenderPath3D::setSSGIEnabled(bool value) + { + ssgiEnabled = value; + + if (value) + { + GraphicsDevice* device = wi::graphics::GetDevice(); + XMUINT2 internalResolution = GetInternalResolution(); + if (internalResolution.x == 0 || internalResolution.y == 0) + return; + + TextureDesc desc; + desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; + desc.format = Format::R16G16B16A16_FLOAT; + desc.width = internalResolution.x; + desc.height = internalResolution.y; + desc.layout = ResourceState::SHADER_RESOURCE_COMPUTE; + device->CreateTexture(&desc, nullptr, &rtSSGI); + device->SetName(&rtSSGI, "rtSSGI"); + + wi::renderer::CreateSSGIResources(ssgiResources, internalResolution); + } + else + { + ssgiResources = {}; + } + } void RenderPath3D::setRaytracedReflectionsEnabled(bool value) { raytracedReflectionsEnabled = value; diff --git a/WickedEngine/wiRenderPath3D.h b/WickedEngine/wiRenderPath3D.h index de8834e68..961063f27 100644 --- a/WickedEngine/wiRenderPath3D.h +++ b/WickedEngine/wiRenderPath3D.h @@ -60,6 +60,7 @@ namespace wi AO ao = AO_DISABLED; bool fxaaEnabled = false; bool ssrEnabled = false; + bool 
ssgiEnabled = false; bool raytracedReflectionsEnabled = false; bool raytracedDiffuseEnabled = false; bool reflectionsEnabled = true; @@ -94,6 +95,7 @@ namespace wi wi::graphics::Texture rtReflection; // contains the scene rendered for planar reflections wi::graphics::Texture rtRaytracedDiffuse; // raytraced diffuse screen space texture wi::graphics::Texture rtSSR; // standard screen-space reflection results + wi::graphics::Texture rtSSGI; // standard screen-space GI results wi::graphics::Texture rtSceneCopy; // contains the rendered scene that can be fed into transparent pass for distortion effect wi::graphics::Texture rtSceneCopy_tmp; // temporary for gaussian mipchain wi::graphics::Texture rtWaterRipple; // water ripple sprite normal maps are rendered into this @@ -129,6 +131,7 @@ namespace wi wi::renderer::RTDiffuseResources rtdiffuseResources; wi::renderer::RTReflectionResources rtreflectionResources; wi::renderer::SSRResources ssrResources; + wi::renderer::SSGIResources ssgiResources; wi::renderer::RTShadowResources rtshadowResources; wi::renderer::ScreenSpaceShadowResources screenspaceshadowResources; wi::renderer::DepthOfFieldResources depthoffieldResources; @@ -156,6 +159,7 @@ namespace wi virtual void RenderAO(wi::graphics::CommandList cmd) const; virtual void RenderSSR(wi::graphics::CommandList cmd) const; + virtual void RenderSSGI(wi::graphics::CommandList cmd) const; virtual void RenderOutline(wi::graphics::CommandList cmd) const; virtual void RenderLightShafts(wi::graphics::CommandList cmd) const; virtual void RenderVolumetrics(wi::graphics::CommandList cmd) const; @@ -236,6 +240,7 @@ namespace wi constexpr bool getAOEnabled() const { return ao != AO_DISABLED; } constexpr AO getAO() const { return ao; } constexpr bool getSSREnabled() const { return ssrEnabled; } + constexpr bool getSSGIEnabled() const { return ssgiEnabled; } constexpr bool getRaytracedDiffuseEnabled() const { return raytracedDiffuseEnabled; } constexpr bool 
getRaytracedReflectionEnabled() const { return raytracedReflectionsEnabled; } constexpr bool getShadowsEnabled() const { return shadowsEnabled; } @@ -289,6 +294,7 @@ namespace wi void setAO(AO value); void setSSREnabled(bool value); + void setSSGIEnabled(bool value); void setRaytracedReflectionsEnabled(bool value); void setRaytracedDiffuseEnabled(bool value); void setMotionBlurEnabled(bool value); diff --git a/WickedEngine/wiRenderPath3D_BindLua.cpp b/WickedEngine/wiRenderPath3D_BindLua.cpp index 5bdb4c67a..24fb6fc8f 100644 --- a/WickedEngine/wiRenderPath3D_BindLua.cpp +++ b/WickedEngine/wiRenderPath3D_BindLua.cpp @@ -27,6 +27,7 @@ namespace wi::lua lunamethod(RenderPath3D_BindLua, SetAO), lunamethod(RenderPath3D_BindLua, SetAOPower), lunamethod(RenderPath3D_BindLua, SetSSREnabled), + lunamethod(RenderPath3D_BindLua, SetSSGIEnabled), lunamethod(RenderPath3D_BindLua, SetRaytracedDiffuseEnabled), lunamethod(RenderPath3D_BindLua, SetRaytracedReflectionsEnabled), lunamethod(RenderPath3D_BindLua, SetShadowsEnabled), @@ -135,6 +136,19 @@ namespace wi::lua wi::lua::SError(L, "SetSSREnabled(bool value) not enough arguments!"); return 0; } + int RenderPath3D_BindLua::SetSSGIEnabled(lua_State* L) + { + if (component == nullptr) + { + wi::lua::SError(L, "SetSSGIEnabled(bool value) component is null!"); + return 0; + } + if (wi::lua::SGetArgCount(L) > 0) + ((RenderPath3D*)component)->setSSGIEnabled(wi::lua::SGetBool(L, 1)); + else + wi::lua::SError(L, "SetSSGIEnabled(bool value) not enough arguments!"); + return 0; + } int RenderPath3D_BindLua::SetRaytracedDiffuseEnabled(lua_State* L) { if (component == nullptr) diff --git a/WickedEngine/wiRenderPath3D_BindLua.h b/WickedEngine/wiRenderPath3D_BindLua.h index f0edcdd5a..ab66cded7 100644 --- a/WickedEngine/wiRenderPath3D_BindLua.h +++ b/WickedEngine/wiRenderPath3D_BindLua.h @@ -33,6 +33,7 @@ namespace wi::lua int SetAO(lua_State* L); int SetAOPower(lua_State* L); int SetSSREnabled(lua_State* L); + int SetSSGIEnabled(lua_State* 
L); int SetRaytracedDiffuseEnabled(lua_State* L); int SetRaytracedReflectionsEnabled(lua_State* L); int SetShadowsEnabled(lua_State* L); diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index 3563aa00f..140d44398 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -983,7 +983,7 @@ void LoadShaders() wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_RAYTRACE_CHEAP], "ssr_raytraceCS_cheap.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_RESOLVE], "ssr_resolveCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_TEMPORAL], "ssr_temporalCS.cso"); }); - wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_BILATERAL], "ssr_bilateralCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSR_UPSAMPLE], "ssr_upsampleCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_LIGHTSHAFTS], "lightShaftsCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL], "depthoffield_tileMaxCOC_horizontalCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL], "depthoffield_tileMaxCOC_verticalCS.cso"); }); @@ -1040,13 +1040,17 @@ void LoadShaders() wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_LINEARDEPTH], "lineardepthCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { 
LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_NORMALSFROMDEPTH], "normalsfromdepthCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SCREENSPACESHADOW], "screenspaceshadowCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_DEINTERLEAVE], "ssgi_deinterleaveCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI], "ssgiCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_WIDE], "ssgiCS.cso", wi::graphics::ShaderModel::SM_5_0, { "WIDE" }); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], "ssgi_upsampleCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL], "rtdiffuse_spatialCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL], "rtdiffuse_temporalCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_UPSAMPLE], "rtdiffuse_upsampleCS.cso"); }); if (device->CheckCapability(GraphicsDeviceCapability::RAYTRACING)) { wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE], "rtdiffuseCS.cso", ShaderModel::SM_6_5); }); - wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL], "rtdiffuse_spatialCS.cso"); }); - wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, 
shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL], "rtdiffuse_temporalCS.cso"); }); - wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_BILATERAL], "rtdiffuse_bilateralCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTREFLECTION], "rtreflectionCS.cso", ShaderModel::SM_6_5); }); @@ -9877,6 +9881,7 @@ void BindCameraCB( shadercam.texture_waterriples_index = camera.texture_waterriples_index; shadercam.texture_ao_index = camera.texture_ao_index; shadercam.texture_ssr_index = camera.texture_ssr_index; + shadercam.texture_ssgi_index = camera.texture_ssgi_index; shadercam.texture_rtshadow_index = camera.texture_rtshadow_index; shadercam.texture_rtdiffuse_index = camera.texture_rtdiffuse_index; shadercam.texture_surfelgi_index = camera.texture_surfelgi_index; @@ -12213,11 +12218,6 @@ void CreateRTDiffuseResources(RTDiffuseResources& res, XMUINT2 resolution) device->CreateTexture(&desc, nullptr, &res.texture_spatial_variance); device->CreateTexture(&desc, nullptr, &res.texture_temporal_variance[0]); device->CreateTexture(&desc, nullptr, &res.texture_temporal_variance[1]); - - desc.format = Format::R11G11B10_FLOAT; - desc.width = resolution.x; - desc.height = resolution.y; - device->CreateTexture(&desc, nullptr, &res.texture_bilateral_temp); } void Postprocess_RTDiffuse( const RTDiffuseResources& res, @@ -12244,7 +12244,6 @@ void Postprocess_RTDiffuse( GPUBarrier::Image(&res.texture_rayIndirectDiffuse, res.texture_rayIndirectDiffuse.desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_spatial, res.texture_spatial.desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_spatial_variance, res.texture_spatial_variance.desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_bilateral_temp, res.texture_bilateral_temp.desc.layout, 
ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_temporal[temporal_output], res.texture_temporal[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], res.texture_temporal_variance[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_temporal[temporal_history], res.texture_temporal[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), @@ -12257,7 +12256,6 @@ void Postprocess_RTDiffuse( device->ClearUAV(&res.texture_rayIndirectDiffuse, 0, cmd); device->ClearUAV(&res.texture_spatial, 0, cmd); device->ClearUAV(&res.texture_spatial_variance, 0, cmd); - device->ClearUAV(&res.texture_bilateral_temp, 0, cmd); device->ClearUAV(&res.texture_temporal[temporal_output], 0, cmd); device->ClearUAV(&res.texture_temporal_variance[temporal_output], 0, cmd); if (res.frame == 0) @@ -12272,7 +12270,6 @@ void Postprocess_RTDiffuse( GPUBarrier::Memory(&res.texture_rayIndirectDiffuse), GPUBarrier::Memory(&res.texture_spatial), GPUBarrier::Memory(&res.texture_spatial_variance), - GPUBarrier::Memory(&res.texture_bilateral_temp), GPUBarrier::Memory(&res.texture_temporal[temporal_output]), GPUBarrier::Memory(&res.texture_temporal_variance[temporal_output]), GPUBarrier::Image(&res.texture_temporal[temporal_history], ResourceState::UNORDERED_ACCESS, res.texture_temporal[temporal_output].desc.layout), @@ -12298,7 +12295,7 @@ void Postprocess_RTDiffuse( std::memcpy(&postprocess.params1.x, &instanceInclusionMask, sizeof(instanceInclusionMask)); { - device->EventBegin("RTDiffuse Raytrace pass", cmd); + device->EventBegin("RTDiffuse - ray trace", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_RTDIFFUSE], cmd); @@ -12328,7 +12325,7 @@ void Postprocess_RTDiffuse( // Spatial pass: { - device->EventBegin("RTDiffuse - spatial filter", cmd); + device->EventBegin("RTDiffuse - spatial resolve", cmd); 
device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL], cmd); const GPUResource* resarray[] = { @@ -12362,7 +12359,7 @@ void Postprocess_RTDiffuse( // Temporal pass: { - device->EventBegin("RTDiffuse temporal filter", cmd); + device->EventBegin("RTDiffuse - temporal resolve", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL], cmd); device->PushConstants(&postprocess, sizeof(postprocess), cmd); @@ -12407,61 +12404,489 @@ void Postprocess_RTDiffuse( // Bilateral blur pass: { - device->EventBegin("RTDiffuse - bilateral filter", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_BILATERAL], cmd); + device->EventBegin("RTDiffuse - upsample", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_UPSAMPLE], cmd); + + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + const GPUResource* resarray[] = { + &res.texture_temporal[temporal_output], + &res.texture_temporal_variance[temporal_output], + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &output, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); - // Horizontal: { - postprocess.params0.x = 1; - postprocess.params0.y = 0; - device->PushConstants(&postprocess, sizeof(postprocess), cmd); + GPUBarrier barriers[] = { + GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + device->EventEnd(cmd); + } + + res.frame++; + + wi::profiler::EndRange(profilerRange); + device->EventEnd(cmd); +} +void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution) +{ + TextureDesc desc; + desc.type = TextureDesc::Type::TEXTURE_2D; + desc.bind_flags = BindFlag::SHADER_RESOURCE | 
BindFlag::UNORDERED_ACCESS; + desc.layout = ResourceState::SHADER_RESOURCE_COMPUTE; + desc.format = Format::R11G11B10_FLOAT; + + resolution.x = AlignTo(resolution.x, 64u); + resolution.y = AlignTo(resolution.y, 64u); + + desc.width = (resolution.x + 7) / 8; + desc.height = (resolution.y + 7) / 8; + desc.array_size = 16; + desc.format = Format::R32_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas2x_depth); + desc.format = Format::R11G11B10_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas2x_color); + + desc.width = (resolution.x + 15) / 16; + desc.height = (resolution.y + 15) / 16; + desc.array_size = 16; + desc.format = Format::R32_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas4x_depth); + desc.format = Format::R11G11B10_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas4x_color); + + desc.width = (resolution.x + 31) / 32; + desc.height = (resolution.y + 31) / 32; + desc.array_size = 16; + desc.format = Format::R32_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas8x_depth); + desc.format = Format::R11G11B10_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas8x_color); + + desc.width = (resolution.x + 63) / 64; + desc.height = (resolution.y + 63) / 64; + desc.array_size = 16; + desc.format = Format::R32_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas16x_depth); + desc.format = Format::R11G11B10_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_atlas16x_color); + + desc.array_size = 1; + desc.mip_levels = 4; + desc.width = (resolution.x + 1) / 2; + desc.height = (resolution.y + 1) / 2; + desc.format = Format::R32_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_depth_mips); + desc.format = Format::R16G16_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_normal_mips); + desc.format = Format::R11G11B10_FLOAT; + device->CreateTexture(&desc, nullptr, &res.texture_diffuse_mips); + + for (uint32_t i = 0; i < desc.mip_levels; ++i) 
+ { + int subresource_index; + subresource_index = device->CreateSubresource(&res.texture_depth_mips, SubresourceType::SRV, 0, 1, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_depth_mips, SubresourceType::UAV, 0, 1, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_normal_mips, SubresourceType::SRV, 0, 1, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_normal_mips, SubresourceType::UAV, 0, 1, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_diffuse_mips, SubresourceType::SRV, 0, 1, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&res.texture_diffuse_mips, SubresourceType::UAV, 0, 1, i, 1); + assert(subresource_index == i); + } +} +void Postprocess_SSGI( + const SSGIResources& res, + const Texture& input, + const Texture& input_depth, + const Texture& input_normal, + const Texture& output, + CommandList cmd +) +{ + device->EventBegin("Postprocess_SSGI", cmd); + auto profilerRange = wi::profiler::BeginRangeGPU("SSGI", cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_atlas2x_depth, res.texture_atlas2x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas4x_depth, res.texture_atlas4x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas8x_depth, res.texture_atlas8x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas16x_depth, res.texture_atlas16x_depth.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas2x_color, res.texture_atlas2x_color.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas4x_color, res.texture_atlas4x_color.desc.layout, ResourceState::UNORDERED_ACCESS), + 
GPUBarrier::Image(&res.texture_atlas8x_color, res.texture_atlas8x_color.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_atlas16x_color, res.texture_atlas16x_color.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_depth_mips, res.texture_depth_mips.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_normal_mips, res.texture_normal_mips.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_diffuse_mips, res.texture_diffuse_mips.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + device->ClearUAV(&res.texture_atlas2x_depth, 0, cmd); + device->ClearUAV(&res.texture_atlas4x_depth, 0, cmd); + device->ClearUAV(&res.texture_atlas8x_depth, 0, cmd); + device->ClearUAV(&res.texture_atlas16x_depth, 0, cmd); + device->ClearUAV(&res.texture_atlas2x_color, 0, cmd); + device->ClearUAV(&res.texture_atlas4x_color, 0, cmd); + device->ClearUAV(&res.texture_atlas8x_color, 0, cmd); + device->ClearUAV(&res.texture_atlas16x_color, 0, cmd); + device->ClearUAV(&res.texture_depth_mips, 0, cmd); + device->ClearUAV(&res.texture_normal_mips, 0, cmd); + device->ClearUAV(&res.texture_diffuse_mips, 0, cmd); + device->ClearUAV(&output, 0, cmd); + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(&res.texture_atlas2x_depth), + GPUBarrier::Memory(&res.texture_atlas4x_depth), + GPUBarrier::Memory(&res.texture_atlas8x_depth), + GPUBarrier::Memory(&res.texture_atlas16x_depth), + GPUBarrier::Memory(&res.texture_atlas2x_color), + GPUBarrier::Memory(&res.texture_atlas4x_color), + GPUBarrier::Memory(&res.texture_atlas8x_color), + GPUBarrier::Memory(&res.texture_atlas16x_color), + GPUBarrier::Memory(&res.texture_depth_mips), + GPUBarrier::Memory(&res.texture_normal_mips), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + 
BindCommonResources(cmd); + + PostProcess postprocess = {}; + + { + device->EventBegin("SSGI - deinterleave", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_DEINTERLEAVE], cmd); + + device->BindResource(&input, 0, cmd); + + const GPUResource* uavs[] = { + &res.texture_atlas2x_depth, + &res.texture_atlas4x_depth, + &res.texture_atlas8x_depth, + &res.texture_atlas16x_depth, + &res.texture_atlas2x_color, + &res.texture_atlas4x_color, + &res.texture_atlas8x_color, + &res.texture_atlas16x_color, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 0, cmd, 0); + device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 1, cmd, 0); + device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 2, cmd, 1); + device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 3, cmd, 1); + device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 4, cmd, 2); + device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 5, cmd, 2); + device->BindUAV(&res.texture_depth_mips, arraysize(uavs) + 6, cmd, 3); + device->BindUAV(&res.texture_normal_mips, arraysize(uavs) + 7, cmd, 3); + + const TextureDesc& desc = res.texture_atlas4x_depth.GetDesc(); + device->Dispatch( + desc.width, + desc.height, + 1, + cmd + ); + + device->EventEnd(cmd); + } + + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(&res.texture_diffuse_mips), + GPUBarrier::Image(&res.texture_atlas2x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas2x_depth.desc.layout), + GPUBarrier::Image(&res.texture_atlas4x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas4x_depth.desc.layout), + GPUBarrier::Image(&res.texture_atlas8x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas8x_depth.desc.layout), + GPUBarrier::Image(&res.texture_atlas16x_depth, ResourceState::UNORDERED_ACCESS, res.texture_atlas16x_depth.desc.layout), + GPUBarrier::Image(&res.texture_atlas2x_color, ResourceState::UNORDERED_ACCESS, 
res.texture_atlas2x_color.desc.layout), + GPUBarrier::Image(&res.texture_atlas4x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas4x_color.desc.layout), + GPUBarrier::Image(&res.texture_atlas8x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas8x_color.desc.layout), + GPUBarrier::Image(&res.texture_atlas16x_color, ResourceState::UNORDERED_ACCESS, res.texture_atlas16x_color.desc.layout), + GPUBarrier::Image(&res.texture_depth_mips, ResourceState::UNORDERED_ACCESS, res.texture_depth_mips.desc.layout), + GPUBarrier::Image(&res.texture_normal_mips, ResourceState::UNORDERED_ACCESS, res.texture_normal_mips.desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + { + device->EventBegin("SSGI - diffuse", cmd); + + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI], cmd); + + // 2x: + { const GPUResource* resarray[] = { - &res.texture_temporal[temporal_output], - &res.texture_temporal_variance[temporal_output], + &res.texture_atlas2x_depth, + &res.texture_atlas2x_color, }; device->BindResources(resarray, 0, arraysize(resarray), cmd); + device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 0); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 0); - const GPUResource* uavs[] = { - &res.texture_bilateral_temp, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + const TextureDesc& desc = res.texture_atlas2x_depth.GetDesc(); + + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 1; // range + postprocess.params0.y = 2; // spread + postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 + device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( - (res.texture_bilateral_temp.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / 
POSTPROCESS_BLOCKSIZE, - (res.texture_bilateral_temp.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (desc.width + 7) / 8, + (desc.height + 7) / 8, + 16, + cmd + ); + } + // 4x: + { + const GPUResource* resarray[] = { + &res.texture_atlas4x_depth, + &res.texture_atlas4x_color, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 1); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 1); + + const TextureDesc& desc = res.texture_atlas4x_depth.GetDesc(); + + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 2; // range + postprocess.params0.y = 2; // spread + postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (desc.width + 7) / 8, + (desc.height + 7) / 8, + 16, + cmd + ); + } + + // Switch to wide sampling shader: + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_WIDE], cmd); + + // 8x: + { + const GPUResource* resarray[] = { + &res.texture_atlas8x_depth, + &res.texture_atlas8x_color, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 2); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 2); + + const TextureDesc& desc = res.texture_atlas8x_depth.GetDesc(); + + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 4; // range + postprocess.params0.y = 4; // spread + postprocess.params0.z = std::pow(1.0f / 
(postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (desc.width + 15) / 16, + (desc.height + 15) / 16, + 16, + cmd + ); + } + // 16x: + { + const GPUResource* resarray[] = { + &res.texture_atlas16x_depth, + &res.texture_atlas16x_color, + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + device->BindResource(&res.texture_normal_mips, arraysize(resarray) + 0, cmd, 3); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 3); + + const TextureDesc& desc = res.texture_atlas16x_depth.GetDesc(); + + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 8; // range + postprocess.params0.y = 2; // spread + postprocess.params0.z = std::pow(1.0f / (postprocess.params0.x * postprocess.params0.y), 2.0f); // rangespread_rcp2 + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (desc.width + 15) / 16, + (desc.height + 15) / 16, + 16, + cmd + ); + } + + device->EventEnd(cmd); + } + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_diffuse_mips, ResourceState::UNORDERED_ACCESS, res.texture_diffuse_mips.desc.layout, 3), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + { + device->EventBegin("SSGI - upsample", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], cmd); + + // 16x -> 8x + { + device->BindResource(&res.texture_depth_mips, 0, cmd, 3); + device->BindResource(&res.texture_normal_mips, 1, cmd, 3); + device->BindResource(&res.texture_diffuse_mips, 2, cmd, 3); + device->BindResource(&res.texture_depth_mips, 3, cmd, 2); + device->BindResource(&res.texture_normal_mips, 4, cmd, 2); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 2); + + const TextureDesc& desc = 
res.texture_diffuse_mips.desc; + postprocess.resolution.x = desc.width >> 2; + postprocess.resolution.y = desc.height >> 2; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 2; // range + postprocess.params0.y = 8; // spread + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (postprocess.resolution.x + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (postprocess.resolution.y + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, 1, cmd ); { GPUBarrier barriers[] = { - GPUBarrier::Image(&res.texture_bilateral_temp, ResourceState::UNORDERED_ACCESS, res.texture_bilateral_temp.desc.layout), + GPUBarrier::Image(&res.texture_diffuse_mips, ResourceState::UNORDERED_ACCESS, res.texture_diffuse_mips.desc.layout, 2), }; device->Barrier(barriers, arraysize(barriers), cmd); } } - // Vertical: + // 8x -> 4x { - postprocess.params0.x = 0; - postprocess.params0.y = 1; + device->BindResource(&res.texture_depth_mips, 0, cmd, 2); + device->BindResource(&res.texture_normal_mips, 1, cmd, 2); + device->BindResource(&res.texture_diffuse_mips, 2, cmd, 2); + device->BindResource(&res.texture_depth_mips, 3, cmd, 1); + device->BindResource(&res.texture_normal_mips, 4, cmd, 1); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 1); + + const TextureDesc& desc = res.texture_diffuse_mips.desc; + postprocess.resolution.x = desc.width >> 1; + postprocess.resolution.y = desc.height >> 1; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 2; // range + postprocess.params0.y = 8; // spread device->PushConstants(&postprocess, sizeof(postprocess), cmd); - const GPUResource* resarray[] = { - &res.texture_bilateral_temp, - &res.texture_temporal_variance[temporal_output], - }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); + 
device->Dispatch( + (postprocess.resolution.x + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (postprocess.resolution.y + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); - const GPUResource* uavs[] = { - &output, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_diffuse_mips, ResourceState::UNORDERED_ACCESS, res.texture_diffuse_mips.desc.layout, 1), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + } + + // 4x -> 2x + { + device->BindResource(&res.texture_depth_mips, 0, cmd, 1); + device->BindResource(&res.texture_normal_mips, 1, cmd, 1); + device->BindResource(&res.texture_diffuse_mips, 2, cmd, 1); + device->BindResource(&res.texture_depth_mips, 3, cmd, 0); + device->BindResource(&res.texture_normal_mips, 4, cmd, 0); + device->BindUAV(&res.texture_diffuse_mips, 0, cmd, 0); + + const TextureDesc& desc = res.texture_diffuse_mips.desc; + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 1; // range + postprocess.params0.y = 8; // spread + device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( - (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (postprocess.resolution.x + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (postprocess.resolution.y + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_diffuse_mips, ResourceState::UNORDERED_ACCESS, res.texture_diffuse_mips.desc.layout, 0), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + } + + // 2x -> output + { + device->BindResource(&res.texture_depth_mips, 0, cmd, 0); + 
device->BindResource(&res.texture_normal_mips, 1, cmd, 0); + device->BindResource(&res.texture_diffuse_mips, 2, cmd, 0); + device->BindResource(&input_depth, 3, cmd); + device->BindResource(&input_normal, 4, cmd); + device->BindUAV(&output, 0, cmd); + + const TextureDesc& desc = output.desc; + postprocess.resolution.x = AlignTo(desc.width, 64u); // align = uv correction! + postprocess.resolution.y = AlignTo(desc.height, 64u); // align = uv correction! + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + postprocess.params0.x = 1; // range + postprocess.params0.y = 8; // spread + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + device->Dispatch( + (desc.width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (desc.height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, 1, cmd ); @@ -12477,8 +12902,6 @@ void Postprocess_RTDiffuse( device->EventEnd(cmd); } - res.frame++; - wi::profiler::EndRange(profilerRange); device->EventEnd(cmd); } @@ -12500,13 +12923,12 @@ void CreateRTReflectionResources(RTReflectionResources& res, XMUINT2 resolution) device->CreateTexture(&desc, nullptr, &res.texture_rayLengths); device->SetName(&res.texture_rayLengths, "ssr_rayLengths"); - desc.width = resolution.x; - desc.height = resolution.y; + desc.width = resolution.x / 2; + desc.height = resolution.y / 2; desc.format = Format::R16G16B16A16_FLOAT; device->CreateTexture(&desc, nullptr, &res.texture_resolve); device->CreateTexture(&desc, nullptr, &res.texture_temporal[0]); device->CreateTexture(&desc, nullptr, &res.texture_temporal[1]); - device->CreateTexture(&desc, nullptr, &res.texture_bilateral_temp); desc.format = Format::R16_FLOAT; device->CreateTexture(&desc, nullptr, &res.texture_resolve_variance); device->CreateTexture(&desc, nullptr, &res.texture_resolve_reprojectionDepth); @@ -12533,6 +12955,49 @@ void Postprocess_RTReflection( BindCommonResources(cmd); + const int 
temporal_output = res.frame % 2; + const int temporal_history = 1 - temporal_output; + + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_resolve, res.texture_resolve.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_resolve_variance, res.texture_resolve_variance.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_resolve_reprojectionDepth, res.texture_resolve_reprojectionDepth.desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_temporal[temporal_output], res.texture_temporal[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], res.texture_temporal_variance[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + + if (res.frame == 0) + { + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&res.texture_temporal[temporal_history], res.texture_temporal[temporal_history].desc.layout, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&res.texture_temporal_variance[temporal_history], res.texture_temporal_variance[temporal_history].desc.layout, ResourceState::UNORDERED_ACCESS), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + device->ClearUAV(&res.texture_resolve, 0, cmd); + device->ClearUAV(&res.texture_resolve_variance, 0, cmd); + device->ClearUAV(&res.texture_resolve_reprojectionDepth, 0, cmd); + device->ClearUAV(&res.texture_temporal[0], 0, cmd); + device->ClearUAV(&res.texture_temporal[1], 0, cmd); + device->ClearUAV(&res.texture_temporal_variance[0], 0, cmd); + device->ClearUAV(&res.texture_temporal_variance[1], 0, cmd); + { + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + GPUBarrier::Image(&res.texture_temporal[temporal_history], ResourceState::UNORDERED_ACCESS, 
res.texture_temporal[temporal_history].desc.layout), + GPUBarrier::Image(&res.texture_temporal_variance[temporal_history], ResourceState::UNORDERED_ACCESS, res.texture_temporal_variance[temporal_history].desc.layout), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + } + } + + device->ClearUAV(&output, 0, cmd); + const TextureDesc& desc = output.desc; // Render half-res: @@ -12617,7 +13082,6 @@ void Postprocess_RTReflection( { GPUBarrier barriers[] = { - GPUBarrier::Memory(), GPUBarrier::Image(&res.texture_rayIndirectSpecular, ResourceState::UNORDERED_ACCESS, res.texture_rayIndirectSpecular.desc.layout), GPUBarrier::Image(&res.texture_rayDirectionPDF, ResourceState::UNORDERED_ACCESS, res.texture_rayDirectionPDF.desc.layout), GPUBarrier::Image(&res.texture_rayLengths, ResourceState::UNORDERED_ACCESS, res.texture_rayLengths.desc.layout), @@ -12628,17 +13092,11 @@ void Postprocess_RTReflection( //device->EventEnd(cmd); } - // Upscale to full-res: - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; - postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; - postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - // Resolve pass: { - device->EventBegin("RTReflection Resolve pass", cmd); + device->EventBegin("RTReflection - spatial resolve", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RESOLVE], cmd); + device->PushConstants(&postprocess, sizeof(postprocess), cmd); const GPUResource* resarray[] = { &res.texture_rayIndirectSpecular, @@ -12683,13 +13141,11 @@ void Postprocess_RTReflection( device->EventEnd(cmd); } - int temporal_output = device->GetFrameCount() % 2; - int temporal_history = 1 - temporal_output; - // Temporal pass: { - device->EventBegin("RTReflection Temporal pass", cmd); + device->EventBegin("RTReflection - temporal resolve", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TEMPORAL], cmd); + 
device->PushConstants(&postprocess, sizeof(postprocess), cmd); const GPUResource* resarray[] = { &res.texture_resolve, @@ -12723,9 +13179,9 @@ void Postprocess_RTReflection( { GPUBarrier barriers[] = { - GPUBarrier::Memory(), GPUBarrier::Image(&res.texture_temporal[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal[temporal_output].desc.layout), GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal_variance[temporal_output].desc.layout), + GPUBarrier::Memory(&output), }; device->Barrier(barriers, arraysize(barriers), cmd); } @@ -12733,89 +13189,41 @@ void Postprocess_RTReflection( device->EventEnd(cmd); } + // Upscale to full-res: + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + // Bilateral blur pass: { - device->EventBegin("RTReflection Bilateral blur pass", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_BILATERAL], cmd); + device->EventBegin("RTReflection - upsample", cmd); + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_UPSAMPLE], cmd); + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + const GPUResource* resarray[] = { + &res.texture_temporal[temporal_output], + &res.texture_temporal_variance[temporal_output], + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &output, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); - // Horizontal: { - postprocess.params0.x = 1; - postprocess.params0.y = 0; - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - const GPUResource* resarray[] = { - 
&res.texture_temporal[temporal_output], - &res.texture_temporal_variance[temporal_output], + GPUBarrier barriers[] = { + GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); - - const GPUResource* uavs[] = { - &res.texture_bilateral_temp, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); - - { - GPUBarrier barriers[] = { - GPUBarrier::Image(&res.texture_bilateral_temp, res.texture_bilateral_temp.desc.layout, ResourceState::UNORDERED_ACCESS), - }; - device->Barrier(barriers, arraysize(barriers), cmd); - } - - device->Dispatch( - (res.texture_bilateral_temp.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (res.texture_bilateral_temp.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - 1, - cmd - ); - - { - GPUBarrier barriers[] = { - GPUBarrier::Memory(), - GPUBarrier::Image(&res.texture_bilateral_temp, ResourceState::UNORDERED_ACCESS, res.texture_bilateral_temp.desc.layout), - }; - device->Barrier(barriers, arraysize(barriers), cmd); - } - } - - // Vertical: - { - postprocess.params0.x = 0; - postprocess.params0.y = 1; - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - const GPUResource* resarray[] = { - &res.texture_bilateral_temp, - &res.texture_temporal_variance[temporal_output], - }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); - - const GPUResource* uavs[] = { - &output, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); - - { - GPUBarrier barriers[] = { - GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), - }; - device->Barrier(barriers, arraysize(barriers), cmd); - } - - device->Dispatch( - (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - 1, - cmd - ); - - { - GPUBarrier barriers[] = { - GPUBarrier::Memory(), - GPUBarrier::Image(&output, 
ResourceState::UNORDERED_ACCESS, output.desc.layout), - }; - device->Barrier(barriers, arraysize(barriers), cmd); - } + device->Barrier(barriers, arraysize(barriers), cmd); } device->EventEnd(cmd); @@ -12871,13 +13279,12 @@ void CreateSSRResources(SSRResources& res, XMUINT2 resolution) device->CreateTexture(&desc, nullptr, &res.texture_rayLengths); device->SetName(&res.texture_rayLengths, "ssr_rayLengths"); - desc.width = resolution.x; - desc.height = resolution.y; + desc.width = resolution.x / 2; + desc.height = resolution.y / 2; desc.format = Format::R16G16B16A16_FLOAT; device->CreateTexture(&desc, nullptr, &res.texture_resolve); device->CreateTexture(&desc, nullptr, &res.texture_temporal[0]); device->CreateTexture(&desc, nullptr, &res.texture_temporal[1]); - device->CreateTexture(&desc, nullptr, &res.texture_bilateral_temp); desc.format = Format::R16_FLOAT; device->CreateTexture(&desc, nullptr, &res.texture_resolve_variance); device->CreateTexture(&desc, nullptr, &res.texture_resolve_reprojectionDepth); @@ -12925,7 +13332,6 @@ void Postprocess_SSR( GPUBarrier::Image(&res.texture_resolve_reprojectionDepth, res.texture_resolve_reprojectionDepth.desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_temporal[temporal_output], res.texture_temporal[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], res.texture_temporal_variance[temporal_output].desc.layout, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&res.texture_bilateral_temp, res.texture_bilateral_temp.desc.layout, ResourceState::UNORDERED_ACCESS), GPUBarrier::Image(&output, output.desc.layout, ResourceState::UNORDERED_ACCESS), }; device->Barrier(barriers, arraysize(barriers), cmd); @@ -12949,7 +13355,6 @@ void Postprocess_SSR( device->ClearUAV(&res.texture_temporal[1], 0, cmd); device->ClearUAV(&res.texture_temporal_variance[0], 0, cmd); device->ClearUAV(&res.texture_temporal_variance[1], 0, cmd); - 
device->ClearUAV(&res.texture_bilateral_temp, 0, cmd); { GPUBarrier barriers[] = { GPUBarrier::Memory(), @@ -12987,7 +13392,7 @@ void Postprocess_SSR( // Compute tile classification (horizontal): { - device->EventBegin("SSR Tile Classification - Horizontal", cmd); + device->EventBegin("SSR - tile classification horizontal", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_HORIZONTAL], cmd); const GPUResource* uavs[] = { @@ -13014,7 +13419,7 @@ void Postprocess_SSR( // Compute tile classification (vertical): { - device->EventBegin("SSR Tile Classification - Vertical", cmd); + device->EventBegin("SSR - tile classification vertical", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TILEMAXROUGHNESS_VERTICAL], cmd); device->PushConstants(&postprocess, sizeof(postprocess), cmd); @@ -13051,7 +13456,7 @@ void Postprocess_SSR( // Depth hierarchy: { - device->EventBegin("SSR Depth hierarchy pass", cmd); + device->EventBegin("SSR - depth hierarchy", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_DEPTHHIERARCHY], cmd); TextureDesc hierarchyDesc = res.texture_depth_hierarchy.GetDesc(); @@ -13080,7 +13485,6 @@ void Postprocess_SSR( { GPUBarrier barriers[] = { - GPUBarrier::Memory(), GPUBarrier::Image(&res.texture_depth_hierarchy, ResourceState::UNORDERED_ACCESS, res.texture_depth_hierarchy.desc.layout, 0), }; device->Barrier(barriers, arraysize(barriers), cmd); @@ -13146,7 +13550,7 @@ void Postprocess_SSR( // Raytrace pass: { - device->EventBegin("SSR Raytrace pass", cmd); + device->EventBegin("SSR - ray trace", cmd); const GPUResource* resarray[] = { &res.texture_depth_hierarchy, @@ -13200,15 +13604,9 @@ void Postprocess_SSR( device->EventEnd(cmd); } - // Upscale to full-res: - postprocess.resolution.x = desc.width; - postprocess.resolution.y = desc.height; - postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; - postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; - // Resolve pass: { - 
device->EventBegin("SSR Resolve pass", cmd); + device->EventBegin("SSR - spatial resolve", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_RESOLVE], cmd); device->PushConstants(&postprocess, sizeof(postprocess), cmd); @@ -13235,7 +13633,6 @@ void Postprocess_SSR( { GPUBarrier barriers[] = { - GPUBarrier::Memory(), GPUBarrier::Image(&res.texture_resolve, ResourceState::UNORDERED_ACCESS, res.texture_resolve.desc.layout), GPUBarrier::Image(&res.texture_resolve_variance, ResourceState::UNORDERED_ACCESS, res.texture_resolve_variance.desc.layout), GPUBarrier::Image(&res.texture_resolve_reprojectionDepth, ResourceState::UNORDERED_ACCESS, res.texture_resolve_reprojectionDepth.desc.layout), @@ -13248,7 +13645,7 @@ void Postprocess_SSR( // Temporal pass: { - device->EventBegin("SSR Temporal pass", cmd); + device->EventBegin("SSR - temporal resolve", cmd); device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_TEMPORAL], cmd); device->PushConstants(&postprocess, sizeof(postprocess), cmd); @@ -13278,6 +13675,7 @@ void Postprocess_SSR( GPUBarrier barriers[] = { GPUBarrier::Image(&res.texture_temporal[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal[temporal_output].desc.layout), GPUBarrier::Image(&res.texture_temporal_variance[temporal_output], ResourceState::UNORDERED_ACCESS, res.texture_temporal_variance[temporal_output].desc.layout), + GPUBarrier::Memory(&output), }; device->Barrier(barriers, arraysize(barriers), cmd); } @@ -13285,74 +13683,41 @@ void Postprocess_SSR( device->EventEnd(cmd); } + // Upscale to full-res: + postprocess.resolution.x = desc.width; + postprocess.resolution.y = desc.height; + postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; + postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; + // Bilateral blur pass: { - device->EventBegin("SSR Bilateral blur pass", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_BILATERAL], cmd); + device->EventBegin("SSR - upsample", cmd); + 
device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSR_UPSAMPLE], cmd); + device->PushConstants(&postprocess, sizeof(postprocess), cmd); + + const GPUResource* resarray[] = { + &res.texture_temporal[temporal_output], + &res.texture_temporal_variance[temporal_output], + }; + device->BindResources(resarray, 0, arraysize(resarray), cmd); + + const GPUResource* uavs[] = { + &output, + }; + device->BindUAVs(uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); - // Horizontal: { - postprocess.params0.x = 1; - postprocess.params0.y = 0; - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - const GPUResource* resarray[] = { - &res.texture_temporal[temporal_output], - &res.texture_temporal_variance[temporal_output], + GPUBarrier barriers[] = { + GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), }; - device->BindResources(resarray, 0, arraysize(resarray), cmd); - - const GPUResource* uavs[] = { - &res.texture_bilateral_temp, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); - - device->Dispatch( - (res.texture_bilateral_temp.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (res.texture_bilateral_temp.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - 1, - cmd - ); - - { - GPUBarrier barriers[] = { - GPUBarrier::Image(&res.texture_bilateral_temp, ResourceState::UNORDERED_ACCESS, res.texture_bilateral_temp.desc.layout), - GPUBarrier::Memory(&output), - }; - device->Barrier(barriers, arraysize(barriers), cmd); - } - } - - // Vertical: - { - postprocess.params0.x = 0; - postprocess.params0.y = 1; - device->PushConstants(&postprocess, sizeof(postprocess), cmd); - - const GPUResource* resarray[] = { - &res.texture_bilateral_temp, - &res.texture_temporal_variance[temporal_output], - }; - 
device->BindResources(resarray, 0, arraysize(resarray), cmd); - - const GPUResource* uavs[] = { - &output, - }; - device->BindUAVs(uavs, 0, arraysize(uavs), cmd); - - device->Dispatch( - (output.GetDesc().width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - (output.GetDesc().height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, - 1, - cmd - ); - - { - GPUBarrier barriers[] = { - GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), - }; - device->Barrier(barriers, arraysize(barriers), cmd); - } + device->Barrier(barriers, arraysize(barriers), cmd); } device->EventEnd(cmd); diff --git a/WickedEngine/wiRenderer.h b/WickedEngine/wiRenderer.h index 0da746cfe..e8920e5dd 100644 --- a/WickedEngine/wiRenderer.h +++ b/WickedEngine/wiRenderer.h @@ -543,7 +543,6 @@ namespace wi::renderer wi::graphics::Texture texture_spatial_variance; wi::graphics::Texture texture_temporal[2]; wi::graphics::Texture texture_temporal_variance[2]; - wi::graphics::Texture texture_bilateral_temp; }; void CreateRTDiffuseResources(RTDiffuseResources& res, XMUINT2 resolution); void Postprocess_RTDiffuse( @@ -553,6 +552,29 @@ namespace wi::renderer wi::graphics::CommandList cmd, float range = 1000.0f ); + struct SSGIResources + { + wi::graphics::Texture texture_atlas2x_depth; + wi::graphics::Texture texture_atlas4x_depth; + wi::graphics::Texture texture_atlas8x_depth; + wi::graphics::Texture texture_atlas16x_depth; + wi::graphics::Texture texture_atlas2x_color; + wi::graphics::Texture texture_atlas4x_color; + wi::graphics::Texture texture_atlas8x_color; + wi::graphics::Texture texture_atlas16x_color; + wi::graphics::Texture texture_depth_mips; + wi::graphics::Texture texture_normal_mips; + wi::graphics::Texture texture_diffuse_mips; + }; + void CreateSSGIResources(SSGIResources& res, XMUINT2 resolution); + void Postprocess_SSGI( + const SSGIResources& res, + const wi::graphics::Texture& input, + const wi::graphics::Texture& input_depth, + const 
wi::graphics::Texture& input_normal, + const wi::graphics::Texture& output, + wi::graphics::CommandList cmd + ); struct RTReflectionResources { mutable int frame = 0; @@ -564,7 +586,6 @@ namespace wi::renderer wi::graphics::Texture texture_resolve_reprojectionDepth; wi::graphics::Texture texture_temporal[2]; wi::graphics::Texture texture_temporal_variance[2]; - wi::graphics::Texture texture_bilateral_temp; }; void CreateRTReflectionResources(RTReflectionResources& res, XMUINT2 resolution); void Postprocess_RTReflection( @@ -589,7 +610,6 @@ namespace wi::renderer wi::graphics::Texture texture_resolve_reprojectionDepth; wi::graphics::Texture texture_temporal[2]; wi::graphics::Texture texture_temporal_variance[2]; - wi::graphics::Texture texture_bilateral_temp; wi::graphics::GPUBuffer buffer_tile_tracing_statistics; wi::graphics::GPUBuffer buffer_tiles_tracing_earlyexit; wi::graphics::GPUBuffer buffer_tiles_tracing_cheap; diff --git a/WickedEngine/wiScene.cpp b/WickedEngine/wiScene.cpp index d583e1b6f..0b4959ad2 100644 --- a/WickedEngine/wiScene.cpp +++ b/WickedEngine/wiScene.cpp @@ -1835,10 +1835,15 @@ namespace wi::scene } else if (channel.path == AnimationComponent::AnimationChannel::Path::WEIGHTS) { - ObjectComponent* object = objects.GetComponent(channel.target); - if (object == nullptr) - continue; - target_mesh = meshes.GetComponent(object->meshID); + target_mesh = meshes.GetComponent(channel.target); + if (target_mesh == nullptr) + { + // Also try going through object's mesh reference: + ObjectComponent* object = objects.GetComponent(channel.target); + if (object == nullptr) + continue; + target_mesh = meshes.GetComponent(object->meshID); + } if (target_mesh == nullptr) continue; animation.morph_weights_temp.resize(target_mesh->morph_targets.size()); diff --git a/WickedEngine/wiScene_Components.h b/WickedEngine/wiScene_Components.h index 8acc9f26b..e63df21d2 100644 --- a/WickedEngine/wiScene_Components.h +++ b/WickedEngine/wiScene_Components.h @@ -1142,6 
+1142,7 @@ namespace wi::scene int texture_waterriples_index = -1; int texture_ao_index = -1; int texture_ssr_index = -1; + int texture_ssgi_index = -1; int texture_rtshadow_index = -1; int texture_rtdiffuse_index = -1; int texture_surfelgi_index = -1; diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index ff99656e5..b0d9562dd 100644 --- a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wi::version // minor features, major updates, breaking compatibility changes const int minor = 71; // minor bug fixes, alterations, refactors, updates - const int revision = 417; + const int revision = 418; const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision); diff --git a/enc_temp_folder/2f9a23bddcd8e6a426511f3643315cb7/ssgi_upsampleCS.hlsl b/enc_temp_folder/2f9a23bddcd8e6a426511f3643315cb7/ssgi_upsampleCS.hlsl new file mode 100644 index 000000000..5033b2433 --- /dev/null +++ b/enc_temp_folder/2f9a23bddcd8e6a426511f3643315cb7/ssgi_upsampleCS.hlsl @@ -0,0 +1,74 @@ +#include "globals.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2D input_depth_low : register(t0); +Texture2D input_normal_low : register(t1); +Texture2D input_diffuse_low : register(t2); +Texture2D input_depth_high : register(t3); +Texture2D input_normal_high : register(t4); + +RWTexture2D output : register(u0); + +static const float depthThreshold = 1000.0; +static const float normalThreshold = 1.0; + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint2 DTid : SV_DispatchThreadID) +{ + uint2 pixel = DTid.xy; + const float2 uv = (pixel + 0.5) * postprocess.resolution_rcp; + + const float depth = input_depth_high[pixel]; + const float linearDepth = compute_lineardepth(depth); + const float3 N = decode_oct(input_normal_high[pixel].rg); + const float3 P = reconstruct_position(uv, depth); 
+ + float3 result = 0; + float sum = 0; +#if 1 + const int range = int(postprocess.params0.x); + const float spread = postprocess.params0.y; +#else + const int range = 1; + const float spread = 8; +#endif + for(int x = -range; x <= range; ++x) + { + for(int y = -range; y <= range; ++y) + { + const float2 offset = float2(x, y) * spread * postprocess.resolution_rcp; + const float2 sample_uv = uv + offset; + + const float sampleDepth = input_depth_low.SampleLevel(sampler_linear_clamp, sample_uv, 0); + const float3 sampleN = decode_oct(input_normal_low.SampleLevel(sampler_linear_clamp, sample_uv, 0)); + const float3 sampleDiffuse = input_diffuse_low.SampleLevel(sampler_linear_clamp, sample_uv, 0).rgb; + const float3 sampleP = reconstruct_position(sample_uv, sampleDepth); + + float3 dq = P - sampleP; + float planeError = max(abs(dot(dq, sampleN)), abs(dot(dq, N))); + float relativeDepthDifference = planeError / linearDepth; + float bilateralDepthWeight = exp(-sqr(relativeDepthDifference) * depthThreshold); + + float normalError = pow(saturate(dot(sampleN, N)), 4.0); + float bilateralNormalWeight = saturate(1.0 - (1.0 - normalError) * normalThreshold); + + float weight = bilateralDepthWeight * bilateralNormalWeight; + + //weight = 1; + result += sampleDiffuse * weight; + sum += weight; + } + } + + if(sum > 0) + { + result /= sum; + } + + result = max(0, result); + + output[pixel] = (output[pixel] + float4(result, 1)) ; +} diff --git a/enc_temp_folder/343d8179d01536ce28957367d6f61811/ssgiCS.hlsl b/enc_temp_folder/343d8179d01536ce28957367d6f61811/ssgiCS.hlsl new file mode 100644 index 000000000..8e972abf8 --- /dev/null +++ b/enc_temp_folder/343d8179d01536ce28957367d6f61811/ssgiCS.hlsl @@ -0,0 +1,192 @@ +#include "globals.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +PUSHCONSTANT(postprocess, PostProcess); + +Texture2D input : register(t0); +Texture2DArray input_depth : register(t1); +Texture2D input_normal : register(t2); + 
+RWTexture2D output_diffuse : register(u0); + +#ifdef WIDE +static const uint THREADCOUNT = 16; +static const int TILE_BORDER = 18; +#else +static const uint THREADCOUNT = 8; +static const int TILE_BORDER = 4; +#endif // WIDE +static const int TILE_SIZE = TILE_BORDER + THREADCOUNT + TILE_BORDER; +groupshared uint cache_xy[TILE_SIZE * TILE_SIZE]; +groupshared float cache_z[TILE_SIZE * TILE_SIZE]; +groupshared uint cache_rgb[TILE_SIZE * TILE_SIZE]; +groupshared uint group_valid; + +inline uint coord_to_cache(int2 coord) +{ + return flatten2D(clamp(TILE_BORDER + coord, 0, TILE_SIZE - 1), TILE_SIZE); +} + +static const float radius = 14; +static const float radius2 = radius * radius; +static const float radius2_rcp_negative = -rcp(radius2); + +#if 0 +static const uint depth_test_count = 1; +static const float depth_tests[] = {0.33}; +#else +static const uint depth_test_count = 3; +static const float depth_tests[] = {0.125, 0.25, 0.75}; +#endif + +float3 compute_diffuse( + float3 origin_position, + float3 origin_normal, + int2 GTid, + int2 offset +) +{ + const int2 sampleLoc = GTid + offset; + const uint t = coord_to_cache(sampleLoc); + float3 sample_position; + sample_position.z = cache_z[t]; + if(sample_position.z > GetCamera().z_far - 1) + return 0; + sample_position.xy = unpack_half2(cache_xy[t]); + const float3 origin_to_sample = sample_position - origin_position; + const float distance2 = dot(origin_to_sample, origin_to_sample); + float occlusion = saturate(dot(origin_normal, origin_to_sample)); + occlusion *= saturate(distance2 * radius2_rcp_negative + 1.0f); + + if(occlusion > 0) + { + const float origin_z = origin_position.z; + const float sample_z = sample_position.z; + +#if 1 + // DDA occlusion: + const int2 start = GTid; + const int2 goal = sampleLoc; + + const int dx = int(goal.x) - int(start.x); + const int dy = int(goal.y) - int(start.y); + + int step = max(abs(dx), abs(dy)); + step = (step + 1) / 2; // reduce steps + const float step_rcp = rcp(step); + + 
const float x_incr = float(dx) * step_rcp; + const float y_incr = float(dy) * step_rcp; + + float x = float(start.x); + float y = float(start.y); + + for (int i = 0; i < step - 1; i++) + { + x += x_incr; + y += y_incr; + + const int2 loc = int2(round(x), round(y)); + const uint tt = coord_to_cache(loc); + + const float dt = float(i) / float(step); + const float z = lerp(origin_z, sample_z, dt); + + const float sz = cache_z[tt]; + if(sz < z - 0.1) + { + return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]); + } + } +#else + // Simple occlusion: + for (uint i = 0; i < depth_test_count; ++i) + { + const float dt = depth_tests[i]; + const float z = lerp(origin_z, sample_z, dt); + const int2 loc = round(lerp(float2(GTid), float2(sampleLoc), dt)); + const uint tt = coord_to_cache(loc); + const float sz = cache_z[tt]; + if (sz < z - 0.1) + { + return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[tt]); + } + } +#endif + } + + return occlusion * Unpack_R11G11B10_FLOAT(cache_rgb[t]); +} + +[numthreads(THREADCOUNT, THREADCOUNT, 1)] +void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : SV_GroupThreadID, uint groupIndex : SV_GroupIndex) +{ + const uint layer = DTid.z; + const uint2 interleaved_pixel = DTid.xy << 2 | uint2(DTid.z & 3, DTid.z >> 2); + + if(groupIndex == 0) + { + group_valid = 0; + } + GroupMemoryBarrierWithGroupSync(); + + const int2 tile_upperleft = Gid.xy * THREADCOUNT - TILE_BORDER; + for(uint t = groupIndex; t < TILE_SIZE * TILE_SIZE; t += THREADCOUNT * THREADCOUNT) + { + const int2 pixel = tile_upperleft + unflatten2D(t, TILE_SIZE); + const float depth = input_depth[uint3(pixel, layer)]; + const float2 uv = (pixel + 0.5f) * postprocess.resolution_rcp; + const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection); + const float3 color = input.SampleLevel(sampler_linear_clamp, uv, 0).rgb; + const uint pkcolor = Pack_R11G11B10_FLOAT(color.rgb); + cache_xy[t] = pack_half2(P.xy); + cache_z[t] = P.z; + 
cache_rgb[t] = pkcolor; + if(pkcolor) + InterlockedOr(group_valid, 1u); + } + GroupMemoryBarrierWithGroupSync(); + + [branch] + if (group_valid == 0) + return; // if no valid color was cached, whole group can exit early + + const uint t = coord_to_cache(GTid.xy); + float3 P; + P.z = cache_z[t]; + + [branch] + if(P.z > GetCamera().z_far - 1) + return; // if pixel depth is not valid, it can exit early + + P.xy = unpack_half2(cache_xy[t]); + + const uint2 pixel = DTid.xy; + const float3 N = mul((float3x3)GetCamera().view, decode_oct(input_normal[interleaved_pixel].rg)); + + float3 diffuse = 0; + float sum = 0; + const int range = int(postprocess.params0.x); + const float spread = postprocess.params0.y /*+ dither(pixel)*/; + const float rangespread_rcp2 = postprocess.params0.z; + + for(int x = -range; x <= range; ++x) + { + for(int y = -range; y <= range; ++y) + { + const float2 foffset = float2(x, y) * spread; + const int2 offset = round(foffset); + const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2); + diffuse += compute_diffuse(P, N, GTid, offset) * weight; + sum += weight; + } + } + if(sum > 0) + { + diffuse = diffuse / sum; + } + + // interleave result: + output_diffuse[interleaved_pixel] = float4(diffuse, 1); +}