diff --git a/WickedEngine/RenderPath3D.cpp b/WickedEngine/RenderPath3D.cpp index 74ae3bd37..128d65e63 100644 --- a/WickedEngine/RenderPath3D.cpp +++ b/WickedEngine/RenderPath3D.cpp @@ -36,6 +36,15 @@ void RenderPath3D::ResizeBuffers() assert(subresource_index == i); } } + { + TextureDesc desc; + desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS; + desc.Format = FORMAT_R16G16B16A16_FLOAT; + desc.Width = wiRenderer::GetInternalResolution().x; + desc.Height = wiRenderer::GetInternalResolution().y; + device->CreateTexture(&desc, nullptr, &rtStochasticSSR); + device->SetName(&rtStochasticSSR, "rtStochasticSSR"); + } { TextureDesc desc; desc.BindFlags = BIND_RENDER_TARGET | BIND_SHADER_RESOURCE; @@ -441,6 +450,13 @@ void RenderPath3D::RenderSSR(const Texture& srcSceneRT, const wiGraphics::Textur wiRenderer::Postprocess_SSR(srcSceneRT, depthBuffer_Copy, rtLinearDepth_minmax, gbuffer1, rtSSR, cmd); } } +void RenderPath3D::RenderStochasticSSR(const Texture& srcSceneRT, const wiGraphics::Texture& gbuffer0, const wiGraphics::Texture& gbuffer1, const wiGraphics::Texture& gbuffer2, CommandList cmd) const +{ + if (getSSREnabled()) + { + wiRenderer::Postprocess_StochasticSSR(srcSceneRT, depthBuffer_Copy, rtLinearDepth_minmax, gbuffer0, gbuffer1, gbuffer2, rtStochasticSSR, cmd); + } +} void RenderPath3D::DownsampleDepthBuffer(CommandList cmd) const { GraphicsDevice* device = wiRenderer::GetDevice(); diff --git a/WickedEngine/RenderPath3D.h b/WickedEngine/RenderPath3D.h index 01fd00023..efbe58821 100644 --- a/WickedEngine/RenderPath3D.h +++ b/WickedEngine/RenderPath3D.h @@ -51,7 +51,8 @@ private: protected: wiGraphics::Texture rtReflection; // conains the scene rendered for planar reflections - wiGraphics::Texture rtSSR; // screen-space reflection results + wiGraphics::Texture rtSSR; // standard screen-space reflection results + wiGraphics::Texture rtStochasticSSR; // stochastic screen-space reflection results wiGraphics::Texture rtSceneCopy; // contains the rendered 
scene that can be fed into transparent pass for distortion effect wiGraphics::Texture rtWaterRipple; // water ripple sprite normal maps are rendered into this wiGraphics::Texture rtParticleDistortion; // contains distortive particles @@ -102,6 +103,7 @@ protected: virtual void RenderLinearDepth(wiGraphics::CommandList cmd) const; virtual void RenderSSAO(wiGraphics::CommandList cmd) const; virtual void RenderSSR(const wiGraphics::Texture& srcSceneRT, const wiGraphics::Texture& gbuffer1, wiGraphics::CommandList cmd) const; + virtual void RenderStochasticSSR(const wiGraphics::Texture& srcSceneRT, const wiGraphics::Texture& gbuffer0, const wiGraphics::Texture& gbuffer1, const wiGraphics::Texture& gbuffer2, wiGraphics::CommandList cmd) const; virtual void DownsampleDepthBuffer(wiGraphics::CommandList cmd) const; virtual void RenderOutline(const wiGraphics::Texture& dstSceneRT, wiGraphics::CommandList cmd) const; virtual void RenderLightShafts(wiGraphics::CommandList cmd) const; diff --git a/WickedEngine/RenderPath3D_Deferred.cpp b/WickedEngine/RenderPath3D_Deferred.cpp index b5bab5c98..98ad5aa1a 100644 --- a/WickedEngine/RenderPath3D_Deferred.cpp +++ b/WickedEngine/RenderPath3D_Deferred.cpp @@ -209,7 +209,7 @@ void RenderPath3D_Deferred::Render() const device->BindViewports(1, &vp, cmd); device->BindResource(PS, getSSAOEnabled() ? &rtSSAO[0] : wiTextureHelper::getWhite(), TEXSLOT_RENDERPATH_SSAO, cmd); - device->BindResource(PS, getSSREnabled() ? &rtSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd); + device->BindResource(PS, getSSREnabled() ? 
&rtStochasticSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd); wiRenderer::DrawDeferredLights(wiRenderer::GetCamera(), depthBuffer_Copy, rtGBuffer[0], rtGBuffer[1], rtGBuffer[2], cmd); device->RenderPassEnd(cmd); @@ -227,7 +227,7 @@ void RenderPath3D_Deferred::Render() const RenderDeferredComposition(cmd); - RenderSSR(rtDeferred, rtGBuffer[1], cmd); + RenderStochasticSSR(rtDeferred, rtGBuffer[0], rtGBuffer[1], rtGBuffer[2], cmd); DownsampleDepthBuffer(cmd); diff --git a/WickedEngine/RenderPath3D_TiledDeferred.cpp b/WickedEngine/RenderPath3D_TiledDeferred.cpp index a41c25db6..05ee98bb2 100644 --- a/WickedEngine/RenderPath3D_TiledDeferred.cpp +++ b/WickedEngine/RenderPath3D_TiledDeferred.cpp @@ -105,7 +105,7 @@ void RenderPath3D_TiledDeferred::Render() const RenderDecals(cmd); device->BindResource(CS, getSSAOEnabled() ? &rtSSAO[0] : wiTextureHelper::getWhite(), TEXSLOT_RENDERPATH_SSAO, cmd); - device->BindResource(CS, getSSREnabled() ? &rtSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd); + device->BindResource(CS, getSSREnabled() ? 
&rtStochasticSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd); if (device->CheckCapability(GraphicsDevice::GRAPHICSDEVICE_CAPABILITY_UAV_LOAD_FORMAT_R11G11B10_FLOAT)) @@ -156,7 +156,7 @@ void RenderPath3D_TiledDeferred::Render() const RenderDeferredComposition(cmd); - RenderSSR(rtDeferred, rtGBuffer[1], cmd); + RenderStochasticSSR(rtDeferred, rtGBuffer[0], rtGBuffer[1], rtGBuffer[2], cmd); DownsampleDepthBuffer(cmd); diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj b/WickedEngine/WickedEngine_SHADERS.vcxproj index cb746e399..c355fbd80 100644 --- a/WickedEngine/WickedEngine_SHADERS.vcxproj +++ b/WickedEngine/WickedEngine_SHADERS.vcxproj @@ -31,6 +31,7 @@ + @@ -801,12 +802,27 @@ Compute + + Compute + + + Compute + + + Compute + Compute Pixel + + Compute + + + Compute + Pixel @@ -974,4 +990,4 @@ - \ No newline at end of file + diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters index 3324156e2..fcf685b1e 100644 --- a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters +++ b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters @@ -85,6 +85,9 @@ HF + + HF + @@ -852,6 +855,21 @@ CS + + CS + + + CS + + + CS + + + CS + + + CS + CS @@ -897,4 +915,4 @@ {12396e21-0254-42fa-a88b-805f0703eca5} - \ No newline at end of file + diff --git a/WickedEngine/deferredPS.hlsl b/WickedEngine/deferredPS.hlsl index 8037359fd..5e2d52f40 100644 --- a/WickedEngine/deferredPS.hlsl +++ b/WickedEngine/deferredPS.hlsl @@ -19,5 +19,5 @@ float4 main(float4 pos : SV_Position, float2 uv : TEXCOORD) : SV_TARGET ApplyFog(depth, color); - return color; + return max(0, color); } \ No newline at end of file diff --git a/WickedEngine/globals.hlsli b/WickedEngine/globals.hlsli index 879ae4b1b..0f3e3e41e 100644 --- a/WickedEngine/globals.hlsli +++ b/WickedEngine/globals.hlsli @@ -147,6 +147,34 @@ inline float2 hammersley2d(uint idx, uint num) { return float2(float(idx) / float(num), radicalInverse_VdC); } +inline float2 
HammersleyRandom(uint idx, uint num, uint2 random) +{ + uint bits = idx; + bits = (bits << 16) | (bits >> 16); + bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8); + bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4); + bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2); + bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1); + + float E1 = frac((float) idx / num + float(random.x) * (1.0 / 65536.0)); + float E2 = float((bits >> 16) ^ random.y) * (1.0 / 65536.0); + return float2(E1, E2); +} + +inline float2 HammersleyRandom(uint idx, uint2 random) +{ + uint bits = idx; + bits = (bits << 16) | (bits >> 16); + bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8); + bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4); + bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2); + bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1); + + float E1 = frac(float(random.x) * (1.0 / 65536.0)); + float E2 = float((bits >> 16) ^ random.y) * (1.0 / 65536.0); + return float2(E1, E2); +} + // "Next Generation Post Processing in Call of Duty: Advanced Warfare" // http://advances.realtimerendering.com/s2014/index.html float InterleavedGradientNoise(float2 uv, uint frameCount) @@ -554,4 +582,4 @@ inline float dither(in float2 pixel) return ditherMask8(pixel); } -#endif // WI_SHADER_GLOBALS_HF \ No newline at end of file +#endif // WI_SHADER_GLOBALS_HF diff --git a/WickedEngine/stochasticSSRCS_combine.hlsl b/WickedEngine/stochasticSSRCS_combine.hlsl new file mode 100644 index 000000000..15f7a1404 --- /dev/null +++ b/WickedEngine/stochasticSSRCS_combine.hlsl @@ -0,0 +1,44 @@ +#include "globals.hlsli" +#include "brdf.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +TEXTURE2D(texture_median, float4, TEXSLOT_ONDEMAND0); + +RWTEXTURE2D(output, float4, 0); + +// Final Stochastic SSR pass. Here we can apply final touches like specular occlusion or fresnel and BRDFLUT? 
+// main: scales the median-filtered reflection colour by Schlick Fresnel derived from the G-buffer and writes it to `output`.
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID)
+{
+    const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; // texel-centre UV of this thread's pixel
+    const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+    if (depth == 0.0f)
+        return; // nothing is written for these pixels; `output` keeps whatever it held
+
+    // Everything in view space:
+    const float3 P = reconstructPosition(uv, depth, g_xCamera_InvP);
+    const float3 N = mul((float3x3) g_xCamera_View, decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy)).xyz;
+    const float3 V = normalize(-P); // surface-to-eye, as in the resolve pass; normalize(P) faces away from the eye and pins NdotV at 0 for camera-facing normals
+
+    float NdotV = max(dot(N, V), 0.0f);
+
+    float3 albedo = texture_gbuffer0.SampleLevel(sampler_point_clamp, uv, 0).rgb;
+    float4 baseColor = float4(albedo, 1.0f);
+
+    float4 GBuffer2 = texture_gbuffer2.SampleLevel(sampler_point_clamp, uv, 0);
+    //float occlusion = GBuffer2.r;
+    //float roughness = GBuffer2.g;
+    float metalness = GBuffer2.b;
+    float reflectance = GBuffer2.a;
+
+    float3 f0 = ComputeF0(baseColor, reflectance, metalness);
+    float f90 = saturate(50.0 * dot(f0, 0.33));
+    float3 F = F_Schlick(f0, f90, NdotV);
+
+    float4 final = texture_median.SampleLevel(sampler_point_clamp, uv, 0);
+    final.rgb *= F; // only the colour is scaled; alpha passes through unchanged
+
+    output[DTid.xy] = final;
+}
diff --git a/WickedEngine/stochasticSSRCS_median.hlsl b/WickedEngine/stochasticSSRCS_median.hlsl
new file mode 100644
index 000000000..5e3f69b25
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_median.hlsl
@@ -0,0 +1,64 @@
+#include "globals.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(texture_temporal, float4, TEXSLOT_ONDEMAND0);
+
+RWTEXTURE2D(output, float4, 0);
+
+// A Fast, Small-Radius GPU Median Filter by Morgan McGuire
+// https://casual-effects.com/research/McGuire2008Median/index.html
+// s2 orders one pair (min into a, max into b) through a caller-declared `temp`; t2/t24/t25 apply it to 1/4/5 index pairs of a caller-declared array `v`.
+#define s2(a, b) temp = a; a = min(a, b); b = max(temp, b);
+#define t2(a, b) s2(v[a], v[b]);
+#define t24(a, b, c, d, e, f, g, h) t2(a, b); t2(c, d); t2(e, f); t2(g, h);
+#define t25(a, b, c, d, e, f, g, h, i, j) t24(a, b, c, d, e, f, g, h); t2(i, j);
+
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] // entry point: 5x5 median of texture_temporal, one thread per output pixel
+void main(uint3 DTid : SV_DispatchThreadID)
+{
+    const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; // texel-centre UV of this thread's pixel
+    const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+    if (depth == 0.0f)
+        return; // early-out writes nothing; `output` is left untouched for this pixel
+
+    half4 v[25]; // the 5x5 window; min/max inside s2 act per component, so each channel is filtered independently
+
+    // Add the pixels which make up our window to the pixel array.
+    [unroll]
+    for (int dX = -2; dX <= 2; ++dX)
+    {
+        [unroll]
+        for (int dY = -2; dY <= 2; ++dY)
+        {
+            float2 offset = float2(float(dX), float(dY)); // offset in whole texels, converted to UV below
+
+            // If a pixel in the window is located at (x+dX, y+dY), put it at index (dX + R)(2R + 1) + (dY + R) of the
+            // pixel array. This will fill the pixel array, with the top left pixel of the window at pixel[0] and the
+            // bottom right pixel of the window at pixel[N-1].
+            v[(dX + 2) * 5 + (dY + 2)] = texture_temporal.SampleLevel(sampler_linear_clamp, uv + offset * xPPResolution_rcp, 0);
+        }
+    }
+
+    half4 temp; // scratch value required by the s2 macro
+    t25(0, 1, 3, 4, 2, 4, 2, 3, 6, 7); // compare-exchange network from the median-filter reference cited with the macros; pair order matters, do not reorder
+    t25(5, 7, 5, 6, 9, 7, 1, 7, 1, 4);
+    t25(12, 13, 11, 13, 11, 12, 15, 16, 14, 16);
+    t25(14, 15, 18, 19, 17, 19, 17, 18, 21, 22);
+    t25(20, 22, 20, 21, 23, 24, 2, 5, 3, 6);
+    t25(0, 6, 0, 3, 4, 7, 1, 7, 1, 4);
+    t25(11, 14, 8, 14, 8, 11, 12, 15, 9, 15);
+    t25(9, 12, 13, 16, 10, 16, 10, 13, 20, 23);
+    t25(17, 23, 17, 20, 21, 24, 18, 24, 18, 21);
+    t25(19, 22, 8, 17, 9, 18, 0, 18, 0, 9);
+    t25(10, 19, 1, 19, 1, 10, 11, 20, 2, 20);
+    t25(2, 11, 12, 21, 3, 21, 3, 12, 13, 22);
+    t25(4, 22, 4, 13, 14, 23, 5, 23, 5, 14);
+    t25(15, 24, 6, 24, 6, 15, 7, 16, 7, 19);
+    t25(3, 11, 5, 17, 11, 17, 9, 17, 4, 10);
+    t25(6, 12, 7, 14, 4, 6, 4, 7, 12, 14);
+    t25(10, 14, 6, 7, 10, 12, 6, 10, 6, 17);
+    t25(12, 17, 7, 17, 7, 10, 12, 18, 7, 12);
+    t24(10, 18, 12, 20, 10, 20, 10, 12);
+
+    output[DTid.xy] = v[12]; // slot 12 is the middle of the 25 entries, where the network leaves the median
+}
diff --git a/WickedEngine/stochasticSSRCS_raytrace.hlsl b/WickedEngine/stochasticSSRCS_raytrace.hlsl
new file mode 100644
index 000000000..1f175199a
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_raytrace.hlsl
@@ -0,0 +1,296 @@
+#include "globals.hlsli"
+#include "stochasticSSRHF.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(input, float4, TEXSLOT_ONDEMAND0);
+TEXTURE2D(texture_lineardepth_minmax, float2, TEXSLOT_ONDEMAND1);
+
+RWTEXTURE2D(texture_raytrace, float4, 0);
+RWTEXTURE2D(texture_mask, float2, 1);
+
+// Use this to use reduced precision, but higher framerate:
+#define USE_LINEARDEPTH
+
+static const float rayTraceStride = 1.0f; // Step in horizontal or vertical pixels between samples.
+static const float rayTraceMaxStep = 512.0f; // Maximum number of iterations. Higher gives better images but may be slow.
+static const float rayTraceHitThickness = 1.5f; // Thickness to ascribe to each pixel in the depth buffer.
+static const float rayTraceHitThicknessBias = 7.0f; // Bias to control the thickness along distance.
+static const float rayTraceMaxDistance = 1000.0f; // Maximum camera-space distance to trace before returning a miss.
+static const float rayTraceStrideCutoff = 100.0f; // More distant pixels are smaller in screen space. This value tells at what point to
+                                                  // start relaxing the stride to give higher quality reflections for objects far from the camera.
+static const float raytraceHZBBias = 1.0f; // Scales how fast the sampled depth mip level grows per step (see `level` in ScreenSpaceRayTrace).
+// Squared distance between two 2D points.
+float DistanceSquared(float2 a, float2 b)
+{
+    a -= b;
+    return dot(a, a);
+}
+// True when scene depth `z` (after a small near-range bias) lies within [maxZ - thickness, minZ] of the ray's depth interval.
+bool intersectsDepthBuffer(float z, float minZ, float maxZ)
+{
+    // Increase thickness along distance.
+    // This keeps objects from disappearing in the distance.
+    float thicknessScale = min(1.0f, z / rayTraceStrideCutoff);
+    float thickness = rayTraceHitThickness * rayTraceHitThicknessBias * thicknessScale;
+    thickness = clamp(thickness, rayTraceHitThickness, 10.0f);
+
+    // Effectively remove line/tiny artifacts, mostly caused by Z-buffer precision.
+    float depthScale = min(1.0f, z / rayTraceStrideCutoff);
+    z += lerp(0.05f, 0.0f, depthScale);
+
+    return (minZ >= z) && (maxZ - thickness <= z);
+}
+// Marches the view-space ray (csOrig, csDir) across the screen; returns whether the last sample hit. Outputs: hitPixel (UV), hitPoint (view space), iterationCount (steps taken).
+// Heavily adapted from McGuire and Mara's original implementation
+// http://casual-effects.blogspot.com/2014/08/screen-space-ray-tracing.html
+bool ScreenSpaceRayTrace(float3 csOrig, float3 csDir, float jitter, float roughness, out float2 hitPixel, out float3 hitPoint, out float iterationCount)
+{
+    float rayLength = ((csOrig.z + csDir.z * rayTraceMaxDistance) < g_xCamera_ZNearP) ? (g_xCamera_ZNearP - csOrig.z) / csDir.z : rayTraceMaxDistance;
+    hitPixel = float2(0.0f, 0.0f); hitPoint = float3(0.0f, 0.0f, 0.0f); iterationCount = 0.0f; // out params: defined on every return path and before the first is_saturated(hitPixel) read
+    float3 csRayEnd = csOrig + csDir * rayLength;
+
+    // Project into homogeneous clip space
+    float4 clipRayOrigin = mul(g_xCamera_Proj, float4(csOrig, 1.0f));
+    float4 clipRayEnd = mul(g_xCamera_Proj, float4(csRayEnd, 1.0f));
+
+    float k0 = 1.0f / clipRayOrigin.w;
+    float k1 = 1.0f / clipRayEnd.w;
+
+    float3 Q0 = csOrig * k0;
+    float3 Q1 = csRayEnd * k1;
+
+    // Screen-space endpoints
+    float2 P0 = clipRayOrigin.xy * k0;
+    float2 P1 = clipRayEnd.xy * k1;
+
+    // Project to pixel
+    P0 = P0 * float2(0.5, -0.5) + float2(0.5, 0.5);
+    P1 = P1 * float2(0.5, -0.5) + float2(0.5, 0.5);
+
+    P0.xy *= xPPResolution.xy;
+    P1.xy *= xPPResolution.xy;
+
+#if 1
+    // Clip to the screen coordinates. Alternatively we could just modify rayTraceMaxStep instead
+    // This will also improve the framerate, without losing quality or features
+    float2 yDelta = float2(xPPResolution.y + 2.0f, -2.0f); // - 0.5, 0.5
+    float2 xDelta = float2(xPPResolution.x + 2.0f, -2.0f); // - 0.5, 0.5
+    float alpha = 0.0;
+
+    // Pull the end point P1 back inside the vertical bounds
+    if (P1.y > yDelta.x || P1.y < yDelta.y)
+    {
+        float yClip = (P1.y > yDelta.x) ? yDelta.x : yDelta.y;
+        float yAlpha = (P1.y - yClip) / (P1.y - P0.y);
+        alpha = yAlpha;
+    }
+
+    // Pull the end point P1 back inside the horizontal bounds
+    if (P1.x > xDelta.x || P1.x < xDelta.y)
+    {
+        float xClip = (P1.x > xDelta.x) ? xDelta.x : xDelta.y;
+        float xAlpha = (P1.x - xClip) / (P1.x - P0.x);
+        alpha = max(alpha, xAlpha);
+    }
+
+    // These are all in homogeneous space, so they interpolate linearly
+    P1 = lerp(P1, P0, alpha);
+    k1 = lerp(k1, k0, alpha);
+    Q1 = lerp(Q1, Q0, alpha);
+#endif
+
+    // If the line is degenerate, make it cover at least one pixel to avoid handling zero-pixel extent as a special case later
+    P1 += (DistanceSquared(P0, P1) < 0.0001f) ? float2(0.01f, 0.01f) : 0.0f;
+    float2 screenOffset = P1 - P0;
+
+    // Permute so that the primary iteration is in x to collapse all quadrant-specific DDA cases later
+    bool permute = false;
+    if (abs(screenOffset.x) < abs(screenOffset.y))
+    {
+        permute = true;
+        screenOffset = screenOffset.yx;
+        P0 = P0.yx;
+        P1 = P1.yx;
+    }
+
+    float stepDirection = sign(screenOffset.x);
+    float stepInterval = stepDirection / screenOffset.x;
+
+    // Track the derivatives of Q and k
+    float3 dQ = (Q1 - Q0) * stepInterval;
+    float dk = (k1 - k0) * stepInterval;
+
+    // Because we test 1/2 a texel forward along the ray, on the very last iteration
+    // the interpolation can go past the end of the ray. Use these bounds to clamp it.
+    float zMin = min(csRayEnd.z, csOrig.z);
+    float zMax = max(csRayEnd.z, csOrig.z);
+
+    float2 dP = float2(stepDirection, screenOffset.y * stepInterval);
+
+    // Scale derivatives by the desired pixel stride and then offset the starting values by the jitter fraction
+    float strideScale = 1.0f - min(1.0f, csOrig.z / rayTraceStrideCutoff);
+    float stride = 1.0f + strideScale * rayTraceStride;
+
+    dP *= stride;
+    dQ *= stride;
+    dk *= stride;
+
+    P0 += dP * jitter;
+    Q0 += dQ * jitter;
+    k0 += dk * jitter;
+
+    float4 PQk = float4(P0, Q0.z, k0);
+    float4 dPQk = float4(dP, dQ.z, dk);
+    float3 Q = Q0;
+
+    // Adjust end condition for iteration direction
+    float end = P1.x * stepDirection;
+
+    // raytrace iterations based on roughness
+    // Matte materials will get less samples
+    float roughnessTraceStep = max(rayTraceMaxStep * (1.0 - roughness), 1.0f);
+
+    float stepCount = 0.0f;
+    float level = 0.0f; // 1.0f start level. Parameter?
+
+    float prevZMaxEstimate = csOrig.z;
+    float rayZMin = prevZMaxEstimate;
+    float rayZMax = prevZMaxEstimate;
+    float sceneZMax = rayZMax + 100000.0f;
+
+    [loop]
+    for (; ((PQk.x * stepDirection) <= end) &&
+        (stepCount <= roughnessTraceStep - 1) &&
+        !intersectsDepthBuffer(sceneZMax, rayZMin, rayZMax) &&
+        (sceneZMax != 0.0f) &&
+        (level > -1);
+        PQk += dPQk, stepCount++)
+    {
+        if (!is_saturated(hitPixel)) // tests the UV produced by the previous iteration (zero on the first one)
+        {
+            return false;
+        }
+
+        rayZMin = prevZMaxEstimate;
+
+        // Compute the value at 1/2 step into the future
+        rayZMax = (dPQk.z * 0.5f + PQk.z) / (dPQk.w * 0.5f + PQk.w);
+        rayZMax = clamp(rayZMax, zMin, zMax);
+        prevZMaxEstimate = rayZMax;
+
+        [flatten]
+        if (rayTraceMaxDistance < rayZMax)
+        {
+            return false;
+        }
+
+        [flatten]
+        if (rayZMin > rayZMax)
+        {
+            float t = rayZMin;
+            rayZMin = rayZMax;
+            rayZMax = t;
+        }
+
+        // A simple HZB approach based on roughness
+        level += min(raytraceHZBBias / 10.0f, 5.0f) * roughness;
+
+        hitPixel = permute ? PQk.yx : PQk.xy;
+        hitPixel *= xPPResolution_rcp;
+
+        #ifdef USE_LINEARDEPTH
+        sceneZMax = texture_lineardepth_minmax.SampleLevel(sampler_point_clamp, hitPixel, level).g * g_xCamera_ZFarP;
+        #else
+        sceneZMax = getLinearDepth(texture_depth.SampleLevel(sampler_point_clamp, hitPixel, 0).r);
+        #endif
+    }
+
+    // Advance Q based on the number of steps
+    Q = float3(Q.xy + dQ.xy * stepCount, PQk.z); // z was marched inside PQk; Q.z alone would still hold the start value
+    hitPoint = Q * (1.0f / PQk.w);
+    iterationCount = stepCount;
+
+    return intersectsDepthBuffer(sceneZMax, rayZMin, rayZMax);
+}
+// main: picks one reflection direction per pixel (GGX importance sample, or the mirror direction when roughness <= 0.1), traces it, and stores the hit UV/depth/pdf and the hit mask.
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID)
+{
+    const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp;
+    const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+    if (depth == 0.0f)
+        return; // NOTE(review): neither output is written here, so these texels keep older contents that the resolve pass may still sample
+
+    // Everything in view space:
+    const float3 P = reconstructPosition(uv, depth, g_xCamera_InvP);
+    const float3 N = mul((float3x3)g_xCamera_View, decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy)).xyz;
+    const float3 V = normalize(P); // eye-to-surface: the incident direction expected by reflect()
+
+    const float roughness = GetRoughness(texture_gbuffer2.SampleLevel(sampler_point_clamp, uv, 0).g);
+
+    const float roughnessFade = GetRoughnessFade(roughness, SSRMaxRoughness);
+    if (roughnessFade <= 0.0f)
+    {
+        return;
+    }
+
+    float4 H; // xyz: half vector in the same space as N, w: pdf of the sample
+    if (roughness > 0.1f)
+    {
+        const float surfaceMargin = 0.0f;
+        const float maxRegenCount = 15.0f;
+
+        uint2 Random = Rand3DPCG16(int3((DTid.xy + 0.5f), g_xFrame_FrameCount)).xy;
+
+        // Pick the best rays
+        // Re-draw H (at most maxRegenCount times) until the reflected ray leaves the surface (RdotN > surfaceMargin).
+        float RdotN = 0.0f;
+        float regenCount = 0;
+        [loop]
+        for (; RdotN <= surfaceMargin && regenCount < maxRegenCount; regenCount++)
+        {
+            // Low-discrepancy sequence
+            //float2 Xi = float2(Random) * rcp(65536.0); // equivalent to HammersleyRandom(0, 1, Random).
+            float2 Xi = HammersleyRandom(regenCount, Random); // SingleSPP
+
+            Xi.y = lerp(Xi.y, 0.0f, BRDFBias);
+
+            // I should probably use importance sampling of visible normals http://jcgt.org/published/0007/04/01/paper.pdf
+            H = ImportanceSampleGGX(Xi, roughness);
+            H = TangentToWorld(H, N);
+
+            RdotN = dot(N, reflect(V, H.xyz));
+        }
+    }
+    else
+    {
+        H = float4(N.xyz, 1.0f);
+    }
+
+    float3 dir = reflect(V, H.xyz);
+
+    float2 hitPixel = float2(0.0f, 0.0f);
+    float3 hitPoint = float3(0.0f, 0.0f, 0.0f);
+    float iterationCount = 0.0f;
+
+    float2 uv2 = (DTid.xy + 0.5f);
+    //float jitter = 1.0f + rand(uv2 + g_xFrame_Time);
+    float jitter = 1.0f + InterleavedGradientNoise(uv2, g_xFrame_FrameCount);
+
+    bool hit = ScreenSpaceRayTrace(P, dir, jitter, roughness, hitPixel, hitPoint, iterationCount);
+
+    float hitDepth = texture_depth.SampleLevel(sampler_point_clamp, hitPixel, 0);
+
+    // Output:
+    // xy: hit pixel
+    // z: hit depth
+    // w: pdf
+    float4 raytrace = max(0, float4(hitPixel, hitDepth, H.w));
+    texture_raytrace[DTid.xy] = raytrace;
+
+    // Output:
+    // x: hit (bool)
+    // y: iteration count / rayTraceMaxStep
+    float2 mask = float2(hit, iterationCount / rayTraceMaxStep);
+    texture_mask[DTid.xy] = mask;
+}
diff --git a/WickedEngine/stochasticSSRCS_resolve.hlsl b/WickedEngine/stochasticSSRCS_resolve.hlsl
new file mode 100644
index 000000000..4cbdc3cd4
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_resolve.hlsl
@@ -0,0 +1,147 @@
+#include "globals.hlsli"
+#include "brdf.hlsli"
+#include "stochasticSSRHF.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(texture_raytrace, float4, TEXSLOT_ONDEMAND0);
+TEXTURE2D(texture_mask, float2, TEXSLOT_ONDEMAND1);
+TEXTURE2D(texture_main, float4, TEXSLOT_ONDEMAND2);
+
+RWTEXTURE2D(texture_resolve, float4, 0);
+
+static const float resolveSequenceSize = 20.0f; // Can help reduce noise on rough surfaces, but too high values tend to wash out contact points.
+static const float resolveMip = 1.0f;
+static const float resolveSSRIntensity = 1.0f;
+
+static const float blendScreenEdgeFade = 5.0f;
+static const bool blendReflectSky = true;
+// Blend factor for one traced sample: fades with the iteration ratio (to the 8th power) and towards the screen edges; forced to 0 on a miss when blendReflectSky is off.
+float CalculateBlendIntersection(bool hit, float iterationStep, float2 hitPixel)
+{
+    float confidence = 1.0 - pow(iterationStep, 8.0f);
+    float2 hitPixelNDC = hitPixel * 2.0 - 1.0;
+
+    //float maxDimension = min(1.0, max(abs(hitPixelNDC.x), abs(hitPixelNDC.y)));
+    //float attenuation = 1.0 - max(0.0, maxDimension - blendScreenEdgeFade) / (1.0 - blendScreenEdgeFade);
+
+    float2 vignette = saturate(abs(hitPixelNDC) * blendScreenEdgeFade - (blendScreenEdgeFade - 1.0f));
+    float attenuation = saturate(1.0 - dot(vignette, vignette));
+
+    float blend = confidence * attenuation;
+
+    if (!hit && !blendReflectSky)
+        blend = 0.0;
+
+    return blend;
+}
+// Per-axis scale for the resolve sample offsets, derived from the view-space normal.
+// I probably need to figure out a better way to deal with this.
+float2 CalculateTailDirection(float3 viewNormal)
+{
+    float3 upVector = abs(viewNormal.z) < 0.999 ? float3(0.0, 0.0, 1.0) : float3(1.0, 0.0, 0.0);
+    float3 T = normalize(cross(upVector, viewNormal));
+
+    float tailDirection = T.x * -viewNormal.y;
+
+    return lerp(float2(1.0, 0.1), float2(0.1, 1.0), tailDirection);
+}
+// main: gathers NumResolve neighbouring ray results, weights each by (visibility * distribution * NdotL) / pdf, and writes the normalised, faded reflection.
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID)
+{
+    const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp;
+    const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+    if (depth == 0.0f)
+        return; // nothing is written for these pixels
+
+    // Everything in view space:
+    const float3 P = reconstructPosition(uv, depth, g_xCamera_InvP);
+    const float3 N = mul((float3x3) g_xCamera_View, decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy)).xyz;
+    const float3 V = normalize(-P);
+    const float NdotV = saturate(dot(N, V));
+
+    const float roughness = GetRoughness(texture_gbuffer2.SampleLevel(sampler_point_clamp, uv, 0).g);
+    const float roughnessSequenceSize = resolveSequenceSize * roughness + 1.0f;
+
+    // Early out, useless if the roughness is out of range
+    float roughnessFade = GetRoughnessFade(roughness, SSRMaxRoughness);
+    if (roughnessFade <= 0.0f)
+    {
+        texture_resolve[DTid.xy] = 0;
+        return;
+    }
+
+    float specularConeTangent = lerp(0.0, roughness * (1.0 - BRDFBias), NdotV * sqrt(roughness));
+    specularConeTangent *= lerp(saturate(NdotV * 2), 1.0f, sqrt(roughness));
+
+    const float maxMipLevel = 11.0f - 1.0f;
+    const uint2 Random = Rand3DPCG16(int3((DTid.xy + 0.5f), g_xFrame_FrameCount)).xy;
+
+    float4 result = 0.0f;
+    float weightSum = 0.0f;
+
+    const uint NumResolve = 4;
+    [unroll]
+    for (uint i = 0; i < NumResolve; i++)
+    {
+        float2 offsetRotation = (HammersleyRandom(i, NumResolve, Random) * 2.0 - 1.0) * roughnessSequenceSize;
+        float2x2 offsetRotationMatrix = float2x2(offsetRotation.x, offsetRotation.y, -offsetRotation.y, offsetRotation.x);
+
+        float2 offsetUV = offset[i] * (1.0f / xPPResolution);
+        offsetUV = uv + mul(offsetRotationMatrix, offsetUV) * CalculateTailDirection(N);
+
+        float4 raytraceSource = texture_raytrace.SampleLevel(sampler_point_clamp, offsetUV, 0);
+        float2 maskSource = texture_mask.SampleLevel(sampler_point_clamp, offsetUV, 0);
+
+        float2 hitPixel = raytraceSource.xy;
+        float hitDepth = raytraceSource.z;
+        float hitPDF = raytraceSource.w;
+        bool hit = (bool)maskSource.x;
+        float iterationStep = maskSource.y;
+
+        float intersectionCircleRadius = specularConeTangent * length(hitPixel - uv);
+        float sourceMip = clamp(log2(intersectionCircleRadius * max(xPPResolution.x, xPPResolution.y)), 0.0, maxMipLevel) * resolveMip;
+
+        float4 sampleColor;
+        sampleColor.rgb = texture_main.SampleLevel(sampler_linear_clamp, hitPixel, sourceMip).xyz;
+        sampleColor.a = CalculateBlendIntersection(hit, iterationStep, hitPixel);
+
+        sampleColor.rgb /= 1 + Luminance(sampleColor.rgb); // compress HDR before averaging; undone after the loop
+
+        // BRDF
+
+        float3 hitViewPosition = reconstructPosition(hitPixel, hitDepth, g_xCamera_InvP);
+
+        float3 L = normalize(hitViewPosition - P);
+        float3 H = normalize(L + V);
+
+        float NdotH = saturate(dot(N, H));
+        float NdotL = saturate(dot(N, L));
+
+        Surface surface;
+        surface.alphaRoughnessSq = pow(roughness, 4);
+
+        SurfaceToLight surfaceToLight;
+        surfaceToLight.NdotH = NdotH;
+        surfaceToLight.NdotL = NdotL;
+        surfaceToLight.NdotV = NdotV;
+
+        // We could simply use BRDF_GetSpecular, but we exclude fresnel for later
+        float Vis = visibilityOcclusion(surface, surfaceToLight);
+        float D = microfacetDistribution(surface, surfaceToLight);
+        float specularLight = Vis * D * surfaceToLight.NdotL;
+
+        float weight = specularLight / max(hitPDF, 0.00001f);
+
+        result += sampleColor * weight;
+        weightSum += weight;
+    }
+    result /= max(weightSum, 0.00001f); // every weight can be 0 (e.g. NdotL == 0 for all samples); avoid 0/0
+
+    result.rgb /= max(1 - Luminance(result.rgb), 0.00001f); // inverse of the per-sample compression; keep the divisor positive
+
+    result *= roughnessFade;
+    result *= resolveSSRIntensity;
+
+    texture_resolve[DTid.xy] = max(result, 0.00001f);
+}
diff --git a/WickedEngine/stochasticSSRCS_temporal.hlsl b/WickedEngine/stochasticSSRCS_temporal.hlsl
new file mode 100644
index 000000000..4a1a3ad82
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_temporal.hlsl
@@ -0,0 +1,171 @@
+#include "globals.hlsli"
+#include "stochasticSSRHF.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(resolve_current, float4, TEXSLOT_ONDEMAND0);
+TEXTURE2D(resolve_history, float4, TEXSLOT_ONDEMAND1);
+TEXTURE2D(texture_raytrace, float4, TEXSLOT_ONDEMAND2);
+
+RWTEXTURE2D(output, float4, 0);
+
+static const float temporalResponseMin = 0.85f; // history weight when current and history luminance differ most
+static const float temporalResponseMax = 1.0f; // history weight when they match
+static const float temporalScale = 2.0f; // half-size of the clipping box, in standard deviations
+static const float temporalExposure = 10.0f; // exposure passed to HdrWeight4
+// Cheap luma estimate: r + 2g + b.
+inline float Luma4(float3 color)
+{
+    return (color.g * 2) + (color.r + color.b);
+}
+// Weight that shrinks as the (exposure-scaled) luma grows, so bright samples contribute less to the average.
+inline float HdrWeight4(float3 color, float exposure)
+{
+    return rcp(Luma4(color) * exposure + 4.0f);
+}
+// Returns q unchanged when its rgb lies inside the box [aabb_min, aabb_max]; otherwise pulls it towards the box centre until it touches the box. p only supplies the w of the centre.
+float4 clip_aabb(float3 aabb_min, float3 aabb_max, float4 p, float4 q)
+{
+    float3 p_clip = 0.5 * (aabb_max + aabb_min);
+    float3 e_clip = 0.5 * (aabb_max - aabb_min) + 0.00000001f;
+
+    float4 v_clip = q - float4(p_clip, p.w);
+    float3 v_unit = v_clip.xyz / e_clip;
+    float3 a_unit = abs(v_unit);
+    float ma_unit = max(a_unit.x, max(a_unit.y, a_unit.z));
+
+    if (ma_unit > 1.0)
+        return float4(p_clip, p.w) + v_clip / ma_unit;
+    else
+        return q; // point inside aabb
+}
+// Reads the 3x3 neighbourhood of uv and returns its HDR-weighted average (currentOutput), plain mean (currentAverage) and mean +/- AABBScale * stddev bounds. `sharpness` is unused; `texelSize` is a divisor, i.e. the resolution.
+inline void ResolverAABB(Texture2D currentColor, SamplerState currentSampler, float sharpness, float exposureScale, float AABBScale, float2 uv, float2 texelSize, inout float4 currentMin, inout float4 currentMax, inout float4 currentAverage, inout float4 currentOutput)
+{
+    const int2 SampleOffset[9] = { int2(-1.0, -1.0), int2(0.0, -1.0), int2(1.0, -1.0), int2(-1.0, 0.0), int2(0.0, 0.0), int2(1.0, 0.0), int2(-1.0, 1.0), int2(0.0, 1.0), int2(1.0, 1.0) };
+
+    // Modulate Luma HDR
+
+    float4 sampleColors[9];
+    [unroll]
+    for (uint i = 0; i < 9; i++)
+    {
+        sampleColors[i] = currentColor.SampleLevel(currentSampler, uv + (SampleOffset[i] / texelSize), 0.0f);
+    }
+
+    float sampleWeights[9];
+    [unroll]
+    for (uint j = 0; j < 9; j++)
+    {
+        sampleWeights[j] = HdrWeight4(sampleColors[j].rgb, exposureScale);
+    }
+
+    float totalWeight = 0;
+    [unroll]
+    for (uint k = 0; k < 9; k++)
+    {
+        totalWeight += sampleWeights[k];
+    }
+    sampleColors[4] = (sampleColors[0] * sampleWeights[0] + sampleColors[1] * sampleWeights[1] + sampleColors[2] * sampleWeights[2] + sampleColors[3] * sampleWeights[3] + sampleColors[4] * sampleWeights[4] +
+        sampleColors[5] * sampleWeights[5] + sampleColors[6] * sampleWeights[6] + sampleColors[7] * sampleWeights[7] + sampleColors[8] * sampleWeights[8]) / totalWeight;
+
+    // Variance Clipping (AABB)
+
+    float4 m1 = 0.0;
+    float4 m2 = 0.0;
+    [unroll]
+    for (uint x = 0; x < 9; x++)
+    {
+        m1 += sampleColors[x];
+        m2 += sampleColors[x] * sampleColors[x];
+    }
+
+    float4 mean = m1 / 9.0;
+    float4 stddev = sqrt(max(0.0f, (m2 / 9.0) - sqr(mean))); // rounding can push the variance slightly below 0, and sqrt of that is NaN
+
+    currentMin = mean - AABBScale * stddev;
+    currentMax = mean + AABBScale * stddev;
+
+    currentOutput = sampleColors[4];
+    currentMin = min(currentMin, currentOutput);
+    currentMax = max(currentMax, currentOutput);
+    currentAverage = mean;
+}
+// Screen-space motion (0..1 units, this frame minus previous frame) of the point at (uv, depth), caused by the camera change alone.
+float2 CalculateCustomMotion(float depth, float2 uv)
+{
+    float4 sampleWorldPosition = float4(reconstructPosition(uv, depth, g_xCamera_InvVP), 1.0f);
+
+    float4 thisClip = mul(g_xCamera_VP, sampleWorldPosition);
+    float4 prevClip = mul(g_xFrame_MainCamera_PrevVP, sampleWorldPosition);
+
+    float2 thisScreen = thisClip.xy * rcp(thisClip.w);
+    float2 prevScreen = prevClip.xy * rcp(prevClip.w);
+    thisScreen = (thisScreen.xy + 1.0f) / 2.0f;
+    prevScreen = (prevScreen.xy + 1.0f) / 2.0f;
+
+    return thisScreen - prevScreen;
+}
+// main: reprojects the history buffer, clips it to the current neighbourhood's colour box, and blends it with the current resolve result.
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex)
+{
+    const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp;
+    const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+
+    const float3 worldNormal = decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy); // not used below
+
+    float4 raytraceSource = texture_raytrace.SampleLevel(sampler_point_clamp, uv, 0);
+    float hitDepth = raytraceSource.z;
+    float2 hitPixel = raytraceSource.xy;
+
+    // Calculate custom motion vectors to counter smearing, which we would get by using normal gbuffer velocity
+
+    float2 reflectionCustomVelocity = CalculateCustomMotion(hitDepth, uv);
+    float2 hitCustomVelocity = CalculateCustomMotion(hitDepth, hitPixel);
+    float2 customVelocity = CalculateCustomMotion(depth, uv);
+
+    float2 standardHitVelocity = texture_gbuffer1.SampleLevel(sampler_point_clamp, hitPixel, 0).zw;
+    float2 standardVelocity = texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).zw;
+
+    float2 velocityDifference = customVelocity - standardVelocity;
+    float2 hitVelocityDifference = hitCustomVelocity - standardHitVelocity;
+
+    float objectVelocityMask = saturate(dot(velocityDifference, velocityDifference) * xPPResolution_rcp.x * 100.0f);
+    float hitObjectVelocityMask = saturate(dot(hitVelocityDifference, hitVelocityDifference) * xPPResolution_rcp.x * 100.0f);
+
+    float2 objectVelocity = standardVelocity * objectVelocityMask;
+    float2 hitObjectVelocity = standardHitVelocity * hitObjectVelocityMask;
+
+    float2 velocity = lerp(lerp(reflectionCustomVelocity, hitObjectVelocity, hitObjectVelocityMask), objectVelocity, objectVelocityMask);
+    float2 prevUV = float2(uv.x - velocity.x, uv.y + velocity.y);
+
+    float4 previous = resolve_history.SampleLevel(sampler_linear_clamp, prevUV, 0);
+
+    // Luma HDR and AABB minmax
+
+    float4 current = 0;
+    float4 currentMin = 0, currentMax = 0, currentAverage = 0; // passed as inout below, so they must hold a defined value
+    ResolverAABB(resolve_current, sampler_linear_clamp, 0, temporalExposure, temporalScale, uv, xPPResolution, currentMin, currentMax, currentAverage, current);
+
+    previous.xyz = clip_aabb(currentMin.xyz, currentMax.xyz, clamp(currentAverage, currentMin, currentMax), previous).xyz;
+    previous.a = clamp(previous.a, currentMin.a, currentMax.a);
+
+    // Blend color & history
+    // Feedback weight from unbiased luminance difference (Timothy Lottes)
+
+    float lumFiltered = Luminance(current.rgb); // Luma4(current.rgb)
+    float lumHistory = Luminance(previous.rgb);
+
+    float lumDifference = abs(lumFiltered - lumHistory) / max(lumFiltered, max(lumHistory, 0.2f));
+    float lumWeight = sqr(1.0f - lumDifference);
+    float blendFinal = lerp(temporalResponseMin, temporalResponseMax, lumWeight);
+
+    // Reduce ghosting by refreshing the blend by velocity... but adds additional noise
+    //float2 velocityScreen = velocity * xPPResolution;
+    //float velocityBlend = sqrt(dot(velocityScreen, velocityScreen));
+    //blendFinal = lerp(blendFinal, 0.2f, saturate(velocityBlend / 100.0f));
+
+    float4 result = lerp(current, previous, blendFinal);
+
+    output[DTid.xy] = result;
+}
diff --git a/WickedEngine/stochasticSSRHF.hlsli b/WickedEngine/stochasticSSRHF.hlsli
new file mode 100644
index 000000000..de0df5555
--- /dev/null
+++ b/WickedEngine/stochasticSSRHF.hlsli
@@ -0,0 +1,116 @@
+
+#ifndef WI_STOCHASTICSSR_HF
+#define WI_STOCHASTICSSR_HF
+
+// Shared SSR settings:
+static const float SSRMaxRoughness = 1.0f; // Specify max roughness, this can improve performance in complex scenes.
+static const float BRDFBias = 0.7f; + +float ComputeRoughnessMaskScale(in float maxRoughness) +{ + float MaxRoughness = clamp(maxRoughness, 0.01f, 1.0f); + + float roughnessMaskScale = -2.0f / MaxRoughness; + return roughnessMaskScale * 1.0f; // 2.0f & 1.0f +} + +float GetRoughnessFade(in float roughness, in float maxRoughness) +{ + float roughnessMaskScale = ComputeRoughnessMaskScale(maxRoughness); + return min(roughness * roughnessMaskScale + 2, 1.0f); +} + +float GetRoughness(float roughness) +{ + return max(roughness, 0.02f); +} + +float Luminance(float3 color) +{ + return dot(color, float3(0.2126, 0.7152, 0.0722)); +} + +static const float2 offset[9] = +{ + float2(-2.0, -2.0), + float2(0.0, -2.0), + float2(2.0, -2.0), + float2(-2.0, 0.0), + float2(0.0, 0.0), + float2(2.0, 0.0), + float2(-2.0, 2.0), + float2(0.0, 2.0), + float2(2.0, 2.0) +}; + + +uint3 Rand3DPCG16(int3 p) +{ + uint3 v = uint3(p); + + v = v * 1664525u + 1013904223u; + + v.x += v.y * v.z; + v.y += v.z * v.x; + v.z += v.x * v.y; + v.x += v.y * v.z; + v.y += v.z * v.x; + v.z += v.x * v.y; + + // only top 16 bits are well shuffled + return v >> 16u; +} + +// Brian Karis, Epic Games "Real Shading in Unreal Engine 4" +float4 ImportanceSampleGGX(float2 Xi, float Roughness) +{ + float m = Roughness * Roughness; + float m2 = m * m; + + float Phi = 2 * PI * Xi.x; + + float CosTheta = sqrt((1.0 - Xi.y) / (1.0 + (m2 - 1.0) * Xi.y)); + float SinTheta = sqrt(max(1e-5, 1.0 - CosTheta * CosTheta)); + + float3 H; + H.x = SinTheta * cos(Phi); + H.y = SinTheta * sin(Phi); + H.z = CosTheta; + + float d = (CosTheta * m2 - CosTheta) * CosTheta + 1; + float D = m2 / (PI * d * d); + float pdf = D * CosTheta; + + return float4(H, pdf); +} + +// [ Duff et al. 2017, "Building an Orthonormal Basis, Revisited" ] +// http://jcgt.org/published/0006/01/01/ +float3x3 GetTangentBasis(float3 TangentZ) +{ + const float Sign = TangentZ.z >= 0 ? 
1 : -1; + const float a = -rcp(Sign + TangentZ.z); + const float b = TangentZ.x * TangentZ.y * a; + + float3 TangentX = { 1 + Sign * a * TangentZ.x * TangentZ.x, Sign * b, -Sign * TangentZ.x }; // squared by multiplication: pow() is not defined for a negative base + float3 TangentY = { b, Sign + a * TangentZ.y * TangentZ.y, -TangentZ.y }; // same reason: TangentZ.y may be negative + + return float3x3(TangentX, TangentY, TangentZ); +} + +float3 TangentToWorld(float3 vec, float3 tangentZ) +{ + return mul(vec, GetTangentBasis(tangentZ)); +} + +float4 TangentToWorld(float4 H, float3 tangentZ) +{ + return float4(mul(H.xyz, GetTangentBasis(tangentZ)), H.w); +} + +float3 WorldToTangent(float3 vec, float3 tangentZ) +{ + return mul(GetTangentBasis(tangentZ), vec); +} + +#endif // WI_STOCHASTICSSR_HF diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h index 8ee4592f5..5ac9cbfd7 100644 --- a/WickedEngine/wiEnums.h +++ b/WickedEngine/wiEnums.h @@ -300,6 +300,11 @@ enum CSTYPES CSTYPE_POSTPROCESS_BLUR_BILATERAL_UNORM4, CSTYPE_POSTPROCESS_SSAO, CSTYPE_POSTPROCESS_SSR, + CSTYPE_POSTPROCESS_STOCHASTICSSR_RAYTRACE, + CSTYPE_POSTPROCESS_STOCHASTICSSR_RESOLVE, + CSTYPE_POSTPROCESS_STOCHASTICSSR_TEMPORAL, + CSTYPE_POSTPROCESS_STOCHASTICSSR_MEDIAN, + CSTYPE_POSTPROCESS_STOCHASTICSSR_COMBINE, CSTYPE_POSTPROCESS_LIGHTSHAFTS, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL, diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index 2d7fc4cce..6422c0ad5 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -1333,6 +1333,11 @@ void LoadShaders() wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_BLUR_BILATERAL_UNORM4], "blur_bilateral_unorm4CS.cso"); }); wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_SSAO], "ssaoCS.cso"); }); wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_SSR], "ssrCS.cso"); }); + wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RAYTRACE],
"stochasticSSRCS_raytrace.cso"); }); + wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RESOLVE], "stochasticSSRCS_resolve.cso"); }); + wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_TEMPORAL], "stochasticSSRCS_temporal.cso"); }); + wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_MEDIAN], "stochasticSSRCS_median.cso"); }); + wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_COMBINE], "stochasticSSRCS_combine.cso"); }); wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_LIGHTSHAFTS], "lightshaftsCS.cso"); }); wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL], "depthoffield_tileMaxCOC_horizontalCS.cso"); }); wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL], "depthoffield_tileMaxCOC_verticalCS.cso"); }); @@ -8843,6 +8848,256 @@ void Postprocess_SSR( wiProfiler::EndRange(range); device->EventEnd(cmd); } +void Postprocess_StochasticSSR( + const Texture& input, + const Texture& depthbuffer, + const Texture& lineardepth_minmax, + const Texture& gbuffer0, + const Texture& gbuffer1, + const Texture& gbuffer2, + const Texture& output, + CommandList cmd +) +{ + GraphicsDevice* device = GetDevice(); + + device->EventBegin("Postprocess_StochasticSSR", cmd); + auto range = wiProfiler::BeginRangeGPU("Stochastic SSR", cmd); + + device->UnbindResources(TEXSLOT_RENDERPATH_SSR, 1, cmd); + + const TextureDesc& desc = output.GetDesc(); + + static TextureDesc initialized_desc; + static Texture texture_main; + static Texture texture_raytrace; + static Texture texture_mask; + static Texture texture_resolve; + static Texture texture_temporal[2]; + static Texture texture_median; + + // Initialize once + if (initialized_desc.Width != desc.Width || 
initialized_desc.Height != desc.Height) + { + initialized_desc = desc; + + TextureDesc main_desc; + main_desc.type = TextureDesc::TEXTURE_2D; + main_desc.Width = desc.Width; + main_desc.Height = desc.Height; + main_desc.Format = FORMAT_R16G16B16A16_FLOAT; + main_desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS; + main_desc.MipLevels = 0; // full mip chain + device->CreateTexture(&main_desc, nullptr, &texture_main); + + main_desc = texture_main.GetDesc(); // mip count was initialized in CreateTexture() + for (uint32_t i = 0; i < main_desc.MipLevels; ++i) + { + int subresource_index; + subresource_index = device->CreateSubresource(&texture_main, SRV, 0, 1, i, 1); + assert(subresource_index == i); + subresource_index = device->CreateSubresource(&texture_main, UAV, 0, 1, i, 1); + assert(subresource_index == i); + } + + TextureDesc cast_desc; + cast_desc.type = TextureDesc::TEXTURE_2D; + cast_desc.Width = desc.Width / 2; // the raytrace and mask targets are created at half resolution + cast_desc.Height = desc.Height / 2; + cast_desc.Format = FORMAT_R16G16B16A16_FLOAT; + cast_desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS; + device->CreateTexture(&cast_desc, nullptr, &texture_raytrace); + cast_desc.Format = FORMAT_R16G16_FLOAT; + device->CreateTexture(&cast_desc, nullptr, &texture_mask); + + TextureDesc buffer_desc; + buffer_desc.type = TextureDesc::TEXTURE_2D; + buffer_desc.Width = desc.Width; // the resolve, temporal and median targets match the output resolution + buffer_desc.Height = desc.Height; + buffer_desc.Format = FORMAT_R16G16B16A16_FLOAT; + buffer_desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS; + device->CreateTexture(&buffer_desc, nullptr, &texture_resolve); + device->CreateTexture(&buffer_desc, nullptr, &texture_temporal[0]); + device->CreateTexture(&buffer_desc, nullptr, &texture_temporal[1]); + device->CreateTexture(&buffer_desc, nullptr, &texture_median); + } + + // This is very expensive. There is probably a better way of getting the LODs of the input. + // For now the input is simply copied, to stay on the safe side.
+ + // Main buffer copy and mip: + { + device->EventBegin("Main buffer pass", cmd); + + CopyTexture2D(texture_main, 0, 0, 0, input, 0, cmd); + GenerateMipChain(texture_main, MIPGENFILTER_GAUSSIAN, cmd); + + device->EventEnd(cmd); + } + + // Switch to half res + PostProcessCB cb; + cb.xPPResolution.x = desc.Width / 2; + cb.xPPResolution.y = desc.Height / 2; + cb.xPPResolution_rcp.x = 1.0f / cb.xPPResolution.x; + cb.xPPResolution_rcp.y = 1.0f / cb.xPPResolution.y; + device->UpdateBuffer(&constantBuffers[CBTYPE_POSTPROCESS], &cb, cmd); + device->BindConstantBuffer(CS, &constantBuffers[CBTYPE_POSTPROCESS], CB_GETBINDSLOT(PostProcessCB), cmd); + + // Raytrace pass: + { + device->EventBegin("Stochastic Raytrace pass", cmd); + device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RAYTRACE], cmd); + + device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd); + device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd); + device->BindResource(CS, &gbuffer2, TEXSLOT_GBUFFER2, cmd); + device->BindResource(CS, &input, TEXSLOT_ONDEMAND0, cmd); + device->BindResource(CS, &lineardepth_minmax, TEXSLOT_ONDEMAND1, cmd); + + const GPUResource* uavs[] = { + &texture_raytrace, + &texture_mask, + }; + device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (texture_raytrace.GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, // groups in X: width divided by block size, rounded up + (texture_raytrace.GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, // groups in Y: must be derived from the height so every row is covered + 1, + cmd + ); + + device->Barrier(&GPUBarrier::Memory(), 1, cmd); + device->UnbindUAVs(0, arraysize(uavs), cmd); + device->EventEnd(cmd); + } + + // Switch to full res + cb.xPPResolution.x = desc.Width; + cb.xPPResolution.y = desc.Height; + cb.xPPResolution_rcp.x = 1.0f / cb.xPPResolution.x; + cb.xPPResolution_rcp.y = 1.0f / cb.xPPResolution.y; + device->UpdateBuffer(&constantBuffers[CBTYPE_POSTPROCESS], &cb, cmd); + device->BindConstantBuffer(CS, &constantBuffers[CBTYPE_POSTPROCESS], CB_GETBINDSLOT(PostProcessCB),
cmd); + + // Resolve pass: + { + device->EventBegin("Resolve pass", cmd); + device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RESOLVE], cmd); + + device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd); + device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd); + device->BindResource(CS, &gbuffer2, TEXSLOT_GBUFFER2, cmd); + device->BindResource(CS, &texture_raytrace, TEXSLOT_ONDEMAND0, cmd); + device->BindResource(CS, &texture_mask, TEXSLOT_ONDEMAND1, cmd); + device->BindResource(CS, &texture_main, TEXSLOT_ONDEMAND2, cmd); + + const GPUResource* uavs[] = { + &texture_resolve, + }; + device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (texture_resolve.GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (texture_resolve.GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + device->Barrier(&GPUBarrier::Memory(), 1, cmd); + device->UnbindUAVs(0, arraysize(uavs), cmd); + device->EventEnd(cmd); + } + + int temporal_output = device->GetFrameCount() % 2; + int temporal_history = 1 - temporal_output; + + // Temporal pass: + { + device->EventBegin("Temporal pass", cmd); + device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_TEMPORAL], cmd); + + device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd); + device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd); + device->BindResource(CS, &texture_resolve, TEXSLOT_ONDEMAND0, cmd); + device->BindResource(CS, &texture_temporal[temporal_history], TEXSLOT_ONDEMAND1, cmd); + device->BindResource(CS, &texture_raytrace, TEXSLOT_ONDEMAND2, cmd); + + const GPUResource* uavs[] = { + &texture_temporal[temporal_output], + }; + device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (texture_temporal[temporal_output].GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (texture_temporal[temporal_output].GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / 
POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + device->Barrier(&GPUBarrier::Memory(), 1, cmd); + device->UnbindUAVs(0, arraysize(uavs), cmd); + device->EventEnd(cmd); + } + + // Median blur pass: + { + device->EventBegin("Median blur pass", cmd); + device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_MEDIAN], cmd); + + device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd); + device->BindResource(CS, &texture_temporal[temporal_output], TEXSLOT_ONDEMAND0, cmd); + + const GPUResource* uavs[] = { + &texture_median, + }; + device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (texture_median.GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (texture_median.GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + device->Barrier(&GPUBarrier::Memory(), 1, cmd); + device->UnbindUAVs(0, arraysize(uavs), cmd); + device->EventEnd(cmd); + } + //Postprocess_Blur_Bilateral(texture_temporal[temporal_output], lineardepth, texture_temp, output, cmd, 0.85f, 0.85f, 1.2f); + + // combine pass: + { + device->EventBegin("Combine pass", cmd); + device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_COMBINE], cmd); + + device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd); + device->BindResource(CS, &gbuffer0, TEXSLOT_GBUFFER0, cmd); + device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd); + device->BindResource(CS, &gbuffer2, TEXSLOT_GBUFFER2, cmd); + device->BindResource(CS, &texture_median, TEXSLOT_ONDEMAND0, cmd); + + const GPUResource* uavs[] = { + &output, + }; + device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (desc.Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (desc.Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + device->Barrier(&GPUBarrier::Memory(), 1, cmd); + device->UnbindUAVs(0, arraysize(uavs), cmd); + device->EventEnd(cmd); + } + + wiProfiler::EndRange(range); + 
device->EventEnd(cmd); +} void Postprocess_SSS( const Texture& lineardepth, const Texture& gbuffer0, diff --git a/WickedEngine/wiRenderer.h b/WickedEngine/wiRenderer.h index 509887a05..2d79f8987 100644 --- a/WickedEngine/wiRenderer.h +++ b/WickedEngine/wiRenderer.h @@ -208,6 +208,16 @@ namespace wiRenderer const wiGraphics::Texture& output, wiGraphics::CommandList cmd ); + void Postprocess_StochasticSSR( + const wiGraphics::Texture& input, + const wiGraphics::Texture& depthbuffer, + const wiGraphics::Texture& lineardepth_minmax, + const wiGraphics::Texture& gbuffer0, + const wiGraphics::Texture& gbuffer1, + const wiGraphics::Texture& gbuffer2, + const wiGraphics::Texture& output, + wiGraphics::CommandList cmd + ); void Postprocess_SSS( const wiGraphics::Texture& lineardepth, const wiGraphics::Texture& gbuffer0, diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index dcf1facb1..b0f45b633 100644 --- a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wiVersion // minor features, major updates const int minor = 38; // minor bug fixes, alterations, refactors, updates - const int revision = 7; + const int revision = 8; long GetVersion()