diff --git a/WickedEngine/RenderPath3D.cpp b/WickedEngine/RenderPath3D.cpp
index 74ae3bd37..128d65e63 100644
--- a/WickedEngine/RenderPath3D.cpp
+++ b/WickedEngine/RenderPath3D.cpp
@@ -36,6 +36,15 @@ void RenderPath3D::ResizeBuffers()
assert(subresource_index == i);
}
}
+ {
+ TextureDesc desc;
+ desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS;
+ desc.Format = FORMAT_R16G16B16A16_FLOAT;
+ desc.Width = wiRenderer::GetInternalResolution().x;
+ desc.Height = wiRenderer::GetInternalResolution().y;
+ device->CreateTexture(&desc, nullptr, &rtStochasticSSR);
+ device->SetName(&rtStochasticSSR, "rtStochasticSSR");
+ }
{
TextureDesc desc;
desc.BindFlags = BIND_RENDER_TARGET | BIND_SHADER_RESOURCE;
@@ -441,6 +450,13 @@ void RenderPath3D::RenderSSR(const Texture& srcSceneRT, const wiGraphics::Textur
wiRenderer::Postprocess_SSR(srcSceneRT, depthBuffer_Copy, rtLinearDepth_minmax, gbuffer1, rtSSR, cmd);
}
}
+void RenderPath3D::RenderStochasticSSR(const Texture& srcSceneRT, const wiGraphics::Texture& gbuffer0, const wiGraphics::Texture& gbuffer1, const wiGraphics::Texture& gbuffer2, CommandList cmd) const
+{
+ // Stochastic SSR is gated by the same switch as regular SSR; when it is off this function does nothing.
+ if (!getSSREnabled())
+ return;
+ wiRenderer::Postprocess_StochasticSSR(srcSceneRT, depthBuffer_Copy, rtLinearDepth_minmax, gbuffer0, gbuffer1, gbuffer2, rtStochasticSSR, cmd);
+}
void RenderPath3D::DownsampleDepthBuffer(CommandList cmd) const
{
GraphicsDevice* device = wiRenderer::GetDevice();
diff --git a/WickedEngine/RenderPath3D.h b/WickedEngine/RenderPath3D.h
index 01fd00023..efbe58821 100644
--- a/WickedEngine/RenderPath3D.h
+++ b/WickedEngine/RenderPath3D.h
@@ -51,7 +51,8 @@ private:
protected:
wiGraphics::Texture rtReflection; // conains the scene rendered for planar reflections
- wiGraphics::Texture rtSSR; // screen-space reflection results
+ wiGraphics::Texture rtSSR; // standard screen-space reflection results
+ wiGraphics::Texture rtStochasticSSR; // stochastic screen-space reflection results
wiGraphics::Texture rtSceneCopy; // contains the rendered scene that can be fed into transparent pass for distortion effect
wiGraphics::Texture rtWaterRipple; // water ripple sprite normal maps are rendered into this
wiGraphics::Texture rtParticleDistortion; // contains distortive particles
@@ -102,6 +103,7 @@ protected:
virtual void RenderLinearDepth(wiGraphics::CommandList cmd) const;
virtual void RenderSSAO(wiGraphics::CommandList cmd) const;
virtual void RenderSSR(const wiGraphics::Texture& srcSceneRT, const wiGraphics::Texture& gbuffer1, wiGraphics::CommandList cmd) const;
+ virtual void RenderStochasticSSR(const wiGraphics::Texture& srcSceneRT, const wiGraphics::Texture& gbuffer0, const wiGraphics::Texture& gbuffer1, const wiGraphics::Texture& gbuffer2, wiGraphics::CommandList cmd) const;
virtual void DownsampleDepthBuffer(wiGraphics::CommandList cmd) const;
virtual void RenderOutline(const wiGraphics::Texture& dstSceneRT, wiGraphics::CommandList cmd) const;
virtual void RenderLightShafts(wiGraphics::CommandList cmd) const;
diff --git a/WickedEngine/RenderPath3D_Deferred.cpp b/WickedEngine/RenderPath3D_Deferred.cpp
index b5bab5c98..98ad5aa1a 100644
--- a/WickedEngine/RenderPath3D_Deferred.cpp
+++ b/WickedEngine/RenderPath3D_Deferred.cpp
@@ -209,7 +209,7 @@ void RenderPath3D_Deferred::Render() const
device->BindViewports(1, &vp, cmd);
device->BindResource(PS, getSSAOEnabled() ? &rtSSAO[0] : wiTextureHelper::getWhite(), TEXSLOT_RENDERPATH_SSAO, cmd);
- device->BindResource(PS, getSSREnabled() ? &rtSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd);
+ device->BindResource(PS, getSSREnabled() ? &rtStochasticSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd);
wiRenderer::DrawDeferredLights(wiRenderer::GetCamera(), depthBuffer_Copy, rtGBuffer[0], rtGBuffer[1], rtGBuffer[2], cmd);
device->RenderPassEnd(cmd);
@@ -227,7 +227,7 @@ void RenderPath3D_Deferred::Render() const
RenderDeferredComposition(cmd);
- RenderSSR(rtDeferred, rtGBuffer[1], cmd);
+ RenderStochasticSSR(rtDeferred, rtGBuffer[0], rtGBuffer[1], rtGBuffer[2], cmd);
DownsampleDepthBuffer(cmd);
diff --git a/WickedEngine/RenderPath3D_TiledDeferred.cpp b/WickedEngine/RenderPath3D_TiledDeferred.cpp
index a41c25db6..05ee98bb2 100644
--- a/WickedEngine/RenderPath3D_TiledDeferred.cpp
+++ b/WickedEngine/RenderPath3D_TiledDeferred.cpp
@@ -105,7 +105,7 @@ void RenderPath3D_TiledDeferred::Render() const
RenderDecals(cmd);
device->BindResource(CS, getSSAOEnabled() ? &rtSSAO[0] : wiTextureHelper::getWhite(), TEXSLOT_RENDERPATH_SSAO, cmd);
- device->BindResource(CS, getSSREnabled() ? &rtSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd);
+ device->BindResource(CS, getSSREnabled() ? &rtStochasticSSR : wiTextureHelper::getTransparent(), TEXSLOT_RENDERPATH_SSR, cmd);
if (device->CheckCapability(GraphicsDevice::GRAPHICSDEVICE_CAPABILITY_UAV_LOAD_FORMAT_R11G11B10_FLOAT))
@@ -156,7 +156,7 @@ void RenderPath3D_TiledDeferred::Render() const
RenderDeferredComposition(cmd);
- RenderSSR(rtDeferred, rtGBuffer[1], cmd);
+ RenderStochasticSSR(rtDeferred, rtGBuffer[0], rtGBuffer[1], rtGBuffer[2], cmd);
DownsampleDepthBuffer(cmd);
diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj b/WickedEngine/WickedEngine_SHADERS.vcxproj
index cb746e399..c355fbd80 100644
--- a/WickedEngine/WickedEngine_SHADERS.vcxproj
+++ b/WickedEngine/WickedEngine_SHADERS.vcxproj
@@ -31,6 +31,7 @@
+
@@ -801,12 +802,27 @@
Compute
+
+ Compute
+
+
+ Compute
+
+
+ Compute
+
Compute
Pixel
+
+ Compute
+
+
+ Compute
+
Pixel
@@ -974,4 +990,4 @@
-
\ No newline at end of file
+
diff --git a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters
index 3324156e2..fcf685b1e 100644
--- a/WickedEngine/WickedEngine_SHADERS.vcxproj.filters
+++ b/WickedEngine/WickedEngine_SHADERS.vcxproj.filters
@@ -85,6 +85,9 @@
HF
+
+ HF
+
@@ -852,6 +855,21 @@
CS
+
+ CS
+
+
+ CS
+
+
+ CS
+
+
+ CS
+
+
+ CS
+
CS
@@ -897,4 +915,4 @@
{12396e21-0254-42fa-a88b-805f0703eca5}
-
\ No newline at end of file
+
diff --git a/WickedEngine/deferredPS.hlsl b/WickedEngine/deferredPS.hlsl
index 8037359fd..5e2d52f40 100644
--- a/WickedEngine/deferredPS.hlsl
+++ b/WickedEngine/deferredPS.hlsl
@@ -19,5 +19,5 @@ float4 main(float4 pos : SV_Position, float2 uv : TEXCOORD) : SV_TARGET
ApplyFog(depth, color);
- return color;
+ return max(0, color);
}
\ No newline at end of file
diff --git a/WickedEngine/globals.hlsli b/WickedEngine/globals.hlsli
index 879ae4b1b..0f3e3e41e 100644
--- a/WickedEngine/globals.hlsli
+++ b/WickedEngine/globals.hlsli
@@ -147,6 +147,34 @@ inline float2 hammersley2d(uint idx, uint num) {
return float2(float(idx) / float(num), radicalInverse_VdC);
}
+inline float2 HammersleyRandom(uint idx, uint num, uint2 random)
+{
+ // Mirror the 32 bits of idx with a swap network: halves, bytes, nibbles, bit pairs, single bits.
+ uint mirrored = (idx << 16) | (idx >> 16);
+ mirrored = ((mirrored & 0x00ff00ff) << 8) | ((mirrored & 0xff00ff00) >> 8);
+ mirrored = ((mirrored & 0x0f0f0f0f) << 4) | ((mirrored & 0xf0f0f0f0) >> 4);
+ mirrored = ((mirrored & 0x33333333) << 2) | ((mirrored & 0xcccccccc) >> 2);
+ mirrored = ((mirrored & 0x55555555) << 1) | ((mirrored & 0xaaaaaaaa) >> 1);
+ // x: idx / num shifted by random.x * 2^-16 and wrapped with frac. y: top 16 mirrored bits XOR random.y, scaled by 2^-16.
+ const float u = frac((float) idx / num + float(random.x) * (1.0 / 65536.0));
+ const float v = float((mirrored >> 16) ^ random.y) * (1.0 / 65536.0);
+ return float2(u, v);
+}
+
+inline float2 HammersleyRandom(uint idx, uint2 random)
+{
+ // Bit-mirror idx in five swap stages (16-bit halves down to neighbouring bits).
+ uint flipped = (idx << 16) | (idx >> 16);
+ flipped = ((flipped & 0x00ff00ff) << 8) | ((flipped & 0xff00ff00) >> 8);
+ flipped = ((flipped & 0x0f0f0f0f) << 4) | ((flipped & 0xf0f0f0f0) >> 4);
+ flipped = ((flipped & 0x33333333) << 2) | ((flipped & 0xcccccccc) >> 2);
+ flipped = ((flipped & 0x55555555) << 1) | ((flipped & 0xaaaaaaaa) >> 1);
+ // In this overload x depends on random.x only (idx is not part of it); y mixes the mirrored idx with random.y.
+ const float first = frac(float(random.x) * (1.0 / 65536.0));
+ const float second = float((flipped >> 16) ^ random.y) * (1.0 / 65536.0);
+ return float2(first, second);
+}
+
// "Next Generation Post Processing in Call of Duty: Advanced Warfare"
// http://advances.realtimerendering.com/s2014/index.html
float InterleavedGradientNoise(float2 uv, uint frameCount)
@@ -554,4 +582,4 @@ inline float dither(in float2 pixel)
return ditherMask8(pixel);
}
-#endif // WI_SHADER_GLOBALS_HF
\ No newline at end of file
+#endif // WI_SHADER_GLOBALS_HF
diff --git a/WickedEngine/stochasticSSRCS_combine.hlsl b/WickedEngine/stochasticSSRCS_combine.hlsl
new file mode 100644
index 000000000..15f7a1404
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_combine.hlsl
@@ -0,0 +1,44 @@
+#include "globals.hlsli"
+#include "brdf.hlsli"
+#include "stochasticSSRHF.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(texture_median, float4, TEXSLOT_ONDEMAND0);
+
+RWTEXTURE2D(output, float4, 0);
+
+// Final Stochastic SSR pass. Here we can apply final touches like specular occlusion or fresnel and BRDFLUT?
+
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID) // Final pass: multiplies the filtered reflection (texture_median) by a Fresnel term and writes it to output.
+{
+ const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; // texel center in normalized UV
+ const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+ if (depth == 0.0f)
+ return; // texel is skipped; output keeps whatever it held before
+
+ // Everything in view space:
+ const float3 P = reconstructPosition(uv, depth, g_xCamera_InvP);
+ const float3 N = mul((float3x3) g_xCamera_View, decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy)).xyz;
+ const float3 V = normalize(-P); // surface -> camera (the camera sits at the view-space origin), same convention as the resolve pass
+
+ float NdotV = max(dot(N, V), 0.0f); // with V = normalize(P) this clamped to 0 for camera-facing normals, so F below was always f90
+
+ float3 albedo = texture_gbuffer0.SampleLevel(sampler_point_clamp, uv, 0).rgb;
+ float4 baseColor = float4(albedo, 1.0f);
+
+ float4 GBuffer2 = texture_gbuffer2.SampleLevel(sampler_point_clamp, uv, 0);
+ //float occlusion = GBuffer2.r;
+ //float roughness = GBuffer2.g;
+ float metalness = GBuffer2.b;
+ float reflectance = GBuffer2.a;
+
+ float3 f0 = ComputeF0(baseColor, reflectance, metalness);
+ float f90 = saturate(50.0 * dot(f0, 0.33));
+ float3 F = F_Schlick(f0, f90, NdotV);
+
+ float4 final = texture_median.SampleLevel(sampler_point_clamp, uv, 0);
+ final.rgb *= F; // alpha is passed through unchanged
+
+ output[DTid.xy] = final;
+}
diff --git a/WickedEngine/stochasticSSRCS_median.hlsl b/WickedEngine/stochasticSSRCS_median.hlsl
new file mode 100644
index 000000000..5e3f69b25
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_median.hlsl
@@ -0,0 +1,64 @@
+#include "globals.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(texture_temporal, float4, TEXSLOT_ONDEMAND0);
+
+RWTEXTURE2D(output, float4, 0);
+
+// A Fast, Small-Radius GPU Median Filter by Morgan McGuire
+// https://casual-effects.com/research/McGuire2008Median/index.html
+// s2 orders one pair in place (a <- min, b <- max) through the caller's `temp`; t2 does that for v[a], v[b]; t24 / t25 chain 4 / 5 such exchanges.
+#define s2(a, b) temp = a; a = min(a, b); b = max(temp, b);
+#define t2(a, b) s2(v[a], v[b]);
+#define t24(a, b, c, d, e, f, g, h) t2(a, b); t2(c, d); t2(e, f); t2(g, h);
+#define t25(a, b, c, d, e, f, g, h, i, j) t24(a, b, c, d, e, f, g, h); t2(i, j);
+
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID) // One thread per texel: gathers a 5x5 window of texture_temporal and writes the entry the exchange network leaves in v[12].
+{
+ const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; // texel center in normalized UV
+ const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+ if (depth == 0.0f)
+ return; // nothing is written for this texel, so output keeps its previous contents
+
+ half4 v[25]; // the window; reordered in place by the t24 / t25 exchanges below
+
+ // Add the pixels which make up our window to the pixel array.
+ [unroll]
+ for (int dX = -2; dX <= 2; ++dX)
+ {
+ [unroll]
+ for (int dY = -2; dY <= 2; ++dY)
+ {
+ float2 offset = float2(float(dX), float(dY)); // whole-texel offset; this local hides any global named offset inside the loop
+
+ // If a pixel in the window is located at (x+dX, y+dY), put it at index (dX + R)(2R + 1) + (dY + R) of the
+ // pixel array. This will fill the pixel array, with the top left pixel of the window at pixel[0] and the
+ // bottom right pixel of the window at pixel[N-1].
+ v[(dX + 2) * 5 + (dY + 2)] = texture_temporal.SampleLevel(sampler_linear_clamp, uv + offset * xPPResolution_rcp, 0);
+ }
+ }
+
+ half4 temp; // scratch value required by the s2 macro
+ t25(0, 1, 3, 4, 2, 4, 2, 3, 6, 7); // fixed exchange sequence; each exchange acts on all four channels independently (min / max are per component)
+ t25(5, 7, 5, 6, 9, 7, 1, 7, 1, 4);
+ t25(12, 13, 11, 13, 11, 12, 15, 16, 14, 16);
+ t25(14, 15, 18, 19, 17, 19, 17, 18, 21, 22);
+ t25(20, 22, 20, 21, 23, 24, 2, 5, 3, 6);
+ t25(0, 6, 0, 3, 4, 7, 1, 7, 1, 4);
+ t25(11, 14, 8, 14, 8, 11, 12, 15, 9, 15);
+ t25(9, 12, 13, 16, 10, 16, 10, 13, 20, 23);
+ t25(17, 23, 17, 20, 21, 24, 18, 24, 18, 21);
+ t25(19, 22, 8, 17, 9, 18, 0, 18, 0, 9);
+ t25(10, 19, 1, 19, 1, 10, 11, 20, 2, 20);
+ t25(2, 11, 12, 21, 3, 21, 3, 12, 13, 22);
+ t25(4, 22, 4, 13, 14, 23, 5, 23, 5, 14);
+ t25(15, 24, 6, 24, 6, 15, 7, 16, 7, 19);
+ t25(3, 11, 5, 17, 11, 17, 9, 17, 4, 10);
+ t25(6, 12, 7, 14, 4, 6, 4, 7, 12, 14);
+ t25(10, 14, 6, 7, 10, 12, 6, 10, 6, 17);
+ t25(12, 17, 7, 17, 7, 10, 12, 18, 7, 12);
+ t24(10, 18, 12, 20, 10, 20, 10, 12);
+
+ output[DTid.xy] = v[12]; // middle slot of the 25 entries; per the cited paper this should be the per-channel median (network not re-verified here)
+}
diff --git a/WickedEngine/stochasticSSRCS_raytrace.hlsl b/WickedEngine/stochasticSSRCS_raytrace.hlsl
new file mode 100644
index 000000000..1f175199a
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_raytrace.hlsl
@@ -0,0 +1,296 @@
+#include "globals.hlsli"
+#include "stochasticSSRHF.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(input, float4, TEXSLOT_ONDEMAND0);
+TEXTURE2D(texture_lineardepth_minmax, float2, TEXSLOT_ONDEMAND1);
+
+RWTEXTURE2D(texture_raytrace, float4, 0);
+RWTEXTURE2D(texture_mask, float2, 1);
+
+// Use this to use reduced precision, but higher framerate:
+#define USE_LINEARDEPTH
+
+static const float rayTraceStride = 1.0f; // Step in horizontal or vertical pixels between samples.
+static const float rayTraceMaxStep = 512.0f; // Maximum number of iterations. Higher gives better images but may be slow.
+static const float rayTraceHitThickness = 1.5f; // Thickness to ascribe to each pixel in the depth buffer.
+static const float rayTraceHitThicknessBias = 7.0f; // Bias to control the thickness along distance.
+static const float rayTraceMaxDistance = 1000.0f; // Maximum camera-space distance to trace before returning a miss.
+static const float rayTraceStrideCutoff = 100.0f; // More distant pixels are smaller in screen space. This value tells at what point to
+ // start relaxing the stride to give higher quality reflections for objects far from the camera.
+static const float raytraceHZBBias = 1.0f; // Each march step raises the sampled depth mip level by min(raytraceHZBBias / 10, 5) * roughness.
+
+float DistanceSquared(float2 a, float2 b)
+{
+ const float2 delta = a - b; // squared length of the difference; no square root is taken
+ return dot(delta, delta);
+}
+
+bool intersectsDepthBuffer(float z, float minZ, float maxZ)
+{
+ // 0 at the camera, 1 from rayTraceStrideCutoff onwards. Drives both the thickness growth and the bias fade below.
+ const float distanceScale = min(1.0f, z / rayTraceStrideCutoff);
+
+ // Increase thickness along distance. This helps keep objects from disappearing in the distance.
+ float thickness = rayTraceHitThickness * rayTraceHitThicknessBias * distanceScale;
+ thickness = clamp(thickness, rayTraceHitThickness, 10.0f);
+
+ // Effectively remove line/tiny artifacts, mostly caused by Zbuffers precision.
+ z += lerp(0.05f, 0.0f, distanceScale);
+
+ return (minZ >= z) && (maxZ - thickness <= z);
+}
+
+// Heavily adapted from McGuire and Mara's original implementation
+// http://casual-effects.blogspot.com/2014/08/screen-space-ray-tracing.html
+bool ScreenSpaceRayTrace(float3 csOrig, float3 csDir, float jitter, float roughness, out float2 hitPixel, out float3 hitPoint, out float iterationCount) // Marches a camera-space ray across the screen against the depth buffer; returns true when the last tested step intersects it.
+{ // Outputs: hitPixel = last sampled position in UV, hitPoint = position rebuilt from the interpolated Q and k, iterationCount = number of steps taken.
+ float rayLength = ((csOrig.z + csDir.z * rayTraceMaxDistance) < g_xCamera_ZNearP) ? (g_xCamera_ZNearP - csOrig.z) / csDir.z : rayTraceMaxDistance; // shorten the ray so its end does not pass the near plane
+
+ float3 csRayEnd = csOrig + csDir * rayLength;
+
+ // Project into homogeneous clip space
+ float4 clipRayOrigin = mul(g_xCamera_Proj, float4(csOrig, 1.0f));
+ float4 clipRayEnd = mul(g_xCamera_Proj, float4(csRayEnd, 1.0f));
+
+ float k0 = 1.0f / clipRayOrigin.w; // 1/w at both ends of the ray
+ float k1 = 1.0f / clipRayEnd.w;
+
+ float3 Q0 = csOrig * k0; // camera-space end points pre-multiplied by 1/w
+ float3 Q1 = csRayEnd * k1;
+
+ // Screen-space endpoints
+ float2 P0 = clipRayOrigin.xy * k0;
+ float2 P1 = clipRayEnd.xy * k1;
+
+ // Project to pixel
+ P0 = P0 * float2(0.5, -0.5) + float2(0.5, 0.5); // NDC -> UV with Y flipped
+ P1 = P1 * float2(0.5, -0.5) + float2(0.5, 0.5);
+
+ P0.xy *= xPPResolution.xy; // UV -> pixel coordinates
+ P1.xy *= xPPResolution.xy;
+
+#if 1
+ // Clip to the screen coordinates. Alternatively we could just modify rayTraceMaxStep instead
+ // This will also improve the framerate, without losing quality or features
+ float2 yDelta = float2(xPPResolution.y + 2.0f, -2.0f); // - 0.5, 0.5
+ float2 xDelta = float2(xPPResolution.x + 2.0f, -2.0f); // - 0.5, 0.5
+ float alpha = 0.0; // fraction by which P1 is pulled back toward P0
+
+ // P0 must be in bounds
+ if (P1.y > yDelta.x || P1.y < yDelta.y)
+ {
+ float yClip = (P1.y > yDelta.x) ? yDelta.x : yDelta.y;
+ float yAlpha = (P1.y - yClip) / (P1.y - P0.y);
+ alpha = yAlpha;
+ }
+
+ // P1 must be in bounds
+ if (P1.x > xDelta.x || P1.x < xDelta.y)
+ {
+ float xClip = (P1.x > xDelta.x) ? xDelta.x : xDelta.y;
+ float xAlpha = (P1.x - xClip) / (P1.x - P0.x);
+ alpha = max(alpha, xAlpha);
+ }
+
+ // These are all in homogeneous space, so they interpolate linearly
+ P1 = lerp(P1, P0, alpha);
+ k1 = lerp(k1, k0, alpha);
+ Q1 = lerp(Q1, Q0, alpha);
+#endif
+
+ // If the line is degenerate, make it cover at least one pixel to avoid handling zero-pixel extent as a special case later
+ P1 += (DistanceSquared(P0, P1) < 0.0001f) ? float2(0.01f, 0.01f) : 0.0f;
+ float2 screenOffset = P1 - P0;
+
+ // Permute so that the primary iteration is in x to collapse all quadrant-specific DDA cases later
+ bool permute = false;
+ if (abs(screenOffset.x) < abs(screenOffset.y))
+ {
+ permute = true;
+ screenOffset = screenOffset.yx;
+ P0 = P0.yx;
+ P1 = P1.yx;
+ }
+
+ float stepDirection = sign(screenOffset.x);
+ float stepInterval = stepDirection / screenOffset.x; // equals 1 / |screenOffset.x|
+
+ // Track the derivatives of Q and k
+ float3 dQ = (Q1 - Q0) * stepInterval;
+ float dk = (k1 - k0) * stepInterval;
+
+ // Because we test 1/2 a texel forward along the ray, on the very last iteration
+ // the interpolation can go past the end of the ray. Use these bounds to clamp it.
+ float zMin = min(csRayEnd.z, csOrig.z);
+ float zMax = max(csRayEnd.z, csOrig.z);
+
+ float2 dP = float2(stepDirection, screenOffset.y * stepInterval); // one pixel along the major axis per step
+
+ // Scale derivatives by the desired pixel stride and then offset the starting values by the jitter fraction
+ float strideScale = 1.0f - min(1.0f, csOrig.z / rayTraceStrideCutoff); // 1 at the camera, 0 from rayTraceStrideCutoff onwards
+ float stride = 1.0f + strideScale * rayTraceStride; // between 1 and 1 + rayTraceStride
+
+ dP *= stride;
+ dQ *= stride;
+ dk *= stride;
+
+ P0 += dP * jitter;
+ Q0 += dQ * jitter;
+ k0 += dk * jitter;
+
+ float4 PQk = float4(P0, Q0.z, k0); // packed march state: xy = pixel, z = Q.z, w = k
+ float4 dPQk = float4(dP, dQ.z, dk);
+ float3 Q = Q0;
+
+ // Adjust end condition for iteration direction
+ float end = P1.x * stepDirection;
+
+ // raytrace iterations based on roughness
+ // Matte materials will get less samples
+ float roughnessTraceStep = max(rayTraceMaxStep * (1.0 - roughness), 1.0f);
+
+ float stepCount = 0.0f;
+ float level = 0.0f; // 1.0f start level. Parameter?
+
+ float prevZMaxEstimate = csOrig.z;
+ float rayZMin = prevZMaxEstimate;
+ float rayZMax = prevZMaxEstimate;
+ float sceneZMax = rayZMax + 100000.0f; // starts far behind the ray so the first loop test cannot report an intersection
+
+ [loop]
+ for (; ((PQk.x * stepDirection) <= end) &&
+ (stepCount <= roughnessTraceStep - 1) &&
+ !intersectsDepthBuffer(sceneZMax, rayZMin, rayZMax) &&
+ (sceneZMax != 0.0f) &&
+ (level > -1);
+ PQk += dPQk, stepCount++)
+ {
+ if (!is_saturated(hitPixel)) // NOTE(review): hitPixel is an out parameter first assigned further down in this body, so the first iteration tests it before this function has written it - confirm this is intended
+ {
+ return false; // NOTE(review): hitPoint and iterationCount are not assigned on this path
+ }
+
+ rayZMin = prevZMaxEstimate;
+
+ // Compute the value at 1/2 step into the future
+ rayZMax = (dPQk.z * 0.5f + PQk.z) / (dPQk.w * 0.5f + PQk.w);
+ rayZMax = clamp(rayZMax, zMin, zMax);
+ prevZMaxEstimate = rayZMax;
+
+ [flatten]
+ if (rayTraceMaxDistance < rayZMax)
+ {
+ return false; // NOTE(review): out parameters may be left unassigned on this path as well
+ }
+
+ [flatten]
+ if (rayZMin > rayZMax) // keep the depth span ordered
+ {
+ float t = rayZMin;
+ rayZMin = rayZMax;
+ rayZMax = t;
+ }
+
+ // A simple HZB approach based on roughness
+ level += min(raytraceHZBBias / 10.0f, 5.0f) * roughness;
+
+ hitPixel = permute ? PQk.yx : PQk.xy; // undo the axis swap
+ hitPixel *= xPPResolution_rcp; // pixel -> UV
+
+ #ifdef USE_LINEARDEPTH
+ sceneZMax = texture_lineardepth_minmax.SampleLevel(sampler_point_clamp, hitPixel, level).g * g_xCamera_ZFarP;
+ #else
+ sceneZMax = getLinearDepth(texture_depth.SampleLevel(sampler_point_clamp, hitPixel, 0).r);
+ #endif
+ }
+
+ // Advance Q based on the number of steps
+ Q.xy += dQ.xy * stepCount;
+ hitPoint = Q * (1.0f / PQk.w);
+ iterationCount = stepCount;
+
+ return intersectsDepthBuffer(sceneZMax, rayZMin, rayZMax);
+}
+
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID) // One thread per texel: picks a reflection direction, traces it in screen space, stores the hit data and a hit mask.
+{
+ const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; // texel center in normalized UV
+ const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+ if (depth == 0.0f)
+ return; // neither output texture is written for this texel
+
+ // Everything in view space:
+ const float3 P = reconstructPosition(uv, depth, g_xCamera_InvP);
+ const float3 N = mul((float3x3)g_xCamera_View, decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy)).xyz;
+ const float3 V = normalize(P); // camera -> surface, i.e. the incident direction handed to reflect() below
+
+ const float roughness = GetRoughness(texture_gbuffer2.SampleLevel(sampler_point_clamp, uv, 0).g);
+
+ const float roughnessFade = GetRoughnessFade(roughness, SSRMaxRoughness);
+ if (roughnessFade <= 0.0f)
+ {
+ return; // too rough for SSR; outputs are left untouched here too
+ }
+
+ float4 H; // xyz: half vector, w: pdf written to the raytrace output
+ if (roughness > 0.1f)
+ {
+ const float surfaceMargin = 0.0f;
+ const float maxRegenCount = 15.0f;
+
+ uint2 Random = Rand3DPCG16(int3((DTid.xy + 0.5f), g_xFrame_FrameCount)).xy; // per-texel, per-frame seed
+
+ // Pick the best rays
+
+ float RdotN = 0.0f;
+ float regenCount = 0;
+ [loop]
+ for (; RdotN <= surfaceMargin && regenCount < maxRegenCount; regenCount++) // retry while the reflected ray does not point away from the surface
+ {
+ // Low-discrepancy sequence
+ //float2 Xi = float2(Random) * rcp(65536.0); // equivalent to HammersleyRandom(0, 1, Random).
+ float2 Xi = HammersleyRandom(regenCount, Random); // SingleSPP
+
+ Xi.y = lerp(Xi.y, 0.0f, BRDFBias);
+
+ // I should probably use importance sampling of visible normals http://jcgt.org/published/0007/04/01/paper.pdf
+ H = ImportanceSampleGGX(Xi, roughness);
+ H = TangentToWorld(H, N);
+
+ RdotN = dot(N, reflect(V, H.xyz));
+ }
+ }
+ else
+ {
+ H = float4(N.xyz, 1.0f); // low roughness: reflect about the normal itself, with 1 stored in the pdf slot
+ }
+
+ float3 dir = reflect(V, H.xyz);
+
+ float2 hitPixel = float2(0.0f, 0.0f);
+ float3 hitPoint = float3(0.0f, 0.0f, 0.0f);
+ float iterationCount = 0.0f;
+
+ float2 uv2 = (DTid.xy + 0.5f); // pixel coordinates (not normalized), used as the noise seed
+ //float jitter = 1.0f + rand(uv2 + g_xFrame_Time);
+ float jitter = 1.0f + InterleavedGradientNoise(uv2, g_xFrame_FrameCount);
+
+ bool hit = ScreenSpaceRayTrace(P, dir, jitter, roughness, hitPixel, hitPoint, iterationCount);
+
+ float hitDepth = texture_depth.SampleLevel(sampler_point_clamp, hitPixel, 0);
+
+ // Output:
+ // xy: hit pixel
+ // z: hit depth
+ // w: pdf
+ float4 raytrace = max(0, float4(hitPixel, hitDepth, H.w));
+ texture_raytrace[DTid.xy] = raytrace;
+
+ // Output:
+ // x: hit (bool)
+ // y: iteration count / rayTraceMaxStep
+ float2 mask = float2(hit, iterationCount / rayTraceMaxStep);
+ texture_mask[DTid.xy] = mask;
+}
diff --git a/WickedEngine/stochasticSSRCS_resolve.hlsl b/WickedEngine/stochasticSSRCS_resolve.hlsl
new file mode 100644
index 000000000..4cbdc3cd4
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_resolve.hlsl
@@ -0,0 +1,147 @@
+#include "globals.hlsli"
+#include "brdf.hlsli"
+#include "stochasticSSRHF.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(texture_raytrace, float4, TEXSLOT_ONDEMAND0);
+TEXTURE2D(texture_mask, float2, TEXSLOT_ONDEMAND1);
+TEXTURE2D(texture_main, float4, TEXSLOT_ONDEMAND2);
+
+RWTEXTURE2D(texture_resolve, float4, 0);
+
+static const float resolveSequenceSize = 20.0f; // Can help reduce noise on rough surfaces, but too high values tend to wash out contact points.
+static const float resolveMip = 1.0f; // multiplier on the mip level used when fetching the reflected color
+static const float resolveSSRIntensity = 1.0f; // final multiplier on the resolved result (rgb and alpha)
+
+static const float blendScreenEdgeFade = 5.0f; // the edge fade starts once |NDC| of the hit exceeds 1 - 1 / blendScreenEdgeFade
+static const bool blendReflectSky = true; // false: rays that missed get a blend factor of 0
+
+float CalculateBlendIntersection(bool hit, float iterationStep, float2 hitPixel)
+{
+ // A miss contributes nothing unless reflecting the sky is allowed.
+ if (!(hit || blendReflectSky))
+ return 0.0;
+
+ // Trust falls off steeply as the normalized iteration count approaches 1.
+ const float stepConfidence = 1.0 - pow(iterationStep, 8.0f);
+
+ // Screen-edge fade: remap the hit UV to [-1,1], then ramp up over the outer part of each axis.
+ const float2 hitPixelNDC = hitPixel * 2.0 - 1.0;
+ //float maxDimension = min(1.0, max(abs(hitPixelNDC.x), abs(hitPixelNDC.y)));
+ //float attenuation = 1.0 - max(0.0, maxDimension - blendScreenEdgeFade) / (1.0 - blendScreenEdgeFade);
+ const float2 edgeRamp = saturate(abs(hitPixelNDC) * blendScreenEdgeFade - (blendScreenEdgeFade - 1.0f));
+ const float edgeFade = saturate(1.0 - dot(edgeRamp, edgeRamp));
+
+ return stepConfidence * edgeFade;
+}
+
+// I probably need to figure out a better way to deal with this.
+float2 CalculateTailDirection(float3 viewNormal)
+{
+ // Reference axis for the cross product: +Z, or +X once the normal is (almost) parallel to Z.
+ const float3 referenceAxis = abs(viewNormal.z) < 0.999 ? float3(0.0, 0.0, 1.0) : float3(1.0, 0.0, 0.0);
+ const float3 tangent = normalize(cross(referenceAxis, viewNormal));
+ // Factor that moves the per-axis scale from (1.0, 0.1) towards (0.1, 1.0); it is not clamped.
+ const float blendFactor = tangent.x * -viewNormal.y;
+ return lerp(float2(1.0, 0.1), float2(0.1, 1.0), blendFactor);
+}
+
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID) // One thread per texel: reuses the traced rays of nearby texels, weights each by BRDF / pdf, and writes the filtered reflection to texture_resolve.
+{
+ const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; // texel center in normalized UV
+ const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0);
+ if (depth == 0.0f)
+ { texture_resolve[DTid.xy] = 0; return; } // write 0 like the roughness early-out below, so later passes never read a stale value here
+
+ // Everything in view space:
+ const float3 P = reconstructPosition(uv, depth, g_xCamera_InvP);
+ const float3 N = mul((float3x3) g_xCamera_View, decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy)).xyz;
+ const float3 V = normalize(-P); // surface -> camera
+ const float NdotV = saturate(dot(N, V));
+
+ const float roughness = GetRoughness(texture_gbuffer2.SampleLevel(sampler_point_clamp, uv, 0).g);
+ const float roughnessSequenceSize = resolveSequenceSize * roughness + 1.0f; // rougher surfaces spread their neighbour lookups further
+
+ // Early out, useless if the roughness is out of range
+ float roughnessFade = GetRoughnessFade(roughness, SSRMaxRoughness);
+ if (roughnessFade <= 0.0f)
+ {
+ texture_resolve[DTid.xy] = 0;
+ return;
+ }
+
+ float specularConeTangent = lerp(0.0, roughness * (1.0 - BRDFBias), NdotV * sqrt(roughness)); // used below to pick the color mip level
+ specularConeTangent *= lerp(saturate(NdotV * 2), 1.0f, sqrt(roughness));
+
+ const float maxMipLevel = 11.0f - 1.0f;
+ const uint2 Random = Rand3DPCG16(int3((DTid.xy + 0.5f), g_xFrame_FrameCount)).xy; // per-texel, per-frame seed
+
+ float4 result = 0.0f;
+ float weightSum = 0.0f;
+
+ const uint NumResolve = 4;
+ [unroll]
+ for (uint i = 0; i < NumResolve; i++)
+ {
+ float2 offsetRotation = (HammersleyRandom(i, NumResolve, Random) * 2.0 - 1.0) * roughnessSequenceSize;
+ float2x2 offsetRotationMatrix = float2x2(offsetRotation.x, offsetRotation.y, -offsetRotation.y, offsetRotation.x);
+
+ float2 offsetUV = offset[i] * (1.0f / xPPResolution); // table entry converted from texels to UV
+ offsetUV = uv + mul(offsetRotationMatrix, offsetUV) * CalculateTailDirection(N);
+
+ float4 raytraceSource = texture_raytrace.SampleLevel(sampler_point_clamp, offsetUV, 0);
+ float2 maskSource = texture_mask.SampleLevel(sampler_point_clamp, offsetUV, 0);
+
+ float2 hitPixel = raytraceSource.xy;
+ float hitDepth = raytraceSource.z;
+ float hitPDF = raytraceSource.w;
+ bool hit = (bool)maskSource.x;
+ float iterationStep = maskSource.y;
+
+ float intersectionCircleRadius = specularConeTangent * length(hitPixel - uv);
+ float sourceMip = clamp(log2(intersectionCircleRadius * max(xPPResolution.x, xPPResolution.y)), 0.0, maxMipLevel) * resolveMip;
+
+ float4 sampleColor;
+ sampleColor.rgb = texture_main.SampleLevel(sampler_linear_clamp, hitPixel, sourceMip).xyz;
+ sampleColor.a = CalculateBlendIntersection(hit, iterationStep, hitPixel);
+
+ sampleColor.rgb /= 1 + Luminance(sampleColor.rgb); // compress bright samples before averaging; undone after the loop
+
+ // BRDF
+
+ float3 hitViewPosition = reconstructPosition(hitPixel, hitDepth, g_xCamera_InvP);
+
+ float3 L = normalize(hitViewPosition - P);
+ float3 H = normalize(L + V);
+
+ float NdotH = saturate(dot(N, H));
+ float NdotL = saturate(dot(N, L));
+
+ Surface surface;
+ surface.alphaRoughnessSq = pow(roughness, 4); // only this member is set before the BRDF helpers are called
+
+ SurfaceToLight surfaceToLight;
+ surfaceToLight.NdotH = NdotH;
+ surfaceToLight.NdotL = NdotL;
+ surfaceToLight.NdotV = NdotV;
+
+ // We could simply use BRDF_GetSpecular, but we exclude fresnel for later
+ float Vis = visibilityOcclusion(surface, surfaceToLight);
+ float D = microfacetDistribution(surface, surfaceToLight);
+ float specularLight = Vis * D * surfaceToLight.NdotL;
+
+ float weight = specularLight / max(hitPDF, 0.00001f);
+
+ result += sampleColor * weight;
+ weightSum += weight;
+ }
+ result /= (weightSum > 0.0f) ? weightSum : 1.0f; // all-zero weights (e.g. every NdotL == 0) would otherwise be 0 / 0
+
+ result.rgb /= 1 - Luminance(result.rgb); // inverse of the per-sample compression above
+
+ result *= roughnessFade;
+ result *= resolveSSRIntensity;
+
+ texture_resolve[DTid.xy] = max(result, 0.00001f);
+}
diff --git a/WickedEngine/stochasticSSRCS_temporal.hlsl b/WickedEngine/stochasticSSRCS_temporal.hlsl
new file mode 100644
index 000000000..4a1a3ad82
--- /dev/null
+++ b/WickedEngine/stochasticSSRCS_temporal.hlsl
@@ -0,0 +1,171 @@
+#include "globals.hlsli"
+#include "stochasticSSRHF.hlsli"
+#include "ShaderInterop_Postprocess.h"
+
+TEXTURE2D(resolve_current, float4, TEXSLOT_ONDEMAND0);
+TEXTURE2D(resolve_history, float4, TEXSLOT_ONDEMAND1);
+TEXTURE2D(texture_raytrace, float4, TEXSLOT_ONDEMAND2);
+
+RWTEXTURE2D(output, float4, 0);
+
+static const float temporalResponseMin = 0.85f; // history weight when current and history luminance differ the most
+static const float temporalResponseMax = 1.0f; // history weight when their luminance matches
+static const float temporalScale = 2.0f; // passed to ResolverAABB as AABBScale: half-size of the clip box in standard deviations
+static const float temporalExposure = 10.0f; // passed to ResolverAABB as exposureScale: luma multiplier inside HdrWeight4
+
+inline float Luma4(float3 color) // cheap luma: green counted twice plus red and blue (weights add up to 4, not 1)
+{
+ return (2 * color.g) + (color.b + color.r);
+}
+
+inline float HdrWeight4(float3 color, float exposure) // reciprocal weight: the brighter the color, the smaller the result
+{
+ return rcp(4.0f + exposure * Luma4(color));
+}
+
+float4 clip_aabb(float3 aabb_min, float3 aabb_max, float4 p, float4 q)
+{
+ // Box center (carrying p.w as its fourth component) and half-extents; the tiny epsilon keeps the division below finite.
+ const float4 center = float4(0.5 * (aabb_max + aabb_min), p.w);
+ const float3 halfExtent = 0.5 * (aabb_max - aabb_min) + 0.00000001f;
+
+ // Offset of q from the center and its largest per-axis magnitude, measured in half-extents.
+ const float4 toQ = q - center;
+ const float3 scaled = abs(toQ.xyz / halfExtent);
+ const float largest = max(scaled.x, max(scaled.y, scaled.z));
+
+ if (!(largest > 1.0))
+ return q; // point inside aabb
+ return center + toQ / largest; // pull q back along the line to the center until it reaches the box
+}
+
+inline void ResolverAABB(Texture2D currentColor, SamplerState currentSampler, float sharpness, float exposureScale, float AABBScale, float2 uv, float2 texelSize, inout float4 currentMin, inout float4 currentMax, inout float4 currentAverage, inout float4 currentOutput) // Gathers the 3x3 neighbourhood around uv and outputs a filtered center color plus variance-based clip bounds.
+{ // sharpness is not used. texelSize is used as a divisor for the texel offsets, so callers pass the resolution. All four inout values are fully overwritten.
+ const int2 SampleOffset[9] = { int2(-1.0, -1.0), int2(0.0, -1.0), int2(1.0, -1.0), int2(-1.0, 0.0), int2(0.0, 0.0), int2(1.0, 0.0), int2(-1.0, 1.0), int2(0.0, 1.0), int2(1.0, 1.0) };
+
+ // Modulate Luma HDR
+
+ float4 sampleColors[9];
+ [unroll]
+ for (uint i = 0; i < 9; i++)
+ {
+ sampleColors[i] = currentColor.SampleLevel(currentSampler, uv + (SampleOffset[i] / texelSize), 0.0f);
+ }
+
+ float sampleWeights[9];
+ [unroll]
+ for (uint j = 0; j < 9; j++)
+ {
+ sampleWeights[j] = HdrWeight4(sampleColors[j].rgb, exposureScale); // bright samples get small weights
+ }
+
+ float totalWeight = 0;
+ [unroll]
+ for (uint k = 0; k < 9; k++)
+ {
+ totalWeight += sampleWeights[k];
+ }
+ sampleColors[4] = (sampleColors[0] * sampleWeights[0] + sampleColors[1] * sampleWeights[1] + sampleColors[2] * sampleWeights[2] + sampleColors[3] * sampleWeights[3] + sampleColors[4] * sampleWeights[4] +
+ sampleColors[5] * sampleWeights[5] + sampleColors[6] * sampleWeights[6] + sampleColors[7] * sampleWeights[7] + sampleColors[8] * sampleWeights[8]) / totalWeight; // center sample replaced by the weighted average of all nine
+
+ // Variance Clipping (AABB)
+
+ float4 m1 = 0.0; // running sum of the samples
+ float4 m2 = 0.0; // running sum of the squared samples
+ [unroll]
+ for (uint x = 0; x < 9; x++)
+ {
+ m1 += sampleColors[x];
+ m2 += sampleColors[x] * sampleColors[x];
+ }
+
+ float4 mean = m1 / 9.0;
+ float4 stddev = sqrt(max((m2 / 9.0) - sqr(mean), 0.0)); // rounding can push the variance of a flat neighbourhood slightly below 0; clamp so sqrt cannot return NaN
+
+ currentMin = mean - AABBScale * stddev;
+ currentMax = mean + AABBScale * stddev;
+
+ currentOutput = sampleColors[4];
+ currentMin = min(currentMin, currentOutput); // make sure the box always contains the filtered center
+ currentMax = max(currentMax, currentOutput);
+ currentAverage = mean;
+}
+
+float2 CalculateCustomMotion(float depth, float2 uv)
+{
+ // Point seen at (uv, depth), reconstructed with the inverse view-projection of the current camera.
+ const float4 worldPos = float4(reconstructPosition(uv, depth, g_xCamera_InvVP), 1.0f);
+
+ // Project that same point with this frame's and the previous frame's view-projection.
+ const float4 clipNow = mul(g_xCamera_VP, worldPos);
+ const float4 clipPrev = mul(g_xFrame_MainCamera_PrevVP, worldPos);
+
+ // Perspective divide, then remap [-1,1] to [0,1]. No Y flip is applied here.
+ const float2 screenNow = (clipNow.xy * rcp(clipNow.w) + 1.0f) / 2.0f;
+ const float2 screenPrev = (clipPrev.xy * rcp(clipPrev.w) + 1.0f) / 2.0f;
+ return screenNow - screenPrev;
+}
+
+[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
+void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) // Temporal pass: reprojects resolve_history, clips it against the current neighbourhood and blends it with resolve_current. GTid, Gid and groupIndex are not used.
+{
+ const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; // texel center in normalized UV
+ const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0); // note: there is no depth-based early-out in this pass
+
+ const float3 worldNormal = decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy); // NOTE(review): not referenced again in this function
+
+ float4 raytraceSource = texture_raytrace.SampleLevel(sampler_point_clamp, uv, 0);
+ float hitDepth = raytraceSource.z;
+ float2 hitPixel = raytraceSource.xy;
+
+ // Calculate custom motion vectors to counter smearing, which we would get by using normal gbuffer velocity
+
+ float2 reflectionCustomVelocity = CalculateCustomMotion(hitDepth, uv); // this texel's UV combined with the depth of what it reflects
+ float2 hitCustomVelocity = CalculateCustomMotion(hitDepth, hitPixel); // camera-only motion at the hit location
+ float2 customVelocity = CalculateCustomMotion(depth, uv); // camera-only motion at this texel
+
+ float2 standardHitVelocity = texture_gbuffer1.SampleLevel(sampler_point_clamp, hitPixel, 0).zw;
+ float2 standardVelocity = texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).zw;
+
+ float2 velocityDifference = customVelocity - standardVelocity; // non-zero where the gbuffer velocity disagrees with camera-only motion
+ float2 hitVelocityDifference = hitCustomVelocity - standardHitVelocity;
+
+ float objectVelocityMask = saturate(dot(velocityDifference, velocityDifference) * xPPResolution_rcp.x * 100.0f);
+ float hitObjectVelocityMask = saturate(dot(hitVelocityDifference, hitVelocityDifference) * xPPResolution_rcp.x * 100.0f);
+
+ float2 objectVelocity = standardVelocity * objectVelocityMask;
+ float2 hitObjectVelocity = standardHitVelocity * hitObjectVelocityMask;
+
+ float2 velocity = lerp(lerp(reflectionCustomVelocity, hitObjectVelocity, hitObjectVelocityMask), objectVelocity, objectVelocityMask);
+ float2 prevUV = float2(uv.x - velocity.x, uv.y + velocity.y); // x and y are applied with opposite signs - presumably because the velocities are Y-up while UV is Y-down; confirm
+
+ float4 previous = resolve_history.SampleLevel(sampler_linear_clamp, prevUV, 0);
+
+ // Luma HDR and AABB minmax
+
+ float4 current = 0;
+ float4 currentMin, currentMax, currentAverage;
+ ResolverAABB(resolve_current, sampler_linear_clamp, 0, temporalExposure, temporalScale, uv, xPPResolution, currentMin, currentMax, currentAverage, current); // xPPResolution goes into the texelSize parameter, which ResolverAABB divides by
+
+ previous.xyz = clip_aabb(currentMin.xyz, currentMax.xyz, clamp(currentAverage, currentMin, currentMax), previous).xyz;
+ previous.a = clamp(previous.a, currentMin.a, currentMax.a); // alpha is clamped separately; clip_aabb only tests xyz against the box
+
+ // Blend color & history
+ // Feedback weight from unbiased luminance difference (Timothy Lottes)
+
+ float lumFiltered = Luminance(current.rgb); // Luma4(current.rgb)
+ float lumHistory = Luminance(previous.rgb);
+
+ float lumDifference = abs(lumFiltered - lumHistory) / max(lumFiltered, max(lumHistory, 0.2f)); // relative difference; the 0.2 floor keeps the denominator away from 0
+ float lumWeight = sqr(1.0f - lumDifference);
+ float blendFinal = lerp(temporalResponseMin, temporalResponseMax, lumWeight); // weight of the history in the final lerp
+
+ // Reduce ghosting by refreshing the blend by velocity... but adds additional noise
+ //float2 velocityScreen = velocity * xPPResolution;
+ //float velocityBlend = sqrt(dot(velocityScreen, velocityScreen));
+ //blendFinal = lerp(blendFinal, 0.2f, saturate(velocityBlend / 100.0f));
+
+ float4 result = lerp(current, previous, blendFinal);
+
+ output[DTid.xy] = result;
+}
diff --git a/WickedEngine/stochasticSSRHF.hlsli b/WickedEngine/stochasticSSRHF.hlsli
new file mode 100644
index 000000000..de0df5555
--- /dev/null
+++ b/WickedEngine/stochasticSSRHF.hlsli
@@ -0,0 +1,116 @@
+
+#ifndef WI_STOCHASTICSSR_HF
+#define WI_STOCHASTICSSR_HF
+
+// Shared SSR settings (compile-time constants for the shaders that include this header):
+static const float SSRMaxRoughness = 1.0f; // Specify max roughness, this can improve performance in complex scenes.
+static const float BRDFBias = 0.7f; // NOTE(review): not referenced inside this header; presumably a sampling bias read by the SSR passes - confirm.
+
+float ComputeRoughnessMaskScale(in float maxRoughness)
+{
+    // Slope of the roughness fade ramp: -2 / maxRoughness, with maxRoughness first limited to [0.01, 1].
+    const float clampedMax = clamp(maxRoughness, 0.01f, 1.0f);
+    const float fadeSlope = -2.0f / clampedMax;
+    return fadeSlope * 1.0f; // 2.0f & 1.0f
+}
+
+float GetRoughnessFade(in float roughness, in float maxRoughness)
+{
+    // Linear ramp in roughness with a fixed +2 offset; only the upper end is limited (to 1), there is no lower clamp.
+    return min(roughness * ComputeRoughnessMaskScale(maxRoughness) + 2, 1.0f);
+}
+
+float GetRoughness(float roughness)
+{
+    return max(0.02f, roughness); // never hand out a roughness below 0.02
+}
+
+float Luminance(float3 color)
+{
+    return dot(float3(0.2126, 0.7152, 0.0722), color); // weighted sum of r, g, b (Rec. 709 luma coefficients)
+}
+
+static const float2 offset[9] = // 3x3 grid of 2D offsets with a spacing of 2, listed row by row; NOTE(review): presumably texel units for neighbourhood taps - confirm at the use sites
+{
+ float2(-2.0, -2.0), // row y = -2
+ float2(0.0, -2.0),
+ float2(2.0, -2.0),
+ float2(-2.0, 0.0), // row y = 0
+ float2(0.0, 0.0), // index 4: the zero offset (centre of the grid)
+ float2(2.0, 0.0),
+ float2(-2.0, 2.0), // row y = +2
+ float2(0.0, 2.0),
+ float2(2.0, 2.0)
+};
+
+
+uint3 Rand3DPCG16(int3 p)
+{
+    // Integer hash of a 3D coordinate: one multiply-add seeding step, then two rounds of cross-component mixing.
+    uint3 state = uint3(p) * 1664525u + 1013904223u;
+    // Each round feeds every component with the product of the other two (statement order matters).
+    [unroll]
+    for (uint iteration = 0; iteration < 2; ++iteration)
+    {
+        state.x += state.y * state.z;
+        state.y += state.z * state.x;
+        state.z += state.x * state.y;
+    }
+
+    // only top 16 bits are well shuffled
+    return state >> 16u;
+}
+
+// Brian Karis, Epic Games "Real Shading in Unreal Engine 4"
+// Maps the 2D sample Xi to a direction around +Z (returned in xyz) and the matching pdf D * cos(theta) (returned in w).
+float4 ImportanceSampleGGX(float2 Xi, float Roughness)
+{
+    const float alpha = Roughness * Roughness;
+    const float alpha2 = alpha * alpha;
+
+    // Xi.x selects the azimuth, Xi.y the polar angle. NOTE(review): Xi is presumably uniform in [0, 1) - confirm at the call site.
+    const float azimuth = 2 * PI * Xi.x;
+    const float cosTheta = sqrt((1.0 - Xi.y) / (1.0 + (alpha2 - 1.0) * Xi.y));
+    const float sinTheta = sqrt(max(1e-5, 1.0 - cosTheta * cosTheta)); // max() keeps the sqrt argument positive
+
+    const float3 halfVector = float3(
+        sinTheta * cos(azimuth),
+        sinTheta * sin(azimuth),
+        cosTheta);
+
+    // GGX distribution term evaluated at the sampled cos(theta); the pdf is that term times cos(theta).
+    const float denominator = (cosTheta * alpha2 - cosTheta) * cosTheta + 1;
+    const float distribution = alpha2 / (PI * denominator * denominator);
+    return float4(halfVector, distribution * cosTheta);
+}
+
+// Builds a basis around TangentZ and returns it as the matrix rows (TangentX, TangentY, TangentZ); TangentZ is assumed to be unit length - TODO confirm.
+// [ Duff et al. 2017, "Building an Orthonormal Basis, Revisited" ] http://jcgt.org/published/0006/01/01/
+float3x3 GetTangentBasis(float3 TangentZ)
+{
+    const float Sign = TangentZ.z >= 0 ? 1 : -1; // same sign as TangentZ.z, so the sum below never reaches zero for a unit vector
+    const float a = -rcp(Sign + TangentZ.z);
+    const float b = TangentZ.x * TangentZ.y * a;
+    // Squares are written as explicit products: HLSL pow() is not defined for a negative base, and x / y can be negative.
+    float3 TangentX = { 1 + Sign * a * (TangentZ.x * TangentZ.x), Sign * b, -Sign * TangentZ.x };
+    float3 TangentY = { b, Sign + a * (TangentZ.y * TangentZ.y), -TangentZ.y };
+
+    return float3x3(TangentX, TangentY, TangentZ);
+}
+
+float3 TangentToWorld(float3 vec, float3 tangentZ) // Re-expresses vec, given relative to the basis built around tangentZ, in the space tangentZ itself is given in.
+{
+ return mul(vec, GetTangentBasis(tangentZ)); // row vector times matrix: vec.x * TangentX + vec.y * TangentY + vec.z * TangentZ
+}
+
+float4 TangentToWorld(float4 H, float3 tangentZ)
+{
+    return float4(TangentToWorld(H.xyz, tangentZ), H.w); // transform xyz through the float3 overload, pass w through untouched
+}
+
+float3 WorldToTangent(float3 vec, float3 tangentZ) // Expresses vec in the basis built around tangentZ (the reverse of TangentToWorld when that basis is orthonormal).
+{
+ return mul(GetTangentBasis(tangentZ), vec); // matrix times column vector: (dot(TangentX, vec), dot(TangentY, vec), dot(TangentZ, vec))
+}
+
+#endif // WI_STOCHASTICSSR_HF
diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h
index 8ee4592f5..5ac9cbfd7 100644
--- a/WickedEngine/wiEnums.h
+++ b/WickedEngine/wiEnums.h
@@ -300,6 +300,11 @@ enum CSTYPES
CSTYPE_POSTPROCESS_BLUR_BILATERAL_UNORM4,
CSTYPE_POSTPROCESS_SSAO,
CSTYPE_POSTPROCESS_SSR,
+ CSTYPE_POSTPROCESS_STOCHASTICSSR_RAYTRACE,
+ CSTYPE_POSTPROCESS_STOCHASTICSSR_RESOLVE,
+ CSTYPE_POSTPROCESS_STOCHASTICSSR_TEMPORAL,
+ CSTYPE_POSTPROCESS_STOCHASTICSSR_MEDIAN,
+ CSTYPE_POSTPROCESS_STOCHASTICSSR_COMBINE,
CSTYPE_POSTPROCESS_LIGHTSHAFTS,
CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL,
CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL,
diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp
index 2d7fc4cce..6422c0ad5 100644
--- a/WickedEngine/wiRenderer.cpp
+++ b/WickedEngine/wiRenderer.cpp
@@ -1333,6 +1333,11 @@ void LoadShaders()
wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_BLUR_BILATERAL_UNORM4], "blur_bilateral_unorm4CS.cso"); });
wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_SSAO], "ssaoCS.cso"); });
wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_SSR], "ssrCS.cso"); });
+ wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RAYTRACE], "stochasticSSRCS_raytrace.cso"); });
+ wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RESOLVE], "stochasticSSRCS_resolve.cso"); });
+ wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_TEMPORAL], "stochasticSSRCS_temporal.cso"); });
+ wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_MEDIAN], "stochasticSSRCS_median.cso"); });
+ wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_COMBINE], "stochasticSSRCS_combine.cso"); });
wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_LIGHTSHAFTS], "lightshaftsCS.cso"); });
wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL], "depthoffield_tileMaxCOC_horizontalCS.cso"); });
wiJobSystem::Execute(ctx, [] { LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL], "depthoffield_tileMaxCOC_verticalCS.cso"); });
@@ -8843,6 +8848,256 @@ void Postprocess_SSR(
wiProfiler::EndRange(range);
device->EventEnd(cmd);
}
+// Stochastic screen-space reflections: copies input into a mip-mapped texture, ray traces at half resolution,
+// then resolves, temporally accumulates, median-filters and combines at full resolution, writing into output.
+void Postprocess_StochasticSSR(
+    const Texture& input,
+    const Texture& depthbuffer,
+    const Texture& lineardepth_minmax,
+    const Texture& gbuffer0,
+    const Texture& gbuffer1,
+    const Texture& gbuffer2,
+    const Texture& output,
+    CommandList cmd
+)
+{
+    GraphicsDevice* device = GetDevice();
+    device->EventBegin("Postprocess_StochasticSSR", cmd);
+    auto range = wiProfiler::BeginRangeGPU("Stochastic SSR", cmd);
+
+    device->UnbindResources(TEXSLOT_RENDERPATH_SSR, 1, cmd);
+
+    const TextureDesc& desc = output.GetDesc();
+
+    static TextureDesc initialized_desc;
+    static Texture texture_main;
+    static Texture texture_raytrace;
+    static Texture texture_mask;
+    static Texture texture_resolve;
+    static Texture texture_temporal[2];
+    static Texture texture_median;
+
+    // (Re)create the intermediate textures whenever output's width or height differs from the last initialized size:
+    if (initialized_desc.Width != desc.Width || initialized_desc.Height != desc.Height)
+    {
+        initialized_desc = desc;
+
+        TextureDesc main_desc;
+        main_desc.type = TextureDesc::TEXTURE_2D;
+        main_desc.Width = desc.Width;
+        main_desc.Height = desc.Height;
+        main_desc.Format = FORMAT_R16G16B16A16_FLOAT;
+        main_desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS;
+        main_desc.MipLevels = 0; // full mip chain
+        device->CreateTexture(&main_desc, nullptr, &texture_main);
+
+        main_desc = texture_main.GetDesc(); // mip count was initialized in CreateTexture()
+        for (uint32_t i = 0; i < main_desc.MipLevels; ++i)
+        {
+            int subresource_index = device->CreateSubresource(&texture_main, SRV, 0, 1, i, 1);
+            assert(subresource_index == i);
+            subresource_index = device->CreateSubresource(&texture_main, UAV, 0, 1, i, 1);
+            assert(subresource_index == i);
+        }
+
+        TextureDesc cast_desc;
+        cast_desc.type = TextureDesc::TEXTURE_2D;
+        cast_desc.Width = desc.Width / 2;
+        cast_desc.Height = desc.Height / 2;
+        cast_desc.Format = FORMAT_R16G16B16A16_FLOAT;
+        cast_desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS;
+        device->CreateTexture(&cast_desc, nullptr, &texture_raytrace);
+        cast_desc.Format = FORMAT_R16G16_FLOAT;
+        device->CreateTexture(&cast_desc, nullptr, &texture_mask);
+
+        TextureDesc buffer_desc;
+        buffer_desc.type = TextureDesc::TEXTURE_2D;
+        buffer_desc.Width = desc.Width;
+        buffer_desc.Height = desc.Height;
+        buffer_desc.Format = FORMAT_R16G16B16A16_FLOAT;
+        buffer_desc.BindFlags = BIND_SHADER_RESOURCE | BIND_UNORDERED_ACCESS;
+        device->CreateTexture(&buffer_desc, nullptr, &texture_resolve);
+        device->CreateTexture(&buffer_desc, nullptr, &texture_temporal[0]);
+        device->CreateTexture(&buffer_desc, nullptr, &texture_temporal[1]);
+        device->CreateTexture(&buffer_desc, nullptr, &texture_median);
+    }
+
+    // This is very expensive. There is probably a better way of getting LOD of input.
+    // For now I'm just making a copy of input, to stay on the safe side.
+
+    // Main buffer copy and mip (texture_main is read by the resolve pass):
+    {
+        device->EventBegin("Main buffer pass", cmd);
+
+        CopyTexture2D(texture_main, 0, 0, 0, input, 0, cmd);
+        GenerateMipChain(texture_main, MIPGENFILTER_GAUSSIAN, cmd);
+
+        device->EventEnd(cmd);
+    }
+
+    // Switch to half res (matches the size of texture_raytrace / texture_mask)
+    PostProcessCB cb;
+    cb.xPPResolution.x = desc.Width / 2;
+    cb.xPPResolution.y = desc.Height / 2;
+    cb.xPPResolution_rcp.x = 1.0f / cb.xPPResolution.x;
+    cb.xPPResolution_rcp.y = 1.0f / cb.xPPResolution.y;
+    device->UpdateBuffer(&constantBuffers[CBTYPE_POSTPROCESS], &cb, cmd);
+    device->BindConstantBuffer(CS, &constantBuffers[CBTYPE_POSTPROCESS], CB_GETBINDSLOT(PostProcessCB), cmd);
+
+    // Raytrace pass (half resolution, writes texture_raytrace and texture_mask):
+    {
+        device->EventBegin("Stochastic Raytrace pass", cmd);
+        device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RAYTRACE], cmd);
+
+        device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd);
+        device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd);
+        device->BindResource(CS, &gbuffer2, TEXSLOT_GBUFFER2, cmd);
+        device->BindResource(CS, &input, TEXSLOT_ONDEMAND0, cmd);
+        device->BindResource(CS, &lineardepth_minmax, TEXSLOT_ONDEMAND1, cmd);
+
+        const GPUResource* uavs[] = {
+            &texture_raytrace,
+            &texture_mask,
+        };
+        device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd);
+
+        device->Dispatch(
+            (texture_raytrace.GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            (texture_raytrace.GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, // Y group count must follow Height, not Width
+            1,
+            cmd
+        );
+
+        device->Barrier(&GPUBarrier::Memory(), 1, cmd);
+        device->UnbindUAVs(0, arraysize(uavs), cmd);
+        device->EventEnd(cmd);
+    }
+
+    // Switch to full res (all remaining passes run at the size of output)
+    cb.xPPResolution.x = desc.Width;
+    cb.xPPResolution.y = desc.Height;
+    cb.xPPResolution_rcp.x = 1.0f / cb.xPPResolution.x;
+    cb.xPPResolution_rcp.y = 1.0f / cb.xPPResolution.y;
+    device->UpdateBuffer(&constantBuffers[CBTYPE_POSTPROCESS], &cb, cmd);
+    device->BindConstantBuffer(CS, &constantBuffers[CBTYPE_POSTPROCESS], CB_GETBINDSLOT(PostProcessCB), cmd);
+
+    // Resolve pass (reads the ray-trace results and the mip-mapped scene copy, writes texture_resolve):
+    {
+        device->EventBegin("Resolve pass", cmd);
+        device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_RESOLVE], cmd);
+
+        device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd);
+        device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd);
+        device->BindResource(CS, &gbuffer2, TEXSLOT_GBUFFER2, cmd);
+        device->BindResource(CS, &texture_raytrace, TEXSLOT_ONDEMAND0, cmd);
+        device->BindResource(CS, &texture_mask, TEXSLOT_ONDEMAND1, cmd);
+        device->BindResource(CS, &texture_main, TEXSLOT_ONDEMAND2, cmd);
+
+        const GPUResource* uavs[] = {
+            &texture_resolve,
+        };
+        device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd);
+
+        device->Dispatch(
+            (texture_resolve.GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            (texture_resolve.GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            1,
+            cmd
+        );
+
+        device->Barrier(&GPUBarrier::Memory(), 1, cmd);
+        device->UnbindUAVs(0, arraysize(uavs), cmd);
+        device->EventEnd(cmd);
+    }
+
+    int temporal_output = device->GetFrameCount() % 2;
+    int temporal_history = 1 - temporal_output;
+
+    // Temporal pass (the two temporal textures swap roles with the parity of the frame count):
+    {
+        device->EventBegin("Temporal pass", cmd);
+        device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_TEMPORAL], cmd);
+
+        device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd);
+        device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd);
+        device->BindResource(CS, &texture_resolve, TEXSLOT_ONDEMAND0, cmd);
+        device->BindResource(CS, &texture_temporal[temporal_history], TEXSLOT_ONDEMAND1, cmd);
+        device->BindResource(CS, &texture_raytrace, TEXSLOT_ONDEMAND2, cmd);
+
+        const GPUResource* uavs[] = {
+            &texture_temporal[temporal_output],
+        };
+        device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd);
+
+        device->Dispatch(
+            (texture_temporal[temporal_output].GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            (texture_temporal[temporal_output].GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            1,
+            cmd
+        );
+
+        device->Barrier(&GPUBarrier::Memory(), 1, cmd);
+        device->UnbindUAVs(0, arraysize(uavs), cmd);
+        device->EventEnd(cmd);
+    }
+
+    // Median blur pass (reads this frame's temporal output, writes texture_median):
+    {
+        device->EventBegin("Median blur pass", cmd);
+        device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_MEDIAN], cmd);
+
+        device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd);
+        device->BindResource(CS, &texture_temporal[temporal_output], TEXSLOT_ONDEMAND0, cmd);
+
+        const GPUResource* uavs[] = {
+            &texture_median,
+        };
+        device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd);
+
+        device->Dispatch(
+            (texture_median.GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            (texture_median.GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            1,
+            cmd
+        );
+
+        device->Barrier(&GPUBarrier::Memory(), 1, cmd);
+        device->UnbindUAVs(0, arraysize(uavs), cmd);
+        device->EventEnd(cmd);
+    }
+    //Postprocess_Blur_Bilateral(texture_temporal[temporal_output], lineardepth, texture_temp, output, cmd, 0.85f, 0.85f, 1.2f);
+
+    // Combine pass (reads texture_median and the g-buffers, writes the caller's output texture):
+    {
+        device->EventBegin("Combine pass", cmd);
+        device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_STOCHASTICSSR_COMBINE], cmd);
+
+        device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd);
+        device->BindResource(CS, &gbuffer0, TEXSLOT_GBUFFER0, cmd);
+        device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd);
+        device->BindResource(CS, &gbuffer2, TEXSLOT_GBUFFER2, cmd);
+        device->BindResource(CS, &texture_median, TEXSLOT_ONDEMAND0, cmd);
+
+        const GPUResource* uavs[] = {
+            &output,
+        };
+        device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd);
+
+        device->Dispatch(
+            (desc.Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            (desc.Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE,
+            1,
+            cmd
+        );
+
+        device->Barrier(&GPUBarrier::Memory(), 1, cmd);
+        device->UnbindUAVs(0, arraysize(uavs), cmd);
+        device->EventEnd(cmd);
+    }
+
+    wiProfiler::EndRange(range);
+    device->EventEnd(cmd);
+}
void Postprocess_SSS(
const Texture& lineardepth,
const Texture& gbuffer0,
diff --git a/WickedEngine/wiRenderer.h b/WickedEngine/wiRenderer.h
index 509887a05..2d79f8987 100644
--- a/WickedEngine/wiRenderer.h
+++ b/WickedEngine/wiRenderer.h
@@ -208,6 +208,16 @@ namespace wiRenderer
const wiGraphics::Texture& output,
wiGraphics::CommandList cmd
);
+ void Postprocess_StochasticSSR( // stochastic screen-space reflections: ray-trace, resolve, temporal, median and combine compute passes
+ const wiGraphics::Texture& input, // source scene texture: copied into a mip-mapped texture and read by the ray-trace pass
+ const wiGraphics::Texture& depthbuffer, // bound at TEXSLOT_DEPTH for every pass
+ const wiGraphics::Texture& lineardepth_minmax, // read by the ray-trace pass only
+ const wiGraphics::Texture& gbuffer0, // read by the combine pass only
+ const wiGraphics::Texture& gbuffer1, // read by the ray-trace, resolve, temporal and combine passes
+ const wiGraphics::Texture& gbuffer2, // read by the ray-trace, resolve and combine passes
+ const wiGraphics::Texture& output, // written as a UAV by the combine pass; its width/height size all intermediate textures
+ wiGraphics::CommandList cmd // command list that all of the work is recorded on
+ );
void Postprocess_SSS(
const wiGraphics::Texture& lineardepth,
const wiGraphics::Texture& gbuffer0,
diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp
index dcf1facb1..b0f45b633 100644
--- a/WickedEngine/wiVersion.cpp
+++ b/WickedEngine/wiVersion.cpp
@@ -9,7 +9,7 @@ namespace wiVersion
// minor features, major updates
const int minor = 38;
// minor bug fixes, alterations, refactors, updates
- const int revision = 7;
+ const int revision = 8;
long GetVersion()