From fb8985cf44b14663fd35836bc7db75fc9b3bf1fc Mon Sep 17 00:00:00 2001 From: Turanszki Janos Date: Sun, 20 Sep 2020 16:14:42 +0200 Subject: [PATCH] rtao and vulkan mesh shader update --- WickedEngine/ShaderInterop_Postprocess.h | 1 + WickedEngine/Shaders_SOURCE.vcxitems | 11 +++ WickedEngine/Shaders_SOURCE.vcxitems.filters | 3 + WickedEngine/compile_shaders_hlsl6.py | 11 +-- WickedEngine/compile_shaders_spirv.py | 15 ++-- WickedEngine/emittedparticleMS.hlsl | 4 + WickedEngine/rtaoLIB.hlsl | 16 ++-- WickedEngine/rtao_temporalCS.hlsl | 87 ++++++++++++++++++++ WickedEngine/ssr_temporalCS.hlsl | 7 +- WickedEngine/wiEnums.h | 1 + WickedEngine/wiGraphicsDevice_Vulkan.cpp | 2 +- WickedEngine/wiRenderer.cpp | 61 +++++++++++--- WickedEngine/wiVersion.cpp | 2 +- 13 files changed, 183 insertions(+), 38 deletions(-) create mode 100644 WickedEngine/rtao_temporalCS.hlsl diff --git a/WickedEngine/ShaderInterop_Postprocess.h b/WickedEngine/ShaderInterop_Postprocess.h index 3d17b2855..e8c927d6c 100644 --- a/WickedEngine/ShaderInterop_Postprocess.h +++ b/WickedEngine/ShaderInterop_Postprocess.h @@ -27,6 +27,7 @@ CBUFFER(PostProcessCB, CBSLOT_RENDERER_POSTPROCESS) #define rtao_range ssao_range #define rtao_samplecount ssao_samplecount #define rtao_power ssao_power +#define rtao_seed xPPParams0.w static const uint POSTPROCESS_HBAO_THREADCOUNT = 320; #define hbao_direction xPPParams0.xy diff --git a/WickedEngine/Shaders_SOURCE.vcxitems b/WickedEngine/Shaders_SOURCE.vcxitems index 489cf43d3..d2d02f768 100644 --- a/WickedEngine/Shaders_SOURCE.vcxitems +++ b/WickedEngine/Shaders_SOURCE.vcxitems @@ -616,6 +616,17 @@ Pixel Pixel + + Compute + 5.0 + Compute + Compute + Compute + Compute + Compute + Compute + Compute + Compute 4.0 diff --git a/WickedEngine/Shaders_SOURCE.vcxitems.filters b/WickedEngine/Shaders_SOURCE.vcxitems.filters index 5d031c8ed..e9e221faf 100644 --- a/WickedEngine/Shaders_SOURCE.vcxitems.filters +++ b/WickedEngine/Shaders_SOURCE.vcxitems.filters @@ -1019,5 +1019,8 @@ CS + + CS + \ No newline at end of file diff --git a/WickedEngine/compile_shaders_hlsl6.py b/WickedEngine/compile_shaders_hlsl6.py index 62dc77a86..0f3c1fc76 100644 --- a/WickedEngine/compile_shaders_hlsl6.py +++ b/WickedEngine/compile_shaders_hlsl6.py @@ -48,13 +48,14 @@ for item in root.iter(): cmd += "as" cmd += "_6_5 " + + cmd += " -Fo " + "shaders/" + outputdir + "/" + os.path.splitext(name)[0] + ".cso " - #cmd += "-D RAYTRACING_INLINE " - #cmd += "-D RAYTRACING_TIER_1_1 " + cmd += " -flegacy-macro-expansion " - cmd += "-D HLSL6 " - - cmd += "-flegacy-macro-expansion -Fo " + "shaders/" + outputdir + "/" + os.path.splitext(name)[0] + ".cso " + cmd += " -D HLSL6 " + #cmd += " -D RAYTRACING_INLINE " + #cmd += " -D RAYTRACING_GEOMETRYINDEX " print(cmd) diff --git a/WickedEngine/compile_shaders_spirv.py b/WickedEngine/compile_shaders_spirv.py index 7cda6644c..010f45329 100644 --- a/WickedEngine/compile_shaders_spirv.py +++ b/WickedEngine/compile_shaders_spirv.py @@ -48,13 +48,12 @@ for item in root.iter(): cmd += "_6_5 " - #cmd += "-D RAYTRACING_INLINE " + cmd += " -Fo " + "shaders/" + outputdir + "/" + os.path.splitext(name)[0] + ".cso " - cmd += "-D SPIRV " - - cmd += " -spirv -fvk-use-dx-layout -flegacy-macro-expansion -Fo " + "shaders/" + outputdir + "/" + os.path.splitext(name)[0] + ".cso " - - cmd += "-fspv-target-env=vulkan1.2 " + cmd += " -spirv " + cmd += " -fspv-target-env=vulkan1.2 " + cmd += " -fvk-use-dx-layout " + cmd += " -flegacy-macro-expansion " if profile == "VS" or profile == "DS" or profile == "GS": cmd += " -fvk-invert-y " @@ -64,6 +63,10 @@ for item in root.iter(): cmd += " -fvk-u-shift 2000 all " cmd += " -fvk-s-shift 3000 all " + cmd += " -D SPIRV " + #cmd += " -D RAYTRACING_INLINE " + #cmd += " -D RAYTRACING_GEOMETRYINDEX " + print(cmd) try: diff --git a/WickedEngine/emittedparticleMS.hlsl b/WickedEngine/emittedparticleMS.hlsl index 8a7c91816..0ccb95d43 100644 --- a/WickedEngine/emittedparticleMS.hlsl +++ b/WickedEngine/emittedparticleMS.hlsl @@ -98,6 +98,10 @@ void main( Out.P = mul(g_xCamera_InvV, float4(Out.pos.xyz, 1)).xyz; Out.pos = mul(g_xCamera_Proj, Out.pos); +#ifdef SPIRV + Out.pos.y = -Out.pos.y; +#endif // SPIRV + Out.tex = float4(uv, uv2); Out.size = size; Out.color = (particle.color_mirror & 0x00FFFFFF) | (uint(opacity * 255.0f) << 24); diff --git a/WickedEngine/rtaoLIB.hlsl b/WickedEngine/rtaoLIB.hlsl index 92e5cec83..cb62524c6 100644 --- a/WickedEngine/rtaoLIB.hlsl +++ b/WickedEngine/rtaoLIB.hlsl @@ -4,13 +4,13 @@ RWTEXTURE2D(output, unorm float, 0); -#ifdef RAYTRACING_TIER_1_1 +#ifdef RAYTRACING_GEOMETRYINDEX ConstantBuffer subsets_material[MAX_DESCRIPTOR_INDEXING] : register(b0, space1); Texture2D subsets_texture_baseColor[MAX_DESCRIPTOR_INDEXING] : register(t0, space1); Buffer subsets_indexBuffer[MAX_DESCRIPTOR_INDEXING] : register(t100000, space1); Buffer subsets_vertexBuffer_UV0[MAX_DESCRIPTOR_INDEXING] : register(t300000, space1); Buffer subsets_vertexBuffer_UV1[MAX_DESCRIPTOR_INDEXING] : register(t400000, space1); -#endif // RAYTRACING_TIER_1_1 +#endif // RAYTRACING_GEOMETRYINDEX typedef BuiltInTriangleIntersectionAttributes MyAttributes; struct RayPayload @@ -42,7 +42,7 @@ void RTAO_Raygen() float3 N = normalize(cross(P1 - P0, P2 - P0)); - float seed = 666; + float seed = rtao_seed; RayDesc ray; ray.TMin = 0.001; @@ -55,10 +55,10 @@ void RTAO_Raygen() { ray.Direction = SampleHemisphere_cos(N, seed, uv); TraceRay(scene_acceleration_structure, -#ifndef RAYTRACING_TIER_1_1 // tier 1_0 method of alpha test without GeometryIndex() is not implemented yet +#ifndef RAYTRACING_GEOMETRYINDEX // tier 1_0 method of alpha test without GeometryIndex() is not implemented yet RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | -#endif // RAYTRACING_TIER_1_1 +#endif // RAYTRACING_GEOMETRYINDEX RAY_FLAG_SKIP_CLOSEST_HIT_SHADER , ~0, 0, 1, 0, ray, payload); } @@ -76,13 +76,13 @@ void RTAO_ClosestHit(inout RayPayload payload, in MyAttributes attr) [shader("anyhit")] void RTAO_AnyHit(inout RayPayload payload, in MyAttributes attr) { -#ifdef RAYTRACING_TIER_1_1 +#ifdef RAYTRACING_GEOMETRYINDEX float u = attr.barycentrics.x; float v = attr.barycentrics.y; float w = 1 - u - v; uint primitiveIndex = PrimitiveIndex(); uint geometryOffset = InstanceID(); - uint geometryIndex = GeometryIndex(); // requires tier_1_1!! + uint geometryIndex = GeometryIndex(); // requires tier_1_1 GeometryIndex feature!! uint descriptorIndex = geometryOffset + geometryIndex; ShaderMaterial material = subsets_material[descriptorIndex]; uint i0 = subsets_indexBuffer[descriptorIndex][primitiveIndex / 3 + 0]; @@ -114,7 +114,7 @@ void RTAO_AnyHit(inout RayPayload payload, in MyAttributes attr) { payload.color += 1 - baseColor.a; } -#endif // RAYTRACING_TIER_1_1 +#endif // RAYTRACING_GEOMETRYINDEX } [shader("miss")] diff --git a/WickedEngine/rtao_temporalCS.hlsl b/WickedEngine/rtao_temporalCS.hlsl new file mode 100644 index 000000000..10efeb6db --- /dev/null +++ b/WickedEngine/rtao_temporalCS.hlsl @@ -0,0 +1,87 @@ +#include "globals.hlsli" +#include "stochasticSSRHF.hlsli" +#include "ShaderInterop_Postprocess.h" + +TEXTURE2D(resolve_current, float, TEXSLOT_ONDEMAND0); +TEXTURE2D(resolve_history, float, TEXSLOT_ONDEMAND1); + +RWTEXTURE2D(output, unorm float, 0); + +static const float temporalResponseMin = 0.85; +static const float temporalResponseMax = 1.0f; +static const float temporalScale = 2.0; +static const float temporalExposure = 10.0f; + +inline void ResolverAABB(Texture2D currentColor, SamplerState currentSampler, float sharpness, float exposureScale, float AABBScale, float2 uv, float2 texelSize, inout float currentMin, inout float currentMax, inout float currentAverage, inout float currentOutput) +{ + const int2 SampleOffset[9] = { int2(-1.0, -1.0), int2(0.0, -1.0), int2(1.0, -1.0), int2(-1.0, 0.0), int2(0.0, 0.0), int2(1.0, 0.0), int2(-1.0, 1.0), int2(0.0, 1.0), int2(1.0, 1.0) }; + + float sampleColors[9]; + [unroll] + for (uint i = 0; i < 9; i++) + { + sampleColors[i] = currentColor.SampleLevel(currentSampler, uv + (SampleOffset[i] / texelSize), 0.0f); + } + + // Variance Clipping (AABB) + + float m1 = 0.0; + float m2 = 0.0; + [unroll] + for (uint x = 0; x < 9; x++) + { + m1 += sampleColors[x]; + m2 += sampleColors[x] * sampleColors[x]; + } + + float mean = m1 / 9.0; + float stddev = sqrt((m2 / 9.0) - sqr(mean)); + + currentMin = mean - AABBScale * stddev; + currentMax = mean + AABBScale * stddev; + + currentOutput = sampleColors[4]; + currentMin = min(currentMin, currentOutput); + currentMax = max(currentMax, currentOutput); + currentAverage = mean; +} + +[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] +void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) +{ + const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; + const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0); + + float4 pos = float4(reconstructPosition(uv, depth, g_xCamera_InvVP), 1.0f); + + float4 thisClip = mul(g_xCamera_VP, pos); + float4 prevClip = mul(g_xFrame_MainCamera_PrevVP, pos); + + float2 thisScreen = thisClip.xy * rcp(thisClip.w); + float2 prevScreen = prevClip.xy * rcp(prevClip.w); + thisScreen = thisScreen.xy * float2(0.5, -0.5) + 0.5; + prevScreen = prevScreen.xy * float2(0.5, -0.5) + 0.5; + + float2 velocity = thisScreen - prevScreen; + + float2 prevUV = uv - velocity; + + float previous = resolve_history.SampleLevel(sampler_linear_clamp, prevUV, 0); + + float current = 0; + float currentMin, currentMax, currentAverage; + ResolverAABB(resolve_current, sampler_linear_clamp, 0, temporalExposure, temporalScale, uv, xPPResolution, currentMin, currentMax, currentAverage, current); + + float lumDifference = abs(current - previous) / max(current, max(previous, 0.2f)); + float lumWeight = sqr(1.0f - lumDifference); + float blendFinal = lerp(temporalResponseMin, temporalResponseMax, lumWeight); + + // Reduce ghosting by refreshing the blend by velocity (Unreal) + float2 velocityScreen = velocity * xPPResolution; + float velocityBlend = sqrt(dot(velocityScreen, velocityScreen)); + blendFinal = lerp(blendFinal, 0.2, saturate(velocityBlend / 100.0)); + + float result = lerp(current, previous, blendFinal); + + output[DTid.xy] = result; +} diff --git a/WickedEngine/ssr_temporalCS.hlsl b/WickedEngine/ssr_temporalCS.hlsl index ac87a315d..7c6b75d5b 100644 --- a/WickedEngine/ssr_temporalCS.hlsl +++ b/WickedEngine/ssr_temporalCS.hlsl @@ -92,6 +92,7 @@ inline void ResolverAABB(Texture2D currentColor, SamplerState currentSam float2 CalculateCustomMotion(float depth, float2 uv) { + // Velocity buffer not good, because that contains object motion, and reflection is camera relative float4 sampleWorldPosition = float4(reconstructPosition(uv, depth, g_xCamera_InvVP), 1.0f); float4 thisClip = mul(g_xCamera_VP, sampleWorldPosition); @@ -110,12 +111,6 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint3 { const float2 uv = (DTid.xy + 0.5f) * xPPResolution_rcp; const float depth = texture_depth.SampleLevel(sampler_point_clamp, uv, 0); - - const float3 worldNormal = decodeNormal(texture_gbuffer1.SampleLevel(sampler_point_clamp, uv, 0).xy); - - //float4 raytraceSource = texture_raytrace.SampleLevel(sampler_point_clamp, uv, 0); - //float hitDepth = raytraceSource.z; - //float2 hitPixel = raytraceSource.xy; // Normal velocity seems to work best in most scenarios float2 customVelocity = CalculateCustomMotion(depth, uv); diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h index ecf60c7ba..4db46e00a 100644 --- a/WickedEngine/wiEnums.h +++ b/WickedEngine/wiEnums.h @@ -333,6 +333,7 @@ enum CSTYPES CSTYPE_POSTPROCESS_SSR_RESOLVE, CSTYPE_POSTPROCESS_SSR_TEMPORAL, CSTYPE_POSTPROCESS_SSR_MEDIAN, + CSTYPE_POSTPROCESS_RTAO_TEMPORAL, CSTYPE_POSTPROCESS_LIGHTSHAFTS, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_HORIZONTAL, CSTYPE_POSTPROCESS_DEPTHOFFIELD_TILEMAXCOC_VERTICAL, diff --git a/WickedEngine/wiGraphicsDevice_Vulkan.cpp b/WickedEngine/wiGraphicsDevice_Vulkan.cpp index 94a015b30..04c652900 100644 --- a/WickedEngine/wiGraphicsDevice_Vulkan.cpp +++ b/WickedEngine/wiGraphicsDevice_Vulkan.cpp @@ -2380,7 +2380,7 @@ using namespace Vulkan_Internal; if (mesh_shader_features.meshShader == VK_TRUE && mesh_shader_features.taskShader == VK_TRUE) { - // Currently, creating pipeline state with mesh shader crashes nvidia driver for me, so disable until solved + // Enable mesh shader here (problematic with certain driver versions, disabled by default): //MESH_SHADER = true; } diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index 7ede9f7e7..81fc59f40 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -10467,6 +10467,9 @@ void Postprocess_RTAO( rtdesc.max_attribute_size_in_bytes = sizeof(XMFLOAT2); // bary success = device->CreateRaytracingPipelineState(&rtdesc, &RTPSO); assert(success); + + success = LoadShader(CS, computeShaders[CSTYPE_POSTPROCESS_RTAO_TEMPORAL], "rtao_temporalCS.cso"); + assert(success); }; static wiEvent::Handle handle = wiEvent::Subscribe(SYSTEM_EVENT_RELOAD_SHADERS, load_shaders); @@ -10476,8 +10479,8 @@ void Postprocess_RTAO( } static TextureDesc saved_desc; - static Texture temp0; - static Texture temp1; + static Texture temp; + static Texture temporal[2]; const TextureDesc& lineardepth_desc = lineardepth.GetDesc(); if (saved_desc.Width != lineardepth_desc.Width || saved_desc.Height != lineardepth_desc.Height) @@ -10489,13 +10492,16 @@ void Postprocess_RTAO( desc.Format = FORMAT_R8_UNORM; desc.Width = (desc.Width + 1) / 2; desc.Height = (desc.Height + 1) / 2; - device->CreateTexture(&desc, nullptr, &temp0); - device->SetName(&temp0, "rtao_temp0"); - device->CreateTexture(&desc, nullptr, &temp1); - device->SetName(&temp1, "rtao_temp1"); + device->CreateTexture(&desc, nullptr, &temp); + device->SetName(&temp, "rtao_temp"); + + device->CreateTexture(&desc, nullptr, &temporal[0]); + device->SetName(&temporal[0], "rtao_temporal[0]"); + device->CreateTexture(&desc, nullptr, &temporal[1]); + device->SetName(&temporal[1], "rtao_temporal[1]"); } - const TextureDesc& desc = temp0.GetDesc(); + const TextureDesc& desc = temp.GetDesc(); PostProcessCB cb; cb.xPPResolution.x = desc.Width; @@ -10505,13 +10511,14 @@ void Postprocess_RTAO( cb.rtao_range = range; cb.rtao_samplecount = (float)samplecount; cb.rtao_power = power; + cb.rtao_seed = renderTime; GraphicsDevice::GPUAllocation cb_alloc = device->AllocateGPU(sizeof(cb), cmd); memcpy(cb_alloc.data, &cb, sizeof(cb)); device->BindRaytracingPipelineState(&RTPSO, cmd); device->WriteDescriptor(&descriptorTable, 0, 0, &depthbuffer); device->WriteDescriptor(&descriptorTable, 1, 0, &scene.TLAS); - device->WriteDescriptor(&descriptorTable, 2, 0, &temp0); + device->WriteDescriptor(&descriptorTable, 2, 0, &temp); device->BindDescriptorTable(RAYTRACING, 0, &descriptorTable, cmd); device->BindDescriptorTable(RAYTRACING, 1, &scene.descriptorTable, cmd); device->BindRootDescriptor(RAYTRACING, 0, &constantBuffers[CBTYPE_CAMERA], 0, cmd); @@ -10551,8 +10558,41 @@ void Postprocess_RTAO( }; device->Barrier(barriers, arraysize(barriers), cmd); - Postprocess_Blur_Bilateral(temp0, lineardepth, temp1, temp0, cmd, 1.2f, -1, -1, true); - Postprocess_Upsample_Bilateral(temp0, lineardepth, output, cmd); + int temporal_output = device->GetFrameCount() % 2; + int temporal_history = 1 - temporal_output; + + // Temporal pass: + { + device->EventBegin("Temporal pass", cmd); + device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_RTAO_TEMPORAL], cmd); + + device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd); + device->BindResource(CS, &temp, TEXSLOT_ONDEMAND0, cmd); + device->BindResource(CS, &temporal[temporal_history], TEXSLOT_ONDEMAND1, cmd); + + const GPUResource* uavs[] = { + &temporal[temporal_output], + }; + device->BindUAVs(CS, uavs, 0, arraysize(uavs), cmd); + + device->Dispatch( + (temporal[temporal_output].GetDesc().Width + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + (temporal[temporal_output].GetDesc().Height + POSTPROCESS_BLOCKSIZE - 1) / POSTPROCESS_BLOCKSIZE, + 1, + cmd + ); + + GPUBarrier barriers[] = { + GPUBarrier::Memory(), + }; + device->Barrier(barriers, arraysize(barriers), cmd); + + device->UnbindUAVs(0, arraysize(uavs), cmd); + device->EventEnd(cmd); + } + + Postprocess_Blur_Bilateral(temporal[temporal_output], lineardepth, temp, temporal[temporal_output], cmd, 1.2f, -1, -1, true); + Postprocess_Upsample_Bilateral(temporal[temporal_output], lineardepth, output, cmd); wiProfiler::EndRange(prof_range); device->EventEnd(cmd); @@ -10702,7 +10742,6 @@ void Postprocess_SSR( device->EventBegin("Temporal pass", cmd); device->BindComputeShader(&computeShaders[CSTYPE_POSTPROCESS_SSR_TEMPORAL], cmd); - device->BindResource(CS, &gbuffer1, TEXSLOT_GBUFFER1, cmd); device->BindResource(CS, &depthbuffer, TEXSLOT_DEPTH, cmd); device->BindResource(CS, &texture_resolve, TEXSLOT_ONDEMAND0, cmd); device->BindResource(CS, &texture_temporal[temporal_history], TEXSLOT_ONDEMAND1, cmd); diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index bde2dae4a..9d95639dc 100644 --- a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wiVersion // minor features, major updates, breaking API changes const int minor = 47; // minor bug fixes, alterations, refactors, updates - const int revision = 34; + const int revision = 35; const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);