mipgen updates

2018-08-06 23:37:03 +01:00
parent 0e9fff9d3c
commit f45f08e8ee
9 changed files with 56 additions and 40 deletions
@@ -175,14 +175,13 @@ RendererWindow::RendererWindow(wiGUI* gui, Renderable3DComponent* component) : G
 	tessellationCheckBox->SetEnabled(wiRenderer::GetDevice()->CheckCapability(wiGraphicsTypes::GraphicsDevice::GRAPHICSDEVICE_CAPABILITY_TESSELLATION));

 	advancedRefractionsCheckBox = new wiCheckBox("Advanced Refractions: ");
-	advancedRefractionsCheckBox->SetTooltip("Enable advanced refraction rendering: rough transparent materials will be more matte. This needs additional support from the graphics driver.");
+	advancedRefractionsCheckBox->SetTooltip("Enable advanced refraction rendering: rough transparent materials will be more matte.");
 	advancedRefractionsCheckBox->SetPos(XMFLOAT2(x, y += step));
 	advancedRefractionsCheckBox->OnClick([=](wiEventArgs args) {
 		wiRenderer::SetAdvancedRefractionsEnabled(args.bValue);
 	});
 	advancedRefractionsCheckBox->SetCheck(wiRenderer::GetAdvancedRefractionsEnabled());
 	rendererWindow->AddWidget(advancedRefractionsCheckBox);
-	advancedRefractionsCheckBox->SetEnabled(wiRenderer::GetDevice()->CheckCapability(wiGraphicsTypes::GraphicsDevice::GRAPHICSDEVICE_CAPABILITY_UNORDEREDACCESSTEXTURE_LOAD_FORMAT_EXT));

 	alphaCompositionCheckBox = new wiCheckBox("Alpha Composition: ");
 	alphaCompositionCheckBox->SetTooltip("Enable Alpha Composition. Enables softer alpha blending on partly solid geometry (eg. vegetation) but rendering performance will be slower.");
@@ -8,44 +8,61 @@
 TEXTURE2D(input, float4, TEXSLOT_UNIQUE0);
 RWTEXTURE2D(input_output, MIP_OUTPUT_FORMAT, 0);

-// Shader requires feature: Typed UAV additional format loads!
+static const uint TILE_BORDER = 4; // number of extra tile entries kept on each side of the thread-group block
+static const uint TILE_SIZE = TILE_BORDER + GENERATEMIPCHAIN_2D_BLOCK_SIZE + TILE_BORDER; // tile edge length: one block plus a border on both sides
+groupshared float4 tile[TILE_SIZE][TILE_SIZE]; // per-group sample cache; main indexes it as tile[x][y]
+
 [numthreads(GENERATEMIPCHAIN_2D_BLOCK_SIZE, GENERATEMIPCHAIN_2D_BLOCK_SIZE, 1)]
-void main(uint3 DTid : SV_DispatchThreadID)
+void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID) // Loads a tile of input samples into groupshared memory, runs a 9-tap horizontal pass then a 9-tap vertical pass, and writes one pixel of input_output per in-bounds thread.
 {
-#ifndef SHADERCOMPILER_SPIRV
+	uint i; // loop counter shared by the three loops below

-	// Determine if the thread is alive (it is alive when the dispatchthreadID can directly index a pixel)
-	if (DTid.x < outputResolution.x && DTid.y < outputResolution.y)
+	// Step 1: fill the groupshared tile (block plus border) with samples of input taken through sampler_linear_clamp:
+	const uint2 tile_upperleft = Gid.xy * GENERATEMIPCHAIN_2D_BLOCK_SIZE - TILE_BORDER; // NOTE(review): unsigned math - for group 0 this subtraction wraps around; confirm the sample coordinates derived from it
+	const uint2 co[] = { // 2x2 offsets: every thread loads four tile entries
+		uint2(0, 0), uint2(1, 0),
+		uint2(0, 1), uint2(1, 1)
+	};
+	for (i = 0; i < 4; ++i)
 	{
-		// Do a bilinear sample first and write it out:
-		input_output[DTid.xy] = input.SampleLevel(sampler_linear_clamp, (DTid.xy + 0.5f) / (float2)outputResolution.xy, 0);
-		DeviceMemoryBarrier();
+		const uint2 coord = GTid.xy * 2 + co[i]; // tile-local coordinate; NOTE(review): reaches 2*GENERATEMIPCHAIN_2D_BLOCK_SIZE-1, which matches TILE_SIZE exactly only for a block size of 8 - confirm
+		tile[coord.x][coord.y] = input.SampleLevel(sampler_linear_clamp, (tile_upperleft + coord + 1.0f) / (float2)outputResolution.xy, 0); // NOTE(review): the code replaced by this patch used +0.5f - confirm the +1.0f offset
+	}
+	GroupMemoryBarrierWithGroupSync(); // every tile load must finish before any thread reads its neighbours

-		uint i = 0;
-		float4 sum = 0;
+	const int2 thread_to_cache = GTid.xy + TILE_BORDER; // this thread's own entry inside the tile, past the border

-		// Gather samples in the X (horizontal) direction:
-		[unroll]
-		for (i = 0; i < 9; ++i)
-		{
-			sum += input_output[DTid.xy + uint2(gaussianOffsets[i], 0)] * gaussianWeightsNormalized[i];
-		}
-		// Write out the result of the horizontal blur:
-		DeviceMemoryBarrier();
-		input_output[DTid.xy] = sum;
-		DeviceMemoryBarrier();
-		sum = 0;
+	float4 sum = 0; // weighted accumulator, reused by both passes

-		// Gather samples in the Y (vertical) direction:
-		[unroll]
-		for (i = 0; i < 9; ++i)
-		{
-			sum += input_output[DTid.xy + uint2(0, gaussianOffsets[i])] * gaussianWeightsNormalized[i];
-		}
-		// Write out the result of the vertical blur:
-		DeviceMemoryBarrier();
-		input_output[DTid.xy] = sum;
+	// Step 2: each thread filters a single tile entry; border entries serve as neighbours only:
+
+	// Horizontal pass: nine weighted taps at x offsets gaussianOffsets[i], reaching into the border region
+	[unroll]
+	for (i = 0; i < 9; ++i)
+	{
+		const uint2 coord = thread_to_cache + int2(gaussianOffsets[i], 0);
+		sum += tile[coord.x][coord.y] * gaussianWeightsNormalized[i];
 	}

-#endif
+	// Store the horizontal result back into the tile (non-border entries only). NOTE(review): no barrier separates the reads above from this write - confirm a neighbour cannot read an already-overwritten entry:
+	tile[thread_to_cache.x][thread_to_cache.y] = sum;
+
+	GroupMemoryBarrierWithGroupSync(); // horizontal results must be visible to the group before the vertical reads
+
+	sum = 0; // restart accumulation for the second pass
+
+	// Vertical pass: the same nine taps along y. NOTE(review): top/bottom border entries were not rewritten by the horizontal pass - confirm this is intended
+	[unroll]
+	for (i = 0; i < 9; ++i)
+	{
+		const uint2 coord = thread_to_cache + int2(0, gaussianOffsets[i]);
+		sum += tile[coord.x][coord.y] * gaussianWeightsNormalized[i];
+	}
+
+
+	if (DTid.x < outputResolution.x && DTid.y < outputResolution.y) // threads outside the output resolution write nothing
+	{
+		// Each in-bounds thread writes out its one filtered pixel:
+		input_output[DTid.xy] = sum;
+	}
 }
@@ -15,6 +15,6 @@ void main(uint3 DTid : SV_DispatchThreadID)
 {
 	if (DTid.x < outputResolution.x && DTid.y < outputResolution.y)
 	{
-		output[DTid.xy] = input.SampleLevel(customsampler, (DTid.xy + 0.5f) / (float2)outputResolution.xy, 0);
+		output[DTid.xy] = input.SampleLevel(customsampler, (DTid.xy + 1.0f) / (float2)outputResolution.xy, 0);
 	}
 }
@@ -18,7 +18,7 @@ void main(uint3 DTid : SV_DispatchThreadID)
 	if (DTid.x < outputResolution.x && DTid.y < outputResolution.y && DTid.z < outputResolution.z)
 	{
 		// Do a bilinear sample first and write it out:
-		input_output[DTid] = input.SampleLevel(sampler_linear_clamp, (DTid + 0.5f) / (float3)outputResolution, 0);
+		input_output[DTid] = input.SampleLevel(sampler_linear_clamp, (DTid + 1.0f) / (float3)outputResolution, 0);
 		DeviceMemoryBarrier();

 		uint i = 0;
@@ -15,6 +15,6 @@ void main( uint3 DTid : SV_DispatchThreadID )
 {
 	if (DTid.x < outputResolution.x && DTid.y < outputResolution.y && DTid.z < outputResolution.z)
 	{
-		output[DTid] = input.SampleLevel(customsampler, (DTid + 0.5f) / (float3)outputResolution, 0);
+		output[DTid] = input.SampleLevel(customsampler, (DTid + 1.0f) / (float3)outputResolution, 0);
 	}
 }
@@ -15,7 +15,7 @@ void main(uint3 DTid : SV_DispatchThreadID)
 {
 	if (DTid.x < outputResolution.x && DTid.y < outputResolution.y)
 	{
-		float2 uv = (DTid.xy + 0.5f) / outputResolution.xy;
+		float2 uv = (DTid.xy + 1.0f) / outputResolution.xy;
 		float3 N = UV_to_CubeMap(uv, DTid.z);

 		output[uint3(DTid.xy, DTid.z + arrayIndex * 6)] = input.SampleLevel(customsampler, float4(N, arrayIndex), 0);
@@ -15,7 +15,7 @@ void main(uint3 DTid : SV_DispatchThreadID)
 {
 	if (DTid.x < outputResolution.x && DTid.y < outputResolution.y)
 	{
-		float2 uv = (DTid.xy + 0.5f) / outputResolution.xy;
+		float2 uv = (DTid.xy + 1.0f) / outputResolution.xy;
 		float3 N = UV_to_CubeMap(uv, DTid.z);

 		output[DTid.xyz] = input.SampleLevel(customsampler, N, 0);
@@ -19,7 +19,7 @@ static const float gaussianWeightsNormalized[9] = {
 	gaussWeight3 * gaussNormalization,
 	gaussWeight4 * gaussNormalization,
 };
-static const uint gaussianOffsets[9] = {
+static const int gaussianOffsets[9] = { // signed element type: the table holds negative tap offsets
 	-4, -3, -2, -1, 0, 1, 2, 3, 4
 };

@@ -9,7 +9,7 @@ namespace wiVersion
 	// minor features, major updates
 	const int minor = 19;
 	// minor bug fixes, alterations, refactors, updates
-	const int revision = 2;
+	const int revision = 3;


 	long GetVersion()