From 50389bf7535d97459ad548df1de33f8f4e17673f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tur=C3=A1nszki=20J=C3=A1nos?= Date: Wed, 24 May 2023 08:10:29 +0200 Subject: [PATCH] terrain: added faster block compressor --- WickedEngine/shaders/BlockCompress.hlsli | 597 ++++ WickedEngine/shaders/Shaders_SOURCE.vcxitems | 1 + .../shaders/Shaders_SOURCE.vcxitems.filters | 3 + .../shaders/compressonator/bcn_common_api.h | 1408 -------- .../compressonator/bcn_common_kernel.h | 2956 ----------------- .../shaders/compressonator/common_def.h | 2282 ------------- .../terrainVirtualTextureUpdateCS.hlsl | 32 +- WickedEngine/wiVersion.cpp | 2 +- third_party_software.txt | 27 - 9 files changed, 615 insertions(+), 6693 deletions(-) create mode 100644 WickedEngine/shaders/BlockCompress.hlsli delete mode 100644 WickedEngine/shaders/compressonator/bcn_common_api.h delete mode 100644 WickedEngine/shaders/compressonator/bcn_common_kernel.h delete mode 100644 WickedEngine/shaders/compressonator/common_def.h diff --git a/WickedEngine/shaders/BlockCompress.hlsli b/WickedEngine/shaders/BlockCompress.hlsli new file mode 100644 index 000000000..ff126fe76 --- /dev/null +++ b/WickedEngine/shaders/BlockCompress.hlsli @@ -0,0 +1,597 @@ +//-------------------------------------------------------------------------------------- +// BlockCompress.hlsli +// +// Helper functions for block compression +// +// Advanced Technology Group (ATG) +// Copyright (C) Microsoft Corporation. All rights reserved. 
+// +//-------------------------------------------------------------------------------------- + +#define BlockCompressRS \ + "RootFlags ( DENY_VERTEX_SHADER_ROOT_ACCESS |" \ + " DENY_DOMAIN_SHADER_ROOT_ACCESS |" \ + " DENY_GEOMETRY_SHADER_ROOT_ACCESS |" \ + " DENY_HULL_SHADER_ROOT_ACCESS )," \ + "CBV(b0, visibility=SHADER_VISIBILITY_ALL)," \ + "DescriptorTable(SRV(t0, numDescriptors=1), visibility=SHADER_VISIBILITY_ALL)," \ + "DescriptorTable(UAV(u0, numDescriptors=1), visibility=SHADER_VISIBILITY_ALL)," \ + "DescriptorTable(UAV(u1, numDescriptors=1), visibility=SHADER_VISIBILITY_ALL)," \ + "DescriptorTable(UAV(u2, numDescriptors=1), visibility=SHADER_VISIBILITY_ALL)," \ + "DescriptorTable(UAV(u3, numDescriptors=1), visibility=SHADER_VISIBILITY_ALL)," \ + "DescriptorTable(UAV(u4, numDescriptors=1), visibility=SHADER_VISIBILITY_ALL)," \ + "StaticSampler(s0, " \ + " filter = FILTER_MIN_MAG_MIP_POINT," \ + " addressU = TEXTURE_ADDRESS_CLAMP," \ + " addressV = TEXTURE_ADDRESS_CLAMP," \ + " addressW = TEXTURE_ADDRESS_CLAMP," \ + " visibility = SHADER_VISIBILITY_ALL)" + +#define COMPRESS_ONE_MIP_THREADGROUP_WIDTH 8 +#define COMPRESS_TWO_MIPS_THREADGROUP_WIDTH 16 + +#define MIP1_BLOCKS_PER_ROW 8 + +// Constant buffer for block compression shaders +cbuffer BlockCompressCB : register(b0) +{ + float g_oneOverTextureWidth; +} + +// CUSTOMBUILD : warning X4714: sum of temp registers and indexable temp registers times 256 threads exceeds the recommended total 16384. 
Performance may be reduced +// This warning shows up in Debug mode due to the complexity of the unoptimized shaders, but it's harmless aside from the fact that the shaders will be slow in Debug +#pragma warning(disable: 4714) + +//-------------------------------------------------------------------------------------- +// Name: ColorTo565 +// Desc: Pack a 3-component color into a uint +//-------------------------------------------------------------------------------------- +uint ColorTo565(float3 color) +{ + uint3 rgb = round(color * float3(31.0f, 63.0f, 31.0f)); + return (rgb.r << 11) | (rgb.g << 5) | rgb.b; +} + + +//-------------------------------------------------------------------------------------- +// Name: TexelToUV +// Desc: Convert from a texel to the UV coordinates used in a Gather call +//-------------------------------------------------------------------------------------- +float2 TexelToUV(float2 texel, float oneOverTextureWidth) +{ + // We Gather from the bottom-right corner of the texel + return (texel + 1.0f) * oneOverTextureWidth; +} + + +//-------------------------------------------------------------------------------------- +// Name: LoadTexelsRGB +// Desc: Load the 16 RGB texels that form a block +//-------------------------------------------------------------------------------------- +void LoadTexelsRGB(Texture2D tex, SamplerState samp, float oneOverTextureWidth, uint2 threadIDWithinDispatch, out float3 block[16]) +{ + float2 uv = TexelToUV(float2(threadIDWithinDispatch * 4), oneOverTextureWidth); + + float4 red = tex.GatherRed(samp, uv, int2(0, 0)); + float4 green = tex.GatherGreen(samp, uv, int2(0, 0)); + float4 blue = tex.GatherBlue(samp, uv, int2(0, 0)); + block[0] = float3(red[3], green[3], blue[3]); + block[1] = float3(red[2], green[2], blue[2]); + block[4] = float3(red[0], green[0], blue[0]); + block[5] = float3(red[1], green[1], blue[1]); + + red = tex.GatherRed(samp, uv, int2(2, 0)); + green = tex.GatherGreen(samp, uv, int2(2, 0)); + 
blue = tex.GatherBlue(samp, uv, int2(2, 0)); + block[2] = float3(red[3], green[3], blue[3]); + block[3] = float3(red[2], green[2], blue[2]); + block[6] = float3(red[0], green[0], blue[0]); + block[7] = float3(red[1], green[1], blue[1]); + + red = tex.GatherRed(samp, uv, int2(0, 2)); + green = tex.GatherGreen(samp, uv, int2(0, 2)); + blue = tex.GatherBlue(samp, uv, int2(0, 2)); + block[8] = float3(red[3], green[3], blue[3]); + block[9] = float3(red[2], green[2], blue[2]); + block[12] = float3(red[0], green[0], blue[0]); + block[13] = float3(red[1], green[1], blue[1]); + + red = tex.GatherRed(samp, uv, int2(2, 2)); + green = tex.GatherGreen(samp, uv, int2(2, 2)); + blue = tex.GatherBlue(samp, uv, int2(2, 2)); + block[10] = float3(red[3], green[3], blue[3]); + block[11] = float3(red[2], green[2], blue[2]); + block[14] = float3(red[0], green[0], blue[0]); + block[15] = float3(red[1], green[1], blue[1]); +} + + +//-------------------------------------------------------------------------------------- +// Name: LoadTexelsRGBBias +// Desc: Load the 16 RGB texels that form a block, with a mip bias +//-------------------------------------------------------------------------------------- +void LoadTexelsRGBBias(Texture2D tex, SamplerState samp, float oneOverTextureSize, uint2 threadIDWithinDispatch, uint mipBias, out float3 block[16]) +{ + // We need to use Sample rather than Gather/Load for the Bias functions, because low mips will read outside + // the texture boundary. 
When reading outside the boundary, Gather/Load return 0, but Sample can clamp + float2 location = float2(threadIDWithinDispatch * 4) * oneOverTextureSize; + block[0] = tex.SampleLevel(samp, location, mipBias, int2(0, 0)).rgb; + block[1] = tex.SampleLevel(samp, location, mipBias, int2(1, 0)).rgb; + block[2] = tex.SampleLevel(samp, location, mipBias, int2(2, 0)).rgb; + block[3] = tex.SampleLevel(samp, location, mipBias, int2(3, 0)).rgb; + block[4] = tex.SampleLevel(samp, location, mipBias, int2(0, 1)).rgb; + block[5] = tex.SampleLevel(samp, location, mipBias, int2(1, 1)).rgb; + block[6] = tex.SampleLevel(samp, location, mipBias, int2(2, 1)).rgb; + block[7] = tex.SampleLevel(samp, location, mipBias, int2(3, 1)).rgb; + block[8] = tex.SampleLevel(samp, location, mipBias, int2(0, 2)).rgb; + block[9] = tex.SampleLevel(samp, location, mipBias, int2(1, 2)).rgb; + block[10] = tex.SampleLevel(samp, location, mipBias, int2(2, 2)).rgb; + block[11] = tex.SampleLevel(samp, location, mipBias, int2(3, 2)).rgb; + block[12] = tex.SampleLevel(samp, location, mipBias, int2(0, 3)).rgb; + block[13] = tex.SampleLevel(samp, location, mipBias, int2(1, 3)).rgb; + block[14] = tex.SampleLevel(samp, location, mipBias, int2(2, 3)).rgb; + block[15] = tex.SampleLevel(samp, location, mipBias, int2(3, 3)).rgb; +} + + +//-------------------------------------------------------------------------------------- +// Name: LoadTexelsRGBA +// Desc: Load the 16 RGBA texels that form a block +//-------------------------------------------------------------------------------------- +void LoadTexelsRGBA(Texture2D tex, uint2 threadIDWithinDispatch, out float3 blockRGB[16], out float blockA[16]) +{ + float4 rgba; + int3 location = int3(threadIDWithinDispatch * 4, 0); + rgba = tex.Load(location, int2(0, 0)); blockRGB[0] = rgba.rgb; blockA[0] = rgba.a; + rgba = tex.Load(location, int2(1, 0)); blockRGB[1] = rgba.rgb; blockA[1] = rgba.a; + rgba = tex.Load(location, int2(2, 0)); blockRGB[2] = rgba.rgb; blockA[2] = 
rgba.a; + rgba = tex.Load(location, int2(3, 0)); blockRGB[3] = rgba.rgb; blockA[3] = rgba.a; + rgba = tex.Load(location, int2(0, 1)); blockRGB[4] = rgba.rgb; blockA[4] = rgba.a; + rgba = tex.Load(location, int2(1, 1)); blockRGB[5] = rgba.rgb; blockA[5] = rgba.a; + rgba = tex.Load(location, int2(2, 1)); blockRGB[6] = rgba.rgb; blockA[6] = rgba.a; + rgba = tex.Load(location, int2(3, 1)); blockRGB[7] = rgba.rgb; blockA[7] = rgba.a; + rgba = tex.Load(location, int2(0, 2)); blockRGB[8] = rgba.rgb; blockA[8] = rgba.a; + rgba = tex.Load(location, int2(1, 2)); blockRGB[9] = rgba.rgb; blockA[9] = rgba.a; + rgba = tex.Load(location, int2(2, 2)); blockRGB[10] = rgba.rgb; blockA[10] = rgba.a; + rgba = tex.Load(location, int2(3, 2)); blockRGB[11] = rgba.rgb; blockA[11] = rgba.a; + rgba = tex.Load(location, int2(0, 3)); blockRGB[12] = rgba.rgb; blockA[12] = rgba.a; + rgba = tex.Load(location, int2(1, 3)); blockRGB[13] = rgba.rgb; blockA[13] = rgba.a; + rgba = tex.Load(location, int2(2, 3)); blockRGB[14] = rgba.rgb; blockA[14] = rgba.a; + rgba = tex.Load(location, int2(3, 3)); blockRGB[15] = rgba.rgb; blockA[15] = rgba.a; +} + + +//-------------------------------------------------------------------------------------- +// Name: LoadTexelsRGBABias +// Desc: Load the 16 RGBA texels that form a block, with a mip bias +//-------------------------------------------------------------------------------------- +void LoadTexelsRGBABias(Texture2D tex, SamplerState samp, float oneOverTextureSize, uint2 threadIDWithinDispatch, uint mipBias, out float3 blockRGB[16], out float blockA[16]) +{ + // We need to use Sample rather than Gather/Load for the Bias functions, because low mips will read outside + // the texture boundary. 
When reading outside the boundary, Gather/Load return 0, but Sample will clamp + float4 rgba; + float2 location = float2(threadIDWithinDispatch * 4) * oneOverTextureSize; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 0)); blockRGB[0] = rgba.rgb; blockA[0] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 0)); blockRGB[1] = rgba.rgb; blockA[1] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 0)); blockRGB[2] = rgba.rgb; blockA[2] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 0)); blockRGB[3] = rgba.rgb; blockA[3] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 1)); blockRGB[4] = rgba.rgb; blockA[4] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 1)); blockRGB[5] = rgba.rgb; blockA[5] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 1)); blockRGB[6] = rgba.rgb; blockA[6] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 1)); blockRGB[7] = rgba.rgb; blockA[7] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 2)); blockRGB[8] = rgba.rgb; blockA[8] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 2)); blockRGB[9] = rgba.rgb; blockA[9] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 2)); blockRGB[10] = rgba.rgb; blockA[10] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 2)); blockRGB[11] = rgba.rgb; blockA[11] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 3)); blockRGB[12] = rgba.rgb; blockA[12] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 3)); blockRGB[13] = rgba.rgb; blockA[13] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 3)); blockRGB[14] = rgba.rgb; blockA[14] = rgba.a; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 3)); blockRGB[15] = rgba.rgb; blockA[15] = rgba.a; +} + + +//-------------------------------------------------------------------------------------- +// 
Name: LoadTexelsUV +// Desc: Load the 16 UV texels that form a block +//-------------------------------------------------------------------------------------- +void LoadTexelsUV(Texture2D tex, SamplerState samp, float oneOverTextureWidth, uint2 threadIDWithinDispatch, out float blockU[16], out float blockV[16]) +{ + float2 uv = TexelToUV(float2(threadIDWithinDispatch * 4), oneOverTextureWidth); + + float4 red = tex.GatherRed(samp, uv, int2(0, 0)); + float4 green = tex.GatherGreen(samp, uv, int2(0, 0)); + blockU[0] = red[3]; blockV[0] = green[3]; + blockU[1] = red[2]; blockV[1] = green[2]; + blockU[4] = red[0]; blockV[4] = green[0]; + blockU[5] = red[1]; blockV[5] = green[1]; + + red = tex.GatherRed(samp, uv, int2(2, 0)); + green = tex.GatherGreen(samp, uv, int2(2, 0)); + blockU[2] = red[3]; blockV[2] = green[3]; + blockU[3] = red[2]; blockV[3] = green[2]; + blockU[6] = red[0]; blockV[6] = green[0]; + blockU[7] = red[1]; blockV[7] = green[1]; + + red = tex.GatherRed(samp, uv, int2(0, 2)); + green = tex.GatherGreen(samp, uv, int2(0, 2)); + blockU[8] = red[3]; blockV[8] = green[3]; + blockU[9] = red[2]; blockV[9] = green[2]; + blockU[12] = red[0]; blockV[12] = green[0]; + blockU[13] = red[1]; blockV[13] = green[1]; + + red = tex.GatherRed(samp, uv, int2(2, 2)); + green = tex.GatherGreen(samp, uv, int2(2, 2)); + blockU[10] = red[3]; blockV[10] = green[3]; + blockU[11] = red[2]; blockV[11] = green[2]; + blockU[14] = red[0]; blockV[14] = green[0]; + blockU[15] = red[1]; blockV[15] = green[1]; +} + + +//-------------------------------------------------------------------------------------- +// Name: LoadTexelsUVBias +// Desc: Load the 16 UV texels that form a block, with a mip bias +//-------------------------------------------------------------------------------------- +void LoadTexelsUVBias(Texture2D tex, SamplerState samp, float oneOverTextureSize, uint2 threadIDWithinDispatch, uint mipBias, out float blockU[16], out float blockV[16]) +{ + // We need to use Sample 
rather than Gather/Load for the Bias functions, because low mips will read outside + // the texture boundary. When reading outside the boundary, Gather/Load return 0, but Sample will clamp + float4 rgba; + float2 location = float2(threadIDWithinDispatch * 4) * oneOverTextureSize; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 0)); blockU[0] = rgba.r; blockV[0] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 0)); blockU[1] = rgba.r; blockV[1] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 0)); blockU[2] = rgba.r; blockV[2] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 0)); blockU[3] = rgba.r; blockV[3] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 1)); blockU[4] = rgba.r; blockV[4] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 1)); blockU[5] = rgba.r; blockV[5] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 1)); blockU[6] = rgba.r; blockV[6] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 1)); blockU[7] = rgba.r; blockV[7] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 2)); blockU[8] = rgba.r; blockV[8] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 2)); blockU[9] = rgba.r; blockV[9] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 2)); blockU[10] = rgba.r; blockV[10] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 2)); blockU[11] = rgba.r; blockV[11] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(0, 3)); blockU[12] = rgba.r; blockV[12] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(1, 3)); blockU[13] = rgba.r; blockV[13] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(2, 3)); blockU[14] = rgba.r; blockV[14] = rgba.g; + rgba = tex.SampleLevel(samp, location, mipBias, int2(3, 3)); blockU[15] = rgba.r; blockV[15] = rgba.g; +} + + 
+//-------------------------------------------------------------------------------------- +// Name: GetMinMaxChannel +// Desc: Get the min and max of a single channel +//-------------------------------------------------------------------------------------- +void GetMinMaxChannel(float block[16], out float minC, out float maxC) +{ + minC = block[0]; + maxC = block[0]; + + for (int i = 1; i < 16; ++i) + { + minC = min(minC, block[i]); + maxC = max(maxC, block[i]); + } +} + + +//-------------------------------------------------------------------------------------- +// Name: GetMinMaxUV +// Desc: Get the min and max of two channels (UV) +//-------------------------------------------------------------------------------------- +void GetMinMaxUV(float blockU[16], float blockV[16], out float minU, out float maxU, out float minV, out float maxV) +{ + minU = blockU[0]; + maxU = blockU[0]; + minV = blockV[0]; + maxV = blockV[0]; + + for (int i = 1; i < 16; ++i) + { + minU = min(minU, blockU[i]); + maxU = max(maxU, blockU[i]); + minV = min(minV, blockV[i]); + maxV = max(maxV, blockV[i]); + } +} + + +//-------------------------------------------------------------------------------------- +// Name: GetMinMaxRGB +// Desc: Get the min and max of three channels (RGB) +//-------------------------------------------------------------------------------------- +void GetMinMaxRGB(float3 colorBlock[16], out float3 minColor, out float3 maxColor) +{ + minColor = colorBlock[0]; + maxColor = colorBlock[0]; + + for (int i = 1; i < 16; ++i) + { + minColor = min(minColor, colorBlock[i]); + maxColor = max(maxColor, colorBlock[i]); + } +} + + +//-------------------------------------------------------------------------------------- +// Name: InsetMinMaxRGB +// Desc: Slightly inset the min and max color values to reduce RMS error. 
+// This is recommended by van Waveren & Castano, "Real-Time YCoCg-DXT Compression" +// http://www.nvidia.com/object/real-time-ycocg-dxt-compression.html +//-------------------------------------------------------------------------------------- +void InsetMinMaxRGB(inout float3 minColor, inout float3 maxColor, float colorScale) +{ + // Since we have four points, (1/16) * (max-min) will give us half the distance between + // two points on the line in color space + float3 offset = (1.0f / 16.0f) * (maxColor - minColor); + + // After applying the offset, we want to round up or down to the next integral color value (0 to 255) + colorScale *= 255.0f; + maxColor = ceil((maxColor - offset) * colorScale) / colorScale; + minColor = floor((minColor + offset) * colorScale) / colorScale; +} + + +//-------------------------------------------------------------------------------------- +// Name: GetIndicesRGB +// Desc: Calculate the BC block indices for each color in the block +//-------------------------------------------------------------------------------------- +uint GetIndicesRGB(float3 block[16], float3 minColor, float3 maxColor) +{ + uint indices = 0; + + // For each input color, we need to select between one of the following output colors: + // 0: maxColor + // 1: (2/3)*maxColor + (1/3)*minColor + // 2: (1/3)*maxColor + (2/3)*minColor + // 3: minColor + // + // We essentially just project (block[i] - maxColor) onto (minColor - maxColor), but we pull out + // a few constant terms. 
+ float3 diag = minColor - maxColor; + float stepInc = 3.0f / dot(diag, diag); // Scale up by 3, because our indices are between 0 and 3 + diag *= stepInc; + float c = stepInc * (dot(maxColor, maxColor) - dot(maxColor, minColor)); + + for (int i = 15; i >= 0; --i) + { + // Compute the index for this block element + uint index = round(dot(block[i], diag) + c); + + // Now we need to convert our index into the somewhat unintuitive BC1 indexing scheme: + // 0: maxColor + // 1: minColor + // 2: (2/3)*maxColor + (1/3)*minColor + // 3: (1/3)*maxColor + (2/3)*minColor + // + // The mapping is: + // 0 -> 0 + // 1 -> 2 + // 2 -> 3 + // 3 -> 1 + // + // We can perform this mapping using bitwise operations, which is faster + // than predication or branching as long as it doesn't increase our register + // count too much. The mapping in binary looks like: + // 00 -> 00 + // 01 -> 10 + // 10 -> 11 + // 11 -> 01 + // + // Splitting it up by bit, the output looks like: + // bit1_out = bit0_in XOR bit1_in + // bit0_out = bit1_in + uint bit0_in = index & 1; + uint bit1_in = index >> 1; + indices |= ((bit0_in^bit1_in) << 1) | bit1_in; + + if (i != 0) + { + indices <<= 2; + } + } + + return indices; +} + + +//-------------------------------------------------------------------------------------- +// Name: GetIndicesAlpha +// Desc: Calculate the BC block indices for an alpha channel +//-------------------------------------------------------------------------------------- +void GetIndicesAlpha(float block[16], float minA, float maxA, inout uint2 packed) +{ + float d = minA - maxA; + float stepInc = 7.0f / d; + + // Both packed.x and packed.y contain index values, so we need two loops + + uint index = 0; + uint shift = 16; + for (int i = 0; i < 6; ++i) + { + // For each input alpha value, we need to select between one of eight output values + // 0: maxA + // 1: (6/7)*maxA + (1/7)*minA + // ... 
+ // 6: (1/7)*maxA + (6/7)*minA + // 7: minA + index = round(stepInc * (block[i] - maxA)); + + // Now we need to convert our index into the BC indexing scheme: + // 0: maxA + // 1: minA + // 2: (6/7)*maxA + (1/7)*minA + // ... + // 7: (1/7)*maxA + (6/7)*minA + index += (index > 0) - (7 * (index == 7)); + + packed.x |= (index << shift); + shift += 3; + } + + // The 6th index straddles the two uints + packed.y |= (index >> 1); + + shift = 2; + for (i = 6; i < 16; ++i) + { + index = round((block[i] - maxA) * stepInc); + index += (index > 0) - (7 * (index == 7)); + + packed.y |= (index << shift); + shift += 3; + } +} + + +//-------------------------------------------------------------------------------------- +// Name: CompressBC1Block +// Desc: Compress a BC1 block. colorScale is a scale value to be applied to the input +// colors; this is used as an optimization when compressing two mips at a time. +// When compressing only a single mip, colorScale is always 1.0 +//-------------------------------------------------------------------------------------- +uint2 CompressBC1Block(float3 block[16], float colorScale = 1.0f) +{ + float3 minColor, maxColor; + GetMinMaxRGB(block, minColor, maxColor); + + // Inset the min and max values + InsetMinMaxRGB(minColor, maxColor, colorScale); + + // Pack our colors into uints + uint minColor565 = ColorTo565(colorScale * minColor); + uint maxColor565 = ColorTo565(colorScale * maxColor); + + uint indices = 0; + if (minColor565 < maxColor565) + { + indices = GetIndicesRGB(block, minColor, maxColor); + } + + return uint2((minColor565 << 16) | maxColor565, indices); +} + + +//-------------------------------------------------------------------------------------- +// Name: CompressBC3Block +// Desc: Compress a BC3 block. valueScale is a scale value to be applied to the input +// values; this is used as an optimization when compressing two mips at a time. 
+// When compressing only a single mip, valueScale is always 1.0 +//-------------------------------------------------------------------------------------- +uint4 CompressBC3Block(float3 blockRGB[16], float blockA[16], float valueScale = 1.0f) +{ + float3 minColor, maxColor; + float minA, maxA; + GetMinMaxRGB(blockRGB, minColor, maxColor); + GetMinMaxChannel(blockA, minA, maxA); + + // Inset the min and max color values. We don't inset the alpha values + // because, while it may reduce the RMS error, it has a tendency to turn + // fully opaque texels partially transparent, which is probably not desirable. + InsetMinMaxRGB(minColor, maxColor, valueScale); + + // Pack our colors and alpha values into uints + uint minColor565 = ColorTo565(valueScale * minColor); + uint maxColor565 = ColorTo565(valueScale * maxColor); + uint minAPacked = round(minA * valueScale * 255.0f); + uint maxAPacked = round(maxA * valueScale * 255.0f); + + uint indices = 0; + if (minColor565 < maxColor565) + { + indices = GetIndicesRGB(blockRGB, minColor, maxColor); + } + + uint2 outA = uint2((minAPacked << 8) | maxAPacked, 0); + if (minAPacked < maxAPacked) + { + GetIndicesAlpha(blockA, minA, maxA, outA); + } + + return uint4(outA.x, outA.y, (minColor565 << 16) | maxColor565, indices); +} + + +//-------------------------------------------------------------------------------------- +// Name: CompressBC5Block +// Desc: Compress a BC5 block. valueScale is a scale value to be applied to the input +// values; this is used as an optimization when compressing two mips at a time. 
+// When compressing only a single mip, valueScale is always 1.0 +//-------------------------------------------------------------------------------------- +uint4 CompressBC5Block(float blockU[16], float blockV[16], float valueScale = 1.0f) +{ + float minU, maxU, minV, maxV; + GetMinMaxUV(blockU, blockV, minU, maxU, minV, maxV); + + // Pack our min and max uv values + uint minUPacked = round(minU * valueScale * 255.0f); + uint maxUPacked = round(maxU * valueScale * 255.0f); + uint minVPacked = round(minV * valueScale * 255.0f); + uint maxVPacked = round(maxV * valueScale * 255.0f); + + uint2 outU = uint2((minUPacked << 8) | maxUPacked, 0); + uint2 outV = uint2((minVPacked << 8) | maxVPacked, 0); + + if (minUPacked < maxUPacked) + { + GetIndicesAlpha(blockU, minU, maxU, outU); + } + + if (minVPacked < maxVPacked) + { + GetIndicesAlpha(blockV, minV, maxV, outV); + } + + return uint4(outU.x, outU.y, outV.x, outV.y); +} + + +//-------------------------------------------------------------------------------------- +// Name: CalcTailMipsParams +// Desc: Calculate parameters used in the "compress tail mips" shaders +//-------------------------------------------------------------------------------------- +void CalcTailMipsParams(uint2 threadIDWithinDispatch, out float oneOverTextureSize, out uint2 blockID, out uint mipBias) +{ + blockID = threadIDWithinDispatch; + mipBias = 0; + oneOverTextureSize = 1; + + // When compressing our tail mips, we only dispatch one 8x8 threadgroup. Different threads + // are selected to compress different mip levels based on the position of the thread in + // the threadgroup. 
+ if (blockID.x < 4) + { + if (blockID.y < 4) + { + // 16x16 mip + oneOverTextureSize = 1.0f / 16.0f; + } + else + { + // 1x1 mip + mipBias = 4; + blockID.y -= 4; + } + } + else if (blockID.x < 6) + { + // 8x8 mip + mipBias = 1; + blockID -= float2(4, 4); + oneOverTextureSize = 1.0f / 8.0f; + } + else if (blockID.x < 7) + { + // 4x4 mip + mipBias = 2; + blockID -= float2(6, 6); + oneOverTextureSize = 1.0f / 4.0f; + } + else if (blockID.x < 8) + { + // 2x2 mip + mipBias = 3; + blockID -= float2(7, 7); + oneOverTextureSize = 1.0f / 2.0f; + } +} diff --git a/WickedEngine/shaders/Shaders_SOURCE.vcxitems b/WickedEngine/shaders/Shaders_SOURCE.vcxitems index 5f274fdb7..cd18bbde7 100644 --- a/WickedEngine/shaders/Shaders_SOURCE.vcxitems +++ b/WickedEngine/shaders/Shaders_SOURCE.vcxitems @@ -15,6 +15,7 @@ + diff --git a/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters b/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters index 9b975b528..6ebc767c4 100644 --- a/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters +++ b/WickedEngine/shaders/Shaders_SOURCE.vcxitems.filters @@ -153,6 +153,9 @@ HF + + HF + diff --git a/WickedEngine/shaders/compressonator/bcn_common_api.h b/WickedEngine/shaders/compressonator/bcn_common_api.h deleted file mode 100644 index 676428664..000000000 --- a/WickedEngine/shaders/compressonator/bcn_common_api.h +++ /dev/null @@ -1,1408 +0,0 @@ -//=============================================================================== -// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files(the "Software"), to deal -// in the Software without restriction, including without limitation the rights to -// use, copy, modify, merge, publish, distribute, sublicense, and / or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions : -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
-// -//=============================================================================== - -#ifndef BCN_COMMON_API_H_ -#define BCN_COMMON_API_H_ - -//=================================================================== -// NOTE: Do not use these API in production code, subject to changes -//=================================================================== - -#ifndef ASPM_GPU -#pragma warning(disable : 4244) -#pragma warning(disable : 4201) -#endif - -#include "common_def.h" - -#define CMP_MAX_16BITFLOAT 65504.0f -#define CMP_FLT_MAX 3.402823466e+38F -#define BC1ConstructColour(r, g, b) (((r) << 11) | ((g) << 5) | (b)) - - -#ifdef ASPM_HLSL -#define fabs(x) abs(x) -#endif - -CMP_STATIC CGU_FLOAT cmp_fabs(CMP_IN CGU_FLOAT x) -{ - return fabs(x); -} - -CMP_STATIC CGU_FLOAT cmp_linearToSrgbf(CMP_IN CGU_FLOAT Color) -{ - if (Color <= 0.0f) - return (0.0f); - if (Color >= 1.0f) - return (1.0f); - // standard : 0.0031308f - if (Color <= 0.00313066844250063) - return (Color * 12.92f); - return (pow(fabs(Color), 1.0f / 2.4f) * 1.055f - 0.055f); -} - -CMP_STATIC CGU_Vec3f cmp_linearToSrgb(CMP_IN CGU_Vec3f Color) -{ - Color.x = cmp_linearToSrgbf(Color.x); - Color.y = cmp_linearToSrgbf(Color.y); - Color.z = cmp_linearToSrgbf(Color.z); - return Color; -} - -CMP_STATIC CGU_FLOAT cmp_srgbToLinearf(CMP_IN CGU_FLOAT Color) -{ - if (Color <= 0.0f) - return (0.0f); - if (Color >= 1.0f) - return (1.0f); - // standard 0.04045f - if (Color <= 0.0404482362771082) - return (Color / 12.92f); - return pow((Color + 0.055f) / 1.055f, 2.4f); -} - -CMP_STATIC CGU_Vec3f cmp_srgbToLinear(CMP_IN CGU_Vec3f Color) -{ - Color.x = cmp_srgbToLinearf(Color.x); - Color.y = cmp_srgbToLinearf(Color.y); - Color.z = cmp_srgbToLinearf(Color.z); - return Color; -} - -CMP_STATIC CGU_Vec3f cmp_565ToLinear(CMP_IN CGU_UINT32 n565) -{ - CGU_UINT32 r0; - CGU_UINT32 g0; - CGU_UINT32 b0; - - r0 = ((n565 & 0xf800) >> 8); - g0 = ((n565 & 0x07e0) >> 3); - b0 = ((n565 & 0x001f) << 3); - - // Apply the lower bit replication 
to give full dynamic range (5,6,5) - r0 += (r0 >> 5); - g0 += (g0 >> 6); - b0 += (b0 >> 5); - - CGU_Vec3f LinearColor; - LinearColor.x = (CGU_FLOAT)r0; - LinearColor.y = (CGU_FLOAT)g0; - LinearColor.z = (CGU_FLOAT)b0; - - return LinearColor; -} - -CMP_STATIC CGU_UINT32 cmp_get2Bit32(CMP_IN CGU_UINT32 value, CMP_IN CGU_UINT32 indexPos) -{ - return (value >> (indexPos * 2)) & 0x3; -} - -CMP_STATIC CGU_UINT32 cmp_set2Bit32(CMP_IN CGU_UINT32 value, CMP_IN CGU_UINT32 indexPos) -{ - return ((value & 0x3) << (indexPos * 2)); -} - -CMP_STATIC CGU_UINT32 cmp_constructColor(CMP_IN CGU_UINT32 R, CMP_IN CGU_UINT32 G, CMP_IN CGU_UINT32 B) -{ - return (((R & 0x000000F8) << 8) | ((G & 0x000000FC) << 3) | ((B & 0x000000F8) >> 3)); -} - -CMP_STATIC CGU_FLOAT cmp_clampf(CMP_IN CGU_FLOAT v, CMP_IN CGU_FLOAT a, CMP_IN CGU_FLOAT b) -{ - if (v < a) - return a; - else if (v > b) - return b; - return v; -} - -CMP_STATIC CGU_Vec3f cmp_clampVec3f(CMP_IN CGU_Vec3f value, CMP_IN CGU_FLOAT minValue, CMP_IN CGU_FLOAT maxValue) -{ -#ifdef ASPM_GPU - return clamp(value, minValue, maxValue); -#else - CGU_Vec3f revalue; - revalue.x = cmp_clampf(value.x, minValue, maxValue); - revalue.y = cmp_clampf(value.y, minValue, maxValue); - revalue.z = cmp_clampf(value.z, minValue, maxValue); - return revalue; -#endif -} - -CMP_STATIC CGU_Vec3f cmp_saturate(CMP_IN CGU_Vec3f value) -{ -#ifdef ASPM_HLSL - return saturate(value); -#else - return cmp_clampVec3f(value, 0.0f, 1.0f); -#endif -} - -static CGU_Vec3f cmp_powVec3f(CGU_Vec3f color, CGU_FLOAT ex) -{ -#ifdef ASPM_GPU - return pow(color, ex); -#else - CGU_Vec3f ColorSrgbPower; - ColorSrgbPower.x = pow(color.x, ex); - ColorSrgbPower.y = pow(color.y, ex); - ColorSrgbPower.z = pow(color.z, ex); - return ColorSrgbPower; -#endif -} - -CMP_STATIC CGU_Vec3f cmp_minVec3f(CMP_IN CGU_Vec3f a, CMP_IN CGU_Vec3f b) -{ -#ifdef ASPM_HLSL - return min(a, b); -#endif - CGU_Vec3f res; - if (a.x < b.x) - res.x = a.x; - else - res.x = b.x; - if (a.y < b.y) - res.y = a.y; - 
else - res.y = b.y; - if (a.z < b.z) - res.z = a.z; - else - res.z = b.z; - return res; -} - -CMP_STATIC CGU_Vec3f cmp_maxVec3f(CMP_IN CGU_Vec3f a, CMP_IN CGU_Vec3f b) -{ -#ifdef ASPM_HLSL - return max(a, b); -#endif - CGU_Vec3f res; - if (a.x > b.x) - res.x = a.x; - else - res.x = b.x; - if (a.y > b.y) - res.y = a.y; - else - res.y = b.y; - if (a.z > b.z) - res.z = a.z; - else - res.z = b.z; - return res; -} - -inline CGU_Vec3f cmp_min3f(CMP_IN CGU_Vec3f value1, CMP_IN CGU_Vec3f value2) -{ -#ifdef ASPM_GPU - return min(value1, value2); -#else - CGU_Vec3f res; - res.x = CMP_MIN(value1.x, value2.x); - res.y = CMP_MIN(value1.y, value2.y); - res.z = CMP_MIN(value1.z, value2.z); - return res; -#endif -} - -inline CGU_Vec3f cmp_max3f(CMP_IN CGU_Vec3f value1, CMP_IN CGU_Vec3f value2) -{ -#ifdef ASPM_GPU - return max(value1, value2); -#else - CGU_Vec3f res; - res.x = CMP_MAX(value1.x, value2.x); - res.y = CMP_MAX(value1.y, value2.y); - res.z = CMP_MAX(value1.z, value2.z); - return res; -#endif -} - -CMP_STATIC CGU_FLOAT cmp_minf(CMP_IN CGU_FLOAT a, CMP_IN CGU_FLOAT b) -{ - return a < b ? a : b; -} - -CMP_STATIC CGU_FLOAT cmp_maxf(CMP_IN CGU_FLOAT a, CMP_IN CGU_FLOAT b) -{ - return a > b ? 
a : b; -} - -CMP_STATIC CGU_FLOAT cmp_floor(CMP_IN CGU_FLOAT value) -{ - return floor(value); -} - -CMP_STATIC CGU_Vec3f cmp_floorVec3f(CMP_IN CGU_Vec3f value) -{ -#ifdef ASPM_GPU - return floor(value); -#else - CGU_Vec3f revalue; - revalue.x = floor(value.x); - revalue.y = floor(value.y); - revalue.z = floor(value.z); - return revalue; -#endif -} - -#ifndef ASPM_OPENCL - -//======================================================= -// COMMON GPU & CPU API -//======================================================= - -//====================== -// implicit vector cast -//====================== -CMP_STATIC CGU_Vec4i cmp_castimp(CGU_Vec4ui v1) -{ -#ifdef ASPM_HLSL - return (v1); -#else - return (v1.x, v1.y, v1.z, v1.w); -#endif -} - -CMP_STATIC CGU_Vec3i cmp_castimp(CGU_Vec3ui v1) -{ -#ifdef ASPM_HLSL - return (v1); -#else - return (v1.x, v1.y, v1.z); -#endif -} - -//====================== -// Min / Max -//====================== - -CMP_STATIC CGU_UINT8 cmp_min8(CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b) -{ - return a < b ? a : b; -} - -CMP_STATIC CGU_UINT8 cmp_max8(CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b) -{ - return a > b ? a : b; -} - -CMP_STATIC CGU_UINT32 cmp_mini(CMP_IN CGU_UINT32 a, CMP_IN CGU_UINT32 b) -{ - return (a < b) ? a : b; -} - -CMP_STATIC CGU_UINT32 cmp_maxi(CMP_IN CGU_UINT32 a, CMP_IN CGU_UINT32 b) -{ - return (a > b) ? 
a : b; -} - -CMP_STATIC CGU_FLOAT cmp_max3(CMP_IN CGU_FLOAT i, CMP_IN CGU_FLOAT j, CMP_IN CGU_FLOAT k) -{ -#ifdef ASPM_GLSL - return max3(i, j, k); -#else - CGU_FLOAT max = i; - - if (max < j) - max = j; - - if (max < k) - max = k; - - return (max); -#endif -} - - -CMP_STATIC CGU_Vec4ui cmp_minVec4ui(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b) -{ - //#ifdef ASPM_HLSL - // return min(a, b); - //#endif - //#ifndef ASPM_GPU - CGU_Vec4ui res; - if (a.x < b.x) - res.x = a.x; - else - res.x = b.x; - if (a.y < b.y) - res.y = a.y; - else - res.y = b.y; - if (a.z < b.z) - res.z = a.z; - else - res.z = b.z; - if (a.w < b.w) - res.w = a.w; - else - res.w = b.w; - return res; - //#endif -} - -CMP_STATIC CGU_Vec4ui cmp_maxVec4ui(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b) -{ - //#ifdef ASPM_HLSL - // return max(a, b); - //#endif - //#ifndef ASPM_GPU - CGU_Vec4ui res; - if (a.x > b.x) - res.x = a.x; - else - res.x = b.x; - if (a.y > b.y) - res.y = a.y; - else - res.y = b.y; - if (a.z > b.z) - res.z = a.z; - else - res.z = b.z; - if (a.w > b.w) - res.w = a.w; - else - res.w = b.w; - return res; - //#endif -} - -//====================== -// Clamps -//====================== - -CMP_STATIC CGU_UINT32 cmp_clampui32(CMP_IN CGU_UINT32 v, CMP_IN CGU_UINT32 a, CMP_IN CGU_UINT32 b) -{ - if (v < a) - return a; - else if (v > b) - return b; - return v; -} - - -// Test Ref:https://en.wikipedia.org/wiki/Half-precision_floating-point_format -// Half (in Hex) Float Comment -// --------------------------------------------------------------------------- -// 0001 (approx) = 0.000000059604645 smallest positive subnormal number -// 03ff (approx) = 0.000060975552 largest subnormal number -// 0400 (approx) = 0.00006103515625 smallest positive normal number -// 7bff (approx) = 65504 largest normal number -// 3bff (approx) = 0.99951172 largest number less than one -// 3c00 (approx) = 1.00097656 smallest number larger than one -// 3555 = 0.33325195 the rounding of 1/3 to nearest -// c000 = ?2 -// 8000 = -0 
-// 0000 = 0 -// 7c00 = infinity -// fc00 = infinity -// Half Float Math - -CMP_STATIC CGU_FLOAT HalfToFloat(CGU_UINT32 h) -{ -#if defined(ASPM_GPU) - CGU_FLOAT f = min16float((float)(h)); - return f; -#else - union FP32 - { - CGU_UINT32 u; - CGU_FLOAT f; - }; - - const FP32 magic = {(254 - 15) << 23}; - const FP32 was_infnan = {(127 + 16) << 23}; - - FP32 o; - o.u = (h & 0x7fff) << 13; // exponent/mantissa bits - o.f *= magic.f; // exponent adjust - if (o.f >= was_infnan.f) // check Inf/NaN - o.u |= 255 << 23; - o.u |= (h & 0x8000) << 16; // sign bit - return o.f; -#endif -} - -// From BC6HEcode.hlsl - -CMP_STATIC CGU_FLOAT cmp_half2float1(CGU_UINT32 Value) -{ - CGU_UINT32 Mantissa = (CGU_UINT32)(Value & 0x03FF); - - CGU_UINT32 Exponent; - if ((Value & 0x7C00) != 0) // The value is normalized - { - Exponent = (CGU_UINT32)((Value >> 10) & 0x1F); - } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x0400) == 0); - - Mantissa &= 0x03FF; - } - else // The value is zero - { - Exponent = (CGU_UINT32)(-112); - } - - CGU_UINT32 Result = ((Value & 0x8000) << 16) | // Sign - ((Exponent + 112) << 23) | // Exponent - (Mantissa << 13); // Mantissa - - return CGU_FLOAT(Result); -} - -CMP_STATIC CGU_Vec3f cmp_half2floatVec3(CGU_Vec3ui color_h) -{ - //uint3 sign = color_h & 0x8000; - //uint3 expo = color_h & 0x7C00; - //uint3 base = color_h & 0x03FF; - //return ( expo == 0 ) ? 
asfloat( ( sign << 16 ) | asuint( float3(base) / 16777216 ) ) //16777216 = 2^24 - // : asfloat( ( sign << 16 ) | ( ( ( expo + 0x1C000 ) | base ) << 13 ) ); //0x1C000 = 0x1FC00 - 0x3C00 - - return CGU_Vec3f(cmp_half2float1(color_h.x), cmp_half2float1(color_h.y), cmp_half2float1(color_h.z)); -} - -CMP_STATIC CGU_UINT16 FloatToHalf(CGU_FLOAT value) -{ -#if defined(ASPM_GPU) - return 0; -#else - union FP32 - { - CGU_UINT16 u; - float f; - struct - { - CGU_UINT32 Mantissa : 23; - CGU_UINT32 Exponent : 8; - CGU_UINT32 Sign : 1; - }; - }; - - union FP16 - { - CGU_UINT16 u; - struct - { - CGU_UINT32 Mantissa : 10; - CGU_UINT32 Exponent : 5; - CGU_UINT32 Sign : 1; - }; - }; - - FP16 o = {0}; - FP32 f; - f.f = value; - - // Based on ISPC reference code (with minor modifications) - if (f.Exponent == 0) // Signed zero/denormal (which will underflow) - o.Exponent = 0; - else if (f.Exponent == 255) // Inf or NaN (all exponent bits set) - { - o.Exponent = 31; - o.Mantissa = f.Mantissa ? 0x200 : 0; // NaN->qNaN and Inf->Inf - } - else // Normalized number - { - // Exponent unbias the single, then bias the halfp - int newexp = f.Exponent - 127 + 15; - if (newexp >= 31) // Overflow, return signed infinity - o.Exponent = 31; - else if (newexp <= 0) // Underflow - { - if ((14 - newexp) <= 24) // Mantissa might be non-zero - { - CGU_UINT32 mant = f.Mantissa | 0x800000; // Hidden 1 bit - o.Mantissa = mant >> (14 - newexp); - if ((mant >> (13 - newexp)) & 1) // Check for rounding - o.u++; // Round, might overflow into exp bit, but this is OK - } - } - else - { - o.Exponent = newexp; - o.Mantissa = f.Mantissa >> 13; - if (f.Mantissa & 0x1000) // Check for rounding - o.u++; // Round, might overflow to inf, this is OK - } - } - - o.Sign = f.Sign; - return o.u; -#endif -} - -CMP_STATIC CGU_UINT32 cmp_float2halfui(CGU_FLOAT f) -{ - CGU_UINT32 Result; - - CGU_UINT32 IValue = CGU_UINT32(f); - CGU_UINT32 Sign = (IValue & 0x80000000U) >> 16U; - IValue = IValue & 0x7FFFFFFFU; - - if (IValue > 
0x47FFEFFFU) - { - // The number is too large to be represented as a half. Saturate to infinity. - Result = 0x7FFFU; - } - else - { - if (IValue < 0x38800000U) - { - // The number is too small to be represented as a normalized half. - // Convert it to a denormalized value. - CGU_UINT32 Shift = 113U - (IValue >> 23U); - IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized half. - IValue += 0xC8000000U; - } - - Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU; - } - return (Result | Sign); -} - -CMP_STATIC CGU_Vec3ui cmp_float2half(CGU_Vec3f endPoint_f) -{ - return CGU_Vec3ui(cmp_float2halfui(endPoint_f.x), cmp_float2halfui(endPoint_f.y), cmp_float2halfui(endPoint_f.z)); -} - -CMP_STATIC CGU_UINT32 cmp_float2half1(CGU_FLOAT f) -{ - CGU_UINT32 Result; - - CGU_UINT32 IValue = CGU_UINT32(f); //asuint(f); - CGU_UINT32 Sign = (IValue & 0x80000000U) >> 16U; - IValue = IValue & 0x7FFFFFFFU; - - if (IValue > 0x47FFEFFFU) - { - // The number is too large to be represented as a half. Saturate to infinity. - Result = 0x7FFFU; - } - else - { - if (IValue < 0x38800000U) - { - // The number is too small to be represented as a normalized half. - // Convert it to a denormalized value. - CGU_UINT32 Shift = 113U - (IValue >> 23U); - IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized half. 
- IValue += 0xC8000000U; - } - - Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU; - } - return (Result | Sign); -} - -CMP_STATIC CGU_Vec3ui cmp_float2halfVec3(CGU_Vec3f endPoint_f) -{ - return CGU_Vec3ui(cmp_float2half1(endPoint_f.x), cmp_float2half1(endPoint_f.y), cmp_float2half1(endPoint_f.z)); -} - -CMP_STATIC CGU_FLOAT cmp_f32tof16(CMP_IN CGU_FLOAT value) -{ -#ifdef ASPM_GLSL - return packHalf2x16(CGU_Vec2f(value.x, 0.0)); -#endif -#ifdef ASPM_HLSL - return f32tof16(value); -#endif -#ifndef ASPM_GPU - return FloatToHalf(value); -#endif -} - -CMP_STATIC CGU_Vec3f cmp_f32tof16(CMP_IN CGU_Vec3f value) -{ -#ifdef ASPM_GLSL - return CGU_Vec3f(packHalf2x16(CGU_Vec2f(value.x, 0.0)), packHalf2x16(CGU_Vec2f(value.y, 0.0)), packHalf2x16(CGU_Vec2f(value.z, 0.0))); -#endif -#ifdef ASPM_HLSL - return f32tof16(value); -#endif -#ifndef ASPM_GPU - CGU_Vec3f res; - res.x = FloatToHalf(value.x); - res.y = FloatToHalf(value.y); - res.z = FloatToHalf(value.z); - return res; -#endif -} - -CMP_STATIC CGU_FLOAT cmp_f16tof32(CGU_UINT32 value) -{ -#ifdef ASPM_GLSL - return unpackHalf2x16(value).x; -#endif -#ifdef ASPM_HLSL - return f16tof32(value); -#endif -#ifndef ASPM_GPU - return HalfToFloat(value); -#endif -} - -CMP_STATIC CGU_Vec3f cmp_f16tof32(CGU_Vec3ui value) -{ -#ifdef ASPM_GLSL - return CGU_Vec3f(unpackHalf2x16(value.x).x, unpackHalf2x16(value.y).x, unpackHalf2x16(value.z).x); -#endif -#ifdef ASPM_HLSL - return f16tof32(value); -#endif -#ifndef ASPM_GPU - CGU_Vec3f res; - res.x = HalfToFloat(value.x); - res.y = HalfToFloat(value.y); - res.z = HalfToFloat(value.z); - return res; -#endif -} - -CMP_STATIC CGU_Vec3f cmp_f16tof32(CGU_Vec3f value) -{ -#ifdef ASPM_GLSL - return CGU_Vec3f(unpackHalf2x16(value.x).x, unpackHalf2x16(value.y).x, unpackHalf2x16(value.z).x); -#endif -#ifdef ASPM_HLSL - return f16tof32(value); -#endif -#ifndef ASPM_GPU - CGU_Vec3f res; - res.x = HalfToFloat((CGU_UINT32)value.x); - res.y = HalfToFloat((CGU_UINT32)value.y); - res.z = 
HalfToFloat((CGU_UINT32)value.z); - return res; -#endif -} - -CMP_STATIC void cmp_swap(CMP_INOUT CGU_Vec3f CMP_REFINOUT a, CMP_INOUT CGU_Vec3f CMP_REFINOUT b) -{ - CGU_Vec3f tmp = a; - a = b; - b = tmp; -} - -CMP_STATIC void cmp_swap(CMP_INOUT CGU_FLOAT CMP_REFINOUT a, CMP_INOUT CGU_FLOAT CMP_REFINOUT b) -{ - CGU_FLOAT tmp = a; - a = b; - b = tmp; -} - -CMP_STATIC void cmp_swap(CMP_INOUT CGU_Vec3i CMP_REFINOUT lhs, CMP_INOUT CGU_Vec3i CMP_REFINOUT rhs) // valided with msc code -{ - CGU_Vec3i tmp = lhs; - lhs = rhs; - rhs = tmp; -} - -CMP_STATIC CGU_INT cmp_dotVec2i(CMP_IN CGU_Vec2i value1, CMP_IN CGU_Vec2i value2) -{ -#ifdef ASPM_GPU - return dot(value1, value2); -#else - return (value1.x * value2.x) + (value1.y * value2.y); -#endif -} - -CMP_STATIC CGU_FLOAT cmp_dotVec3f(CMP_IN CGU_Vec3f value1, CMP_IN CGU_Vec3f value2) -{ -#ifdef ASPM_GPU - return dot(value1, value2); -#else - return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z); -#endif -} - -CMP_STATIC CGU_UINT32 cmp_dotVec3ui(CMP_IN CGU_Vec3ui value1, CMP_IN CGU_Vec3ui value2) -{ -#ifdef ASPM_GPU - return dot(value1, value2); -#else - return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z); -#endif -} - -CMP_STATIC CGU_UINT32 cmp_dotVec4i(CMP_IN CGU_Vec4i value1, CMP_IN CGU_Vec4i value2) -{ -#ifdef ASPM_GPU - return dot(value1, value2); -#else - return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z) + (value1.w * value2.w); -#endif -} - -CMP_STATIC CGU_UINT32 cmp_dotVec4ui(CMP_IN CGU_Vec4ui value1, CMP_IN CGU_Vec4ui value2) -{ -#ifdef ASPM_GPU - return dot(value1, value2); -#else - return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z) + (value1.w * value2.w); -#endif -} - -CMP_STATIC CGU_Vec3f cmp_clampVec3fi(CMP_IN CGU_Vec3f value, CMP_IN CGU_INT minValue, CMP_IN CGU_INT maxValue) -{ -#ifdef ASPM_GPU - return clamp(value, minValue, maxValue); -#else - CGU_Vec3f revalue; - revalue.x = cmp_clampf(value.x, 
(CGU_FLOAT)minValue, (CGU_FLOAT)maxValue); - revalue.y = cmp_clampf(value.y, (CGU_FLOAT)minValue, (CGU_FLOAT)maxValue); - revalue.z = cmp_clampf(value.z, (CGU_FLOAT)minValue, (CGU_FLOAT)maxValue); - return revalue; -#endif -} - -CMP_STATIC CGU_Vec4ui cmp_clampVec4ui(CMP_IN CGU_Vec4ui value, CMP_IN CGU_UINT32 minValue, CMP_IN CGU_UINT32 maxValue) -{ -#ifdef ASPM_GPU - return clamp(value, minValue, maxValue); -#else - CGU_Vec4ui revalue; - revalue.x = cmp_clampui32(value.x, minValue, maxValue); - revalue.y = cmp_clampui32(value.y, minValue, maxValue); - revalue.z = cmp_clampui32(value.z, minValue, maxValue); - revalue.w = cmp_clampui32(value.w, minValue, maxValue); - return revalue; -#endif -} - -CMP_STATIC CGU_Vec4f cmp_clampVec4f(CMP_IN CGU_Vec4f value, CMP_IN CGU_FLOAT minValue, CMP_IN CGU_FLOAT maxValue) -{ -#ifdef ASPM_GPU - return clamp(value, minValue, maxValue); -#else - CGU_Vec4f revalue; - revalue.x = cmp_clampf(value.x, minValue, maxValue); - revalue.y = cmp_clampf(value.y, minValue, maxValue); - revalue.z = cmp_clampf(value.z, minValue, maxValue); - revalue.w = cmp_clampf(value.w, minValue, maxValue); - return revalue; -#endif -} - -CMP_STATIC CGU_Vec3f cmp_clamp3Vec3f(CMP_IN CGU_Vec3f value, CMP_IN CGU_Vec3f minValue, CMP_IN CGU_Vec3f maxValue) -{ -#ifdef ASPM_GPU - return clamp(value, minValue, maxValue); -#else - CGU_Vec3f revalue; - revalue.x = cmp_clampf(value.x, minValue.x, maxValue.x); - revalue.y = cmp_clampf(value.y, minValue.y, maxValue.y); - revalue.z = cmp_clampf(value.z, minValue.z, maxValue.z); - return revalue; -#endif -} - -CMP_STATIC CGU_Vec3f cmp_exp2(CMP_IN CGU_Vec3f value) -{ -#ifdef ASPM_GPU - return exp2(value); -#else - CGU_Vec3f revalue; - revalue.x = exp2(value.x); - revalue.y = exp2(value.y); - revalue.z = exp2(value.z); - return revalue; -#endif -} - - -CMP_STATIC CGU_Vec3f cmp_roundVec3f(CMP_IN CGU_Vec3f value) -{ -#ifdef ASPM_HLSL - return round(value); -#endif -#ifndef ASPM_HLSL - CGU_Vec3f res; - res.x = round(value.x); - 
res.y = round(value.y); - res.z = round(value.z); - return res; -#endif -} - -CMP_STATIC CGU_Vec3f cmp_log2Vec3f(CMP_IN CGU_Vec3f value) -{ -#ifdef ASPM_GPU - return log2(value); -#else - CGU_Vec3f res; - res.x = log2(value.x); - res.y = log2(value.y); - res.z = log2(value.z); - return res; -#endif -} - -// used in BC1 LowQuality code -CMP_STATIC CGU_FLOAT cmp_saturate(CMP_IN CGU_FLOAT value) -{ -#ifdef ASPM_HLSL - return saturate(value); -#else - return cmp_clampf(value, 0.0f, 1.0f); -#endif -} - -CMP_STATIC CGU_FLOAT cmp_rcp(CMP_IN CGU_FLOAT det) -{ -#ifdef ASPM_HLSL - return rcp(det); -#else - if (det > 0.0f) - return (1 / det); - else - return 0.0f; -#endif -} - -CMP_STATIC CGU_UINT32 cmp_Get4BitIndexPos(CMP_IN CGU_FLOAT indexPos, CMP_IN CGU_FLOAT endPoint0Pos, CMP_IN CGU_FLOAT endPoint1Pos) -{ - CGU_FLOAT r = (indexPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos); - return cmp_clampui32(CGU_UINT32(r * 14.93333f + 0.03333f + 0.5f), 0, 15); -} - -// Calculate Mean Square Least Error (MSLE) for 2 Vectors -CMP_STATIC CGU_FLOAT cmp_CalcMSLE(CMP_IN CGU_Vec3f a, CMP_IN CGU_Vec3f b) -{ - CGU_Vec3f err = cmp_log2Vec3f((b + 1.0f) / (a + 1.0f)); - err = err * err; - return err.x + err.y + err.z; -} - - - -// Compute Endpoints (min/max) bounding box -CMP_STATIC void cmp_GetTexelMinMax(CMP_IN CGU_Vec3f texels[16], CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMin, CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMax) -{ - blockMin = texels[0]; - blockMax = texels[0]; - for (CGU_UINT32 i = 1u; i < 16u; ++i) - { - blockMin = cmp_minVec3f(blockMin, texels[i]); - blockMax = cmp_maxVec3f(blockMax, texels[i]); - } -} - -// Refine Endpoints (min/max) by insetting bounding box in log2 RGB space -CMP_STATIC void cmp_RefineMinMaxAsLog2(CMP_IN CGU_Vec3f texels[16], CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMin, CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMax) -{ - CGU_Vec3f refinedBlockMin = blockMax; - CGU_Vec3f refinedBlockMax = blockMin; - for (CGU_UINT32 i = 0u; i < 16u; ++i) - { - refinedBlockMin = 
cmp_minVec3f(refinedBlockMin, texels[i] == blockMin ? refinedBlockMin : texels[i]); - refinedBlockMax = cmp_maxVec3f(refinedBlockMax, texels[i] == blockMax ? refinedBlockMax : texels[i]); - } - - CGU_Vec3f logBlockMax = cmp_log2Vec3f(blockMax + 1.0f); - CGU_Vec3f logBlockMin = cmp_log2Vec3f(blockMin + 1.0f); - - CGU_Vec3f logRefinedBlockMax = cmp_log2Vec3f(refinedBlockMax + 1.0f); - CGU_Vec3f logRefinedBlockMin = cmp_log2Vec3f(refinedBlockMin + 1.0f); - CGU_Vec3f logBlockMaxExt = (logBlockMax - logBlockMin) * (1.0f / 32.0f); - - logBlockMin += cmp_minVec3f(logRefinedBlockMin - logBlockMin, logBlockMaxExt); - logBlockMax -= cmp_minVec3f(logBlockMax - logRefinedBlockMax, logBlockMaxExt); - - blockMin = cmp_exp2(logBlockMin) - 1.0f; - blockMax = cmp_exp2(logBlockMax) - 1.0f; -} - -// Refine Endpoints (min/max) by Least Squares Optimization -CMP_STATIC void cmp_RefineMinMaxAs16BitLeastSquares(CMP_IN CGU_Vec3f texels[16], - CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMin, - CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMax) -{ - CGU_Vec3f blockDir = blockMax - blockMin; - blockDir = blockDir / (blockDir.x + blockDir.y + blockDir.z); - - CGU_FLOAT endPoint0Pos = cmp_f32tof16(cmp_dotVec3f(blockMin, blockDir)); - CGU_FLOAT endPoint1Pos = cmp_f32tof16(cmp_dotVec3f(blockMax, blockDir)); - - CGU_Vec3f alphaTexelSum = 0.0f; - CGU_Vec3f betaTexelSum = 0.0f; - CGU_FLOAT alphaBetaSum = 0.0f; - CGU_FLOAT alphaSqSum = 0.0f; - CGU_FLOAT betaSqSum = 0.0f; - - for (CGU_UINT32 i = 0; i < 16; i++) - { - CGU_FLOAT texelPos = cmp_f32tof16(cmp_dotVec3f(texels[i], blockDir)); - CGU_UINT32 texelIndex = cmp_Get4BitIndexPos(texelPos, endPoint0Pos, endPoint1Pos); - - CGU_FLOAT beta = cmp_saturate(texelIndex / 15.0f); - CGU_FLOAT alpha = 1.0f - beta; - - CGU_Vec3f texelF16; - texelF16.x = cmp_f32tof16(texels[i].x); - texelF16.y = cmp_f32tof16(texels[i].y); - texelF16.z = cmp_f32tof16(texels[i].z); - - alphaTexelSum += texelF16 * alpha; - betaTexelSum += texelF16 * beta; - - alphaBetaSum += alpha * beta; - - 
alphaSqSum += alpha * alpha; - betaSqSum += beta * beta; - } - - CGU_FLOAT det = alphaSqSum * betaSqSum - alphaBetaSum * alphaBetaSum; - - if (abs(det) > 0.00001f) - { - CGU_FLOAT detRcp = cmp_rcp(det); - blockMin = cmp_clampVec3f((alphaTexelSum * betaSqSum - betaTexelSum * alphaBetaSum) * detRcp, 0.0f, CMP_MAX_16BITFLOAT); - blockMax = cmp_clampVec3f((betaTexelSum * alphaSqSum - alphaTexelSum * alphaBetaSum) * detRcp, 0.0f, CMP_MAX_16BITFLOAT); - blockMin = cmp_f16tof32(blockMin); - blockMax = cmp_f16tof32(blockMax); - } -} - -//============================================================================================= - -CMP_STATIC CGU_Vec3f cmp_fabsVec3f(CGU_Vec3f value) -{ -#ifdef ASPM_HLSL - return abs(value); -#else - CGU_Vec3f res; - res.x = abs(value.x); - res.y = abs(value.y); - res.z = abs(value.z); - return res; -#endif -} - - -CMP_STATIC CGU_UINT32 cmp_constructColor(CMP_IN CGU_Vec3ui EndPoints) -{ - return (((EndPoints.r & 0x000000F8) << 8) | ((EndPoints.g & 0x000000FC) << 3) | ((EndPoints.b & 0x000000F8) >> 3)); -} - -CMP_STATIC CGU_UINT32 cmp_constructColorBGR(CMP_IN CGU_Vec3f EndPoints) -{ - return (((CGU_UINT32(EndPoints.b) & 0x000000F8) << 8) | ((CGU_UINT32(EndPoints.g) & 0x000000FC) << 3) | ((CGU_UINT32(EndPoints.r) & 0x000000F8) >> 3)); -} - -CMP_STATIC CGU_FLOAT cmp_mod(CMP_IN CGU_FLOAT value, CMP_IN CGU_FLOAT modval) -{ -#ifdef ASPM_GLSL - return mod(value, modval); -#endif - return fmod(value, modval); -} - -CMP_STATIC CGU_Vec3f cmp_truncVec3f(CMP_IN CGU_Vec3f value) -{ -#ifdef ASPM_HLSL - return trunc(value); -#else - CGU_Vec3f res; - res.x = trunc(value.x); - res.y = trunc(value.y); - res.z = trunc(value.z); - return res; -#endif -} - -CMP_STATIC CGU_Vec3f cmp_ceilVec3f(CMP_IN CGU_Vec3f value) -{ - CGU_Vec3f res; - res.x = ceil(value.x); - res.y = ceil(value.y); - res.z = ceil(value.z); - return res; -} - -CMP_STATIC CGU_FLOAT cmp_sqrt(CGU_FLOAT value) -{ - return sqrt(value); -} - -// Computes inverse square root over an 
implementation-defined range. The maximum error is implementation-defined. -CMP_STATIC CGV_FLOAT cmp_rsqrt(CGV_FLOAT f) -{ - CGV_FLOAT sf = sqrt(f); - if (sf != 0) - return 1 / sqrt(f); - else - return 0.0f; -} - -// Common to BC7 API ------------------------------------------------------------------------------------------------------------------------ - -// valid bit range is 0..8 for mode 1 -CMP_STATIC INLINE CGU_UINT32 cmp_shift_right_uint32(CMP_IN CGU_UINT32 v, CMP_IN CGU_INT bits) -{ - return v >> bits; // (perf warning expected) -} - -CMP_STATIC INLINE CGU_INT cmp_clampi(CMP_IN CGU_INT value, CMP_IN CGU_INT low, CMP_IN CGU_INT high) -{ - if (value < low) - return low; - else if (value > high) - return high; - return value; -} - -CMP_STATIC INLINE CGU_INT32 cmp_clampi32(CMP_IN CGU_INT32 value, CMP_IN CGU_INT32 low, CMP_IN CGU_INT32 high) -{ - if (value < low) - value = low; - else if (value > high) - value = high; - return value; -} - -CMP_STATIC CGV_FLOAT cmp_dot4f(CMP_IN CGV_Vec4f value1, CMP_IN CGV_Vec4f value2) -{ -#ifdef ASPM_GPU - return dot(value1, value2); -#else - return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z) + (value1.w * value2.w); -#endif -} - -CMP_STATIC INLINE void cmp_set_vec4f(CMP_INOUT CGU_Vec4f CMP_REFINOUT pV, CMP_IN CGU_FLOAT x, CMP_IN CGU_FLOAT y, CMP_IN CGU_FLOAT z, CMP_IN CGU_FLOAT w) -{ - pV[0] = x; - pV[1] = y; - pV[2] = z; - pV[3] = w; -} - -CMP_STATIC INLINE void cmp_set_vec4ui(CGU_Vec4ui CMP_REFINOUT pV, CMP_IN CGU_UINT8 x, CMP_IN CGU_UINT8 y, CMP_IN CGU_UINT8 z, CMP_IN CGU_UINT8 w) -{ - pV[0] = x; - pV[1] = y; - pV[2] = z; - pV[3] = w; -} - -CMP_STATIC inline void cmp_set_vec4ui_clamped(CGU_Vec4ui CMP_REFINOUT pRes, CMP_IN CGU_INT32 r, CMP_IN CGU_INT32 g, CMP_IN CGU_INT32 b, CMP_IN CGU_INT32 a) -{ - pRes[0] = (CGU_UINT8)cmp_clampi32(r, 0, 255); - pRes[1] = (CGU_UINT8)cmp_clampi32(g, 0, 255); - pRes[2] = (CGU_UINT8)cmp_clampi32(b, 0, 255); - pRes[3] = (CGU_UINT8)cmp_clampi32(a, 0, 255); -} - 
-CMP_STATIC inline CGU_Vec4f cmp_clampNorm4f(CMP_IN CGU_Vec4f pV) -{ - CGU_Vec4f res; - res[0] = cmp_clampf(pV[0], 0.0f, 1.0f); - res[1] = cmp_clampf(pV[1], 0.0f, 1.0f); - res[2] = cmp_clampf(pV[2], 0.0f, 1.0f); - res[3] = cmp_clampf(pV[3], 0.0f, 1.0f); - return res; -} - -CMP_STATIC INLINE CGU_Vec4f cmp_vec4ui_to_vec4f(CMP_IN CGU_Vec4ui pC) -{ - CGU_Vec4f res; - cmp_set_vec4f(res, (CGU_FLOAT)pC[0], (CGU_FLOAT)pC[1], (CGU_FLOAT)pC[2], (CGU_FLOAT)pC[3]); - return res; -} - -CMP_STATIC INLINE void cmp_normalize(CGU_Vec4f CMP_REFINOUT pV) -{ - CGU_FLOAT s = cmp_dot4f(pV, pV); - if (s != 0.0f) - { - s = 1.0f / cmp_sqrt(s); - pV *= s; - } -} - -CMP_STATIC INLINE CGV_FLOAT cmp_squaref(CMP_IN CGV_FLOAT v) -{ - return v * v; -} - -CMP_STATIC INLINE CGU_INT cmp_squarei(CMP_IN CGU_INT i) -{ - return i * i; -} - -CMP_STATIC CGU_UINT8 cmp_clampui8(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b) -{ - if (v < a) - return a; - else if (v > b) - return b; - return v; -} - -CMP_STATIC CGU_INT32 cmp_abs32(CMP_IN CGU_INT32 v) -{ - CGU_UINT32 msk = v >> 31; - return (v ^ msk) - msk; -} - -CMP_STATIC void cmp_swap32(CMP_INOUT CGU_UINT32 CMP_REFINOUT a, CMP_INOUT CGU_UINT32 CMP_REFINOUT b) -{ - CGU_UINT32 t = a; - a = b; - b = t; -} - -// Computes inverse square root over an implementation-defined range. The maximum error is implementation-defined. 
-CMP_STATIC CGV_FLOAT cmp_Image_rsqrt(CMP_IN CGV_FLOAT f) -{ - CGV_FLOAT sf = sqrt(f); - if (sf != 0) - return 1 / sqrt(f); - else - return 0.0f; -} - -CMP_STATIC void cmp_pack4bitindex32(CMP_INOUT CGU_UINT32 packed_index[2], CMP_IN CGU_UINT32 src_index[16]) -{ - // Converts from unpacked index to packed index - packed_index[0] = 0x0000; - packed_index[1] = 0x0000; - CGU_UINT32 shift = 0; // was CGU_UINT8 - for (CGU_INT k = 0; k < 8; k++) - { - packed_index[0] |= (CGU_UINT32)(src_index[k] & 0x0F) << shift; - packed_index[1] |= (CGU_UINT32)(src_index[k + 8] & 0x0F) << shift; - shift += 4; - } -} - -CMP_STATIC void cmp_pack4bitindex(CMP_INOUT CGU_UINT32 packed_index[2], CMP_IN CGU_UINT8 src_index[16]) -{ - // Converts from unpacked index to packed index - packed_index[0] = 0x0000; - packed_index[1] = 0x0000; - CGU_UINT32 shift = 0; // was CGU_UINT8 - for (CGU_INT k = 0; k < 8; k++) - { - packed_index[0] |= (CGU_UINT32)(src_index[k] & 0x0F) << shift; - packed_index[1] |= (CGU_UINT32)(src_index[k + 8] & 0x0F) << shift; - shift += 4; - } -} - -CMP_STATIC INLINE CGU_INT cmp_expandbits(CMP_IN CGU_INT v, CMP_IN CGU_INT bits) -{ - CGU_INT vv = v << (8 - bits); - return vv + cmp_shift_right_uint32(vv, bits); -} - -// This code need further improvements and investigation -CMP_STATIC INLINE CGU_UINT8 cmp_ep_find_floor2(CMP_IN CGV_FLOAT v, CMP_IN CGU_UINT8 bits, CMP_IN CGU_UINT8 use_par, CMP_IN CGU_UINT8 odd) -{ - CGU_UINT8 i1 = 0; - CGU_UINT8 i2 = 1 << (bits - use_par); - odd = use_par ? odd : 0; - while (i2 - i1 > 1) - { - CGU_UINT8 j = (CGU_UINT8)((i1 + i2) * 0.5f); - CGV_FLOAT ep_d = (CGV_FLOAT)cmp_expandbits((j << use_par) + odd, bits); - if (v >= ep_d) - i1 = j; - else - i2 = j; - } - - return (i1 << use_par) + odd; -} - -CMP_STATIC CGV_FLOAT cmp_absf(CMP_IN CGV_FLOAT a) -{ - return a > 0.0F ? 
a : -a; -} - -CMP_STATIC INLINE CGU_UINT32 cmp_pow2Packed(CMP_IN CGU_INT x) -{ - return 1 << x; -} - -CMP_STATIC INLINE CGU_UINT8 cmp_clampIndex(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b) -{ - if (v < a) - return a; - else if (v > b) - return b; - return v; -} - -CMP_STATIC INLINE CGU_UINT8 shift_right_uint82(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 bits) -{ - return v >> bits; // (perf warning expected) -} - -#endif - -CMP_STATIC CGU_INT cmp_QuantizeToBitSize(CMP_IN CGU_INT value, CMP_IN CGU_INT prec, CMP_IN CGU_BOOL signedfloat16) -{ - if (prec <= 1) - return 0; - CGU_BOOL negvalue = false; - - // move data to use extra bits for processing - CGU_INT ivalue = value; - - if (signedfloat16) - { - if (value < 0) - { - negvalue = true; - value = -value; - } - prec--; - } - else - { - // clamp -ve - if (value < 0) - value = 0; - } - - CGU_INT iQuantized; - CGU_INT bias = (prec > 10 && prec != 16) ? ((1 << (prec - 11)) - 1) : 0; - bias = (prec == 16) ? 15 : bias; - - iQuantized = ((ivalue << prec) + bias) / (0x7bff + 1); // 16 bit Float Max 0x7bff - - return (negvalue ? -iQuantized : iQuantized); -} - -//======================================================= -// CPU GPU Macro API -//======================================================= - -#ifdef ASPM_GPU -#define cmp_min(a, b) min(a, b) -#define cmp_max(a, b) max(a, b) -#else -#ifndef cmp_min -#define cmp_min(a, b) ((a) < (b) ? (a) : (b)) -#endif -#ifndef cmp_max -#define cmp_max(a, b) ((a) > (b) ? 
(a) : (b)) -#endif -#endif - -//======================================================= -// CPU Template API -//======================================================= - -#ifndef ASPM_GPU -#ifndef TEMPLATE_API_INTERFACED -#define TEMPLATE_API_INTERFACED - -template -T clamp(T& v, const T& lo, const T& hi) -{ - if (v < lo) - return lo; - else if (v > hi) - return hi; - return v; -} - -template -Vec4T clamp(Vec4T& v, const T& lo, const T& hi) -{ - Vec4T res = v; - if (v.x < lo) - res.x = lo; - else if (v.x > hi) - res.x = hi; - if (v.y < lo) - res.y = lo; - else if (v.y > hi) - res.y = hi; - if (v.z < lo) - res.z = lo; - else if (v.w > hi) - res.w = hi; - if (v.w < lo) - res.w = lo; - else if (v.z > hi) - res.z = hi; - return res; -} - - -template -T dot(T& v1, T2& v2) -{ - return (v1 * v2); -} - -template -T dot(Vec4T& v1, Vec4T& v2) -{ - return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z + v1.w * v2.w); -} - -template -T dot(Vec4T& v1, Vec4T& v2) -{ - return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z + v1.w * v2.w); -} -template -T dot(Vec3T& v1, Vec3T& v2) -{ - return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z); -} - -template -T dot(Vec3T& v1, Vec3T& v2) -{ - return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z); -} - -#endif // API_INTERFACED -#endif // ASPM_GPU - -#endif // diff --git a/WickedEngine/shaders/compressonator/bcn_common_kernel.h b/WickedEngine/shaders/compressonator/bcn_common_kernel.h deleted file mode 100644 index e6ca1dea9..000000000 --- a/WickedEngine/shaders/compressonator/bcn_common_kernel.h +++ /dev/null @@ -1,2956 +0,0 @@ -//============================================================================= -// Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files(the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions : -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. -// -//===================================================================== -//===================================================================== -// Block-compression (BC) functionality ref -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
-//===================================================================== -//************************************************************************************ -// ** NOTE ** -// Content and data types may change, use CMP_Core.h for interface to your application -//************************************************************************************ - -#ifndef _BCN_COMMON_KERNEL_H -#define _BCN_COMMON_KERNEL_H - -#pragma warning(disable : 4505) // disable warnings on unreferenced local function has been removed - -#include "common_def.h" -#include "bcn_common_api.h" - -//----------------------------------------------------------------------- -// When build is for CPU, we have some missing API calls common to GPU -// Use CPU CMP_Core replacements -//----------------------------------------------------------------------- -// used in BC1 HiQuaity -#if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL) -#define ALIGN_16 -#else -#include INC_cmp_math_func -#if defined(WIN32) || defined(_WIN64) -#define ALIGN_16 __declspec(align(16)) -#else // !WIN32 && !_WIN64 -#define ALIGN_16 -#endif // !WIN32 && !_WIN64 -#endif - - - -#define DXTC_OFFSET_ALPHA 0 -#define DXTC_OFFSET_RGB 2 - -#define BC1CompBlockSize 8 - -#define RC 2 -#define GC 1 -#define BC 0 -#define AC 3 - -/* -Channel Bits -*/ -#define RGBA8888_CHANNEL_A 3 -#define RGBA8888_CHANNEL_R 2 -#define RGBA8888_CHANNEL_G 1 -#define RGBA8888_CHANNEL_B 0 -#define RGBA8888_OFFSET_A (RGBA8888_CHANNEL_A * 8) -#define RGBA8888_OFFSET_R (RGBA8888_CHANNEL_R * 8) -#define RGBA8888_OFFSET_G (RGBA8888_CHANNEL_G * 8) -#define RGBA8888_OFFSET_B (RGBA8888_CHANNEL_B * 8) - -#ifndef MAX_ERROR -#define MAX_ERROR 128000.f -#endif - -#define MAX_BLOCK 64 -#define MAX_POINTS 16 -#define BLOCK_SIZE MAX_BLOCK -#define NUM_CHANNELS 4 -#define NUM_ENDPOINTS 2 -#define BLOCK_SIZE_4X4 16 -#define CMP_ALPHA_RAMP 8 // Number of Ramp Points used for Alpha Channels in BC5 - -#define ConstructColour(r, g, b) (((r) << 11) | ((g) << 5) | 
(b)) // same as BC1ConstructColour in common_api - -#define BYTEPP 4 -#define CMP_QUALITY0 0.25f -#define CMP_QUALITY1 0.50f -#define CMP_QUALITY2 0.75f -#define POS(x, y) (pos_on_axis[(x) + (y)*4]) - -// Find the first approximation of the line -// Assume there is a linear relation -// Z = a * X_In -// Z = b * Y_In -// Find a,b to minimize MSE between Z and Z_In -#define EPS (2.f / 255.f) * (2.f / 255.f) -#define EPS2 3.f * (2.f / 255.f) * (2.f / 255.f) - -// Grid precision -#define PIX_GRID 8 - -#define BYTE_MASK 0x00ff - -#define SCH_STPS 3 // number of search steps to make at each end of interval -static CMP_CONSTANT CGU_FLOAT sMvF[] = {0.f, -1.f, 1.f, -2.f, 2.f, -3.f, 3.f, -4.f, 4.f, -5.f, 5.f, -6.f, 6.f, -7.f, 7.f, -8.f, 8.f}; - -#ifndef GBL_SCH_STEP -#define GBL_SCH_STEP_MXS 0.018f -#define GBL_SCH_EXT_MXS 0.1f -#define LCL_SCH_STEP_MXS 0.6f -#define GBL_SCH_STEP_MXQ 0.0175f -#define GBL_SCH_EXT_MXQ 0.154f -#define LCL_SCH_STEP_MXQ 0.45f - -#define GBL_SCH_STEP GBL_SCH_STEP_MXS -#define GBL_SCH_EXT GBL_SCH_EXT_MXS -#define LCL_SCH_STEP LCL_SCH_STEP_MXS -#endif - -#ifndef ASPM_GPU - -typedef union -{ - struct colorblock64U - { - CGU_UINT8 col0; - CGU_UINT8 col1; - CGU_UINT8 indices[6]; - }; - CGU_INT8 cmp_data8[8]; - CGU_INT32 cmp_data32[2]; - CGU_UINT64 cmp_data64; -} CMP_BLOCK64_UNORM; - -typedef union -{ - struct colorblock64S - { - CGU_UINT8 col0; - CGU_UINT8 col1; - CGU_UINT8 indices[6]; - }; - CGU_INT8 cmp_data8[8]; - CGU_INT32 cmp_data32[2]; - CGU_UINT64 cmp_data64[2]; -} CMP_BLOCK64_SNORM; - -typedef union -{ - CGU_INT8 cmp_data8[16]; - CGU_INT32 cmp_data32[4]; - CGU_UINT64 cmp_data64[2]; -} CMP_BLOCK128_UNORM; - -#endif - -typedef struct -{ - CGU_UINT32 data; - CGU_UINT32 index; -} CMP_di; - -typedef struct -{ - CGU_FLOAT data; - CGU_UINT32 index; -} CMP_df; - -typedef struct -{ - // user setable - CGU_FLOAT m_fquality; - CGU_FLOAT m_fChannelWeights[3]; - CGU_BOOL m_bUseChannelWeighting; - CGU_BOOL m_bUseAdaptiveWeighting; - CGU_BOOL m_bUseFloat; - 
CGU_BOOL m_b3DRefinement; - CGU_BOOL m_bUseAlpha; - CGU_BOOL m_bIsSRGB; // Use Linear to SRGB color conversion used in BC1, default is false - CGU_BOOL m_bIsSNORM; - CGU_BOOL m_sintsrc; // source data pointer is signed data - CGU_UINT32 m_nRefinementSteps; - CGU_UINT32 m_nAlphaThreshold; - CGU_BOOL m_mapDecodeRGBA; - CGU_UINT32 m_src_width; - CGU_UINT32 m_src_height; -} CMP_BC15Options; - -typedef struct -{ - CGU_Vec3i end_point0; - CGU_Vec3i end_point1; - CGU_UINT8 indices[16]; - CGU_BOOL m_3color; -} CMP_BC1_Encode_Results; - -// used in BC1 LowQuality code -typedef struct -{ - CGU_Vec3f Color0; - CGU_Vec3f Color1; -} CMP_EndPoints; - -// Common data info used between encoders -// Defines properties of current 4x4 pixel block -typedef struct -{ - CGU_UINT32 grayscale_flag; - CGU_UINT32 any_black_pixels; - CGU_BOOL all_colors_equal; - CGU_Vec3i min; - CGU_Vec3i max; - CGU_Vec3i total; - CGU_Vec3i avg; -} CMP_EncodeData; - -typedef struct CMP_BC1_Block_t -{ - // Union struct not supported on GPU - // 8 Bytes Total - #ifndef ASPM_GPU - union { - struct { // 2 x 32bit - CGU_UINT32 colors; - CGU_UINT32 indices; - }; - struct { // 8 x 8bit - CGU_UINT8 m_low_color[2]; - CGU_UINT8 m_high_color[2]; - CGU_UINT8 m_selectors[4]; - }; - }; - - inline void set_low_color(CGU_UINT16 c) - { - m_low_color[0] = static_cast(c & 0xFF); - m_low_color[1] = static_cast((c >> 8) & 0xFF); - } - inline void set_high_color(CGU_UINT16 c) - { - m_high_color[0] = static_cast(c & 0xFF); - m_high_color[1] = static_cast((c >> 8) & 0xFF); - } - #else - CGU_UINT32 colors; - CGU_UINT32 indices; - #endif -} CMP_BC1_Block; - -// Helper functions to cut precision of floats -// Prec is a power of 10 value from 1,10,100,...,10000... 
INT MAX power 10 -static CGU_BOOL cmp_compareprecision(CGU_FLOAT f1, CGU_FLOAT f2, CGU_INT Prec) -{ - CGU_INT scale1 = (CGU_INT)(f1 * Prec); - CGU_INT scale2 = (CGU_INT)(f2 * Prec); - return (scale1 == scale2); -} - -// Helper function to compare floats to a set precision -static CGU_FLOAT cmp_getfloatprecision(CGU_FLOAT f1, CGU_INT Prec) -{ - CGU_INT scale1 = (CGU_INT)(f1 * Prec); - return ((CGU_FLOAT)(scale1) / Prec); -} - -static CGU_FLOAT cmp_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex, - const CGU_Vec3f block[16], - CGU_Vec3f minColor, - CGU_Vec3f maxColor, - CGU_BOOL getErr) -{ - CGU_UINT32 PackedIndices = 0; - CGU_FLOAT err = 0.0f; - CGU_Vec3f cn[4]; - CGU_FLOAT minDistance; - - if (getErr) - { - // remap to BC1 spec for decoding offsets, - // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1 - cn[0] = maxColor; - cn[1] = minColor; - cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f; - cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f; - } - - CGU_FLOAT Scale = 3.f / dot(minColor - maxColor, minColor - maxColor); - CGU_Vec3f ScaledRange = (minColor - maxColor) * Scale; - CGU_FLOAT Bias = (dot(maxColor, maxColor) - dot(maxColor, minColor)) * Scale; - CGU_INT indexMap[4] = {0, 2, 3, 1}; // mapping based on BC1 Spec for color0 > color1 - CGU_UINT32 index; - CGU_FLOAT diff; - - for (CGU_UINT32 i = 0; i < 16; i++) - { - // Get offset from base scale - diff = dot(block[i], ScaledRange) + Bias; - index = ((CGU_UINT32)round(diff)) & 0x3; - - // remap linear offset to spec offset - index = indexMap[index]; - - // use err calc for use in higher quality code - if (getErr) - { - minDistance = dot(block[i] - cn[index], block[i] - cn[index]); - err += minDistance; - } - - // Map the 2 bit index into compress 32 bit block - if (index) - PackedIndices |= (index << (2 * i)); - } - - if (getErr) - err = err * 0.0208333f; - - CMP_PTRINOUT cmpindex = PackedIndices; - return err; -} - 
-//---------------------------------------- BCn Common Utility Code ------------------------------------------------------- - -#ifndef ASPM_GPU -static void SetDefaultBC15Options(CMP_BC15Options* BC15Options) -{ - if (BC15Options) - { - BC15Options->m_fquality = 1.0f; - BC15Options->m_bUseChannelWeighting = false; - BC15Options->m_bUseAdaptiveWeighting = false; - BC15Options->m_fChannelWeights[0] = 0.3086f; - BC15Options->m_fChannelWeights[1] = 0.6094f; - BC15Options->m_fChannelWeights[2] = 0.0820f; - BC15Options->m_nAlphaThreshold = 128; - BC15Options->m_bUseFloat = false; - BC15Options->m_b3DRefinement = false; - BC15Options->m_bUseAlpha = false; - BC15Options->m_bIsSNORM = false; - BC15Options->m_bIsSRGB = false; - BC15Options->m_nRefinementSteps = 0; - BC15Options->m_src_width = 4; - BC15Options->m_src_height = 4; -#ifdef CMP_SET_BC13_DECODER_RGBA - BC15Options->m_mapDecodeRGBA = true; -#else - BC15Options->m_mapDecodeRGBA = false; -#endif - } -} -#endif - -static CMP_BC15Options CalculateColourWeightings(CGU_Vec4f rgbaBlock[BLOCK_SIZE_4X4], CMP_BC15Options BC15options) -{ - CGU_FLOAT fBaseChannelWeights[3] = {0.3086f, 0.6094f, 0.0820f}; - - if (!BC15options.m_bUseChannelWeighting) - { - BC15options.m_fChannelWeights[0] = 1.0F; - BC15options.m_fChannelWeights[1] = 1.0F; - BC15options.m_fChannelWeights[2] = 1.0F; - return BC15options; - } - - if (BC15options.m_bUseAdaptiveWeighting) - { - float medianR = 0.0f, medianG = 0.0f, medianB = 0.0f; - - for (CGU_UINT32 k = 0; k < BLOCK_SIZE_4X4; k++) - { - medianR += rgbaBlock[k].x; - medianG += rgbaBlock[k].y; - medianB += rgbaBlock[k].z; - } - - medianR /= BLOCK_SIZE_4X4; - medianG /= BLOCK_SIZE_4X4; - medianB /= BLOCK_SIZE_4X4; - - // Now skew the colour weightings based on the gravity center of the block - float largest = max(max(medianR, medianG), medianB); - - if (largest > 0) - { - medianR /= largest; - medianG /= largest; - medianB /= largest; - } - else - medianR = medianG = medianB = 1.0f; - - // Scale 
weightings back up to 1.0f - CGU_FLOAT fWeightScale = 1.0f / (fBaseChannelWeights[0] + fBaseChannelWeights[1] + fBaseChannelWeights[2]); - - BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0] * fWeightScale; - BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1] * fWeightScale; - BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2] * fWeightScale; - - BC15options.m_fChannelWeights[0] = ((BC15options.m_fChannelWeights[0] * 3 * medianR) + BC15options.m_fChannelWeights[0]) * 0.25f; - BC15options.m_fChannelWeights[1] = ((BC15options.m_fChannelWeights[1] * 3 * medianG) + BC15options.m_fChannelWeights[1]) * 0.25f; - BC15options.m_fChannelWeights[2] = ((BC15options.m_fChannelWeights[2] * 3 * medianB) + BC15options.m_fChannelWeights[2]) * 0.25f; - - fWeightScale = 1.0f / (BC15options.m_fChannelWeights[0] + BC15options.m_fChannelWeights[1] + BC15options.m_fChannelWeights[2]); - - BC15options.m_fChannelWeights[0] *= fWeightScale; - BC15options.m_fChannelWeights[1] *= fWeightScale; - BC15options.m_fChannelWeights[2] *= fWeightScale; - } - else - { - BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0]; - BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1]; - BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2]; - } - - return BC15options; -} - -static CMP_BC15Options CalculateColourWeightings3f(CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], CMP_BC15Options BC15options) -{ - CGU_FLOAT fBaseChannelWeights[3] = {0.3086f, 0.6094f, 0.0820f}; - - if (!BC15options.m_bUseChannelWeighting) - { - BC15options.m_fChannelWeights[0] = 1.0F; - BC15options.m_fChannelWeights[1] = 1.0F; - BC15options.m_fChannelWeights[2] = 1.0F; - return BC15options; - } - - if (BC15options.m_bUseAdaptiveWeighting) - { - float medianR = 0.0f, medianG = 0.0f, medianB = 0.0f; - - for (CGU_UINT32 k = 0; k < BLOCK_SIZE_4X4; k++) - { - medianR += rgbBlock[k].x; - medianG += rgbBlock[k].y; - medianB += rgbBlock[k].z; - } - - medianR /= BLOCK_SIZE_4X4; - medianG /= BLOCK_SIZE_4X4; - medianB /= 
BLOCK_SIZE_4X4; - - // Now skew the colour weightings based on the gravity center of the block - float largest = max(max(medianR, medianG), medianB); - - if (largest > 0) - { - medianR /= largest; - medianG /= largest; - medianB /= largest; - } - else - medianR = medianG = medianB = 1.0f; - - // Scale weightings back up to 1.0f - CGU_FLOAT fWeightScale = 1.0f / (fBaseChannelWeights[0] + fBaseChannelWeights[1] + fBaseChannelWeights[2]); - - BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0] * fWeightScale; - BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1] * fWeightScale; - BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2] * fWeightScale; - - BC15options.m_fChannelWeights[0] = ((BC15options.m_fChannelWeights[0] * 3 * medianR) + BC15options.m_fChannelWeights[0]) * 0.25f; - BC15options.m_fChannelWeights[1] = ((BC15options.m_fChannelWeights[1] * 3 * medianG) + BC15options.m_fChannelWeights[1]) * 0.25f; - BC15options.m_fChannelWeights[2] = ((BC15options.m_fChannelWeights[2] * 3 * medianB) + BC15options.m_fChannelWeights[2]) * 0.25f; - - fWeightScale = 1.0f / (BC15options.m_fChannelWeights[0] + BC15options.m_fChannelWeights[1] + BC15options.m_fChannelWeights[2]); - - BC15options.m_fChannelWeights[0] *= fWeightScale; - BC15options.m_fChannelWeights[1] *= fWeightScale; - BC15options.m_fChannelWeights[2] *= fWeightScale; - } - else - { - BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0]; - BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1]; - BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2]; - } - - return BC15options; -} - -static CGU_FLOAT cmp_getRampErr(CGU_FLOAT Prj[BLOCK_SIZE_4X4], - CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], - CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], - CGU_FLOAT StepErr, - CGU_FLOAT lowPosStep, - CGU_FLOAT highPosStep, - CGU_UINT32 dwUniqueColors) -{ - CGU_FLOAT error = 0; - CGU_FLOAT step = (highPosStep - lowPosStep) / 3; // using (dwNumChannels=4 - 1); - CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; - CGU_FLOAT rstep = 
(CGU_FLOAT)1.0f / step; - - for (CGU_UINT32 i = 0; i < dwUniqueColors; i++) - { - CGU_FLOAT v; - // Work out which value in the block this select - CGU_FLOAT del; - - if ((del = Prj[i] - lowPosStep) <= 0) - v = lowPosStep; - else if (Prj[i] - highPosStep >= 0) - v = highPosStep; - else - v = floor((del + step_h) * rstep) * step + lowPosStep; - - // And accumulate the error - CGU_FLOAT d = (Prj[i] - v); - d *= d; - CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; - error += err; - if (StepErr < error) - { - error = StepErr; - break; - } - } - return error; -} - -static CGU_Vec2ui cmp_compressExplicitAlphaBlock(const CGU_FLOAT AlphaBlockUV[16]) -{ - CGU_Vec2ui compBlock = {0, 0}; - CGU_UINT8 i; - for (i = 0; i < 16; i++) - { - CGU_UINT8 v = (CGU_UINT8)(AlphaBlockUV[i] * 255.0F); - v = (v + 7 - (v >> 4)); - v >>= 4; - - if (v < 0) - v = 0; - else if (v > 0xf) - v = 0xf; - - if (i < 8) - compBlock.x |= v << (4 * i); - else - compBlock.y |= v << (4 * (i - 8)); - } - return compBlock; -} - -static CGU_FLOAT cmp_getRampError(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], - CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], - CGU_FLOAT _maxerror, - CGU_FLOAT _min_ex, - CGU_FLOAT _max_ex, - CGU_INT _NmbrClrs) -{ // Max 16 - CGU_INT i; - CGU_FLOAT error = 0; - const CGU_FLOAT step = (_max_ex - _min_ex) / 7; // (CGU_FLOAT)(dwNumPoints - 1); - const CGU_FLOAT step_h = step * 0.5f; - const CGU_FLOAT rstep = 1.0f / step; - - for (i = 0; i < _NmbrClrs; i++) - { - CGU_FLOAT v; - // Work out which value in the block this select - CGU_FLOAT del; - - if ((del = _Blk[i] - _min_ex) <= 0) - v = _min_ex; - else if (_Blk[i] - _max_ex >= 0) - v = _max_ex; - else - v = (floor((del + step_h) * rstep) * step) + _min_ex; - - // And accumulate the error - CGU_FLOAT del2 = (_Blk[i] - v); - error += del2 * del2 * _Rpt[i]; - - // if we've already lost to the previous step bail out - if (_maxerror < error) - { - error = _maxerror; - break; - } - } - return error; -} - -static CGU_FLOAT cmp_linearBlockRefine(CGU_FLOAT 
_Blk[BLOCK_SIZE_4X4], - CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], - CGU_FLOAT _MaxError, - CMP_INOUT CGU_FLOAT CMP_PTRINOUT _min_ex, - CMP_INOUT CGU_FLOAT CMP_PTRINOUT _max_ex, - CGU_FLOAT _m_step, - CGU_FLOAT _min_bnd, - CGU_FLOAT _max_bnd, - CGU_INT _NmbrClrs) -{ - // Start out assuming our endpoints are the min and max values we've - // determined - - // Attempt a (simple) progressive refinement step to reduce noise in the - // output image by trying to find a better overall match for the endpoints. - - CGU_FLOAT maxerror = _MaxError; - CGU_FLOAT min_ex = CMP_PTRINOUT _min_ex; - CGU_FLOAT max_ex = CMP_PTRINOUT _max_ex; - - CGU_INT mode, bestmode; - - do - { - CGU_FLOAT cr_min0 = min_ex; - CGU_FLOAT cr_max0 = max_ex; - for (bestmode = -1, mode = 0; mode < SCH_STPS * SCH_STPS; mode++) - { - // check each move (see sStep for direction) - CGU_FLOAT cr_min = min_ex + _m_step * sMvF[mode / SCH_STPS]; - CGU_FLOAT cr_max = max_ex + _m_step * sMvF[mode % SCH_STPS]; - - cr_min = max(cr_min, _min_bnd); - cr_max = min(cr_max, _max_bnd); - - CGU_FLOAT error; - error = cmp_getRampError(_Blk, _Rpt, maxerror, cr_min, cr_max, _NmbrClrs); - - if (error < maxerror) - { - maxerror = error; - bestmode = mode; - cr_min0 = cr_min; - cr_max0 = cr_max; - } - } - - if (bestmode != -1) - { - // make move (see sStep for direction) - min_ex = cr_min0; - max_ex = cr_max0; - } - } while (bestmode != -1); - - CMP_PTRINOUT _min_ex = min_ex; - CMP_PTRINOUT _max_ex = max_ex; - - return maxerror; -} - -static CGU_Vec2f cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned) -{ - CGU_UINT32 i; - CGU_Vec2f cmpMinMax; - - //================================================================ - // Bounding Box - // lowest quality calculation to get min and max value to use - //================================================================ - if (fquality < CMP_QUALITY2) - { - cmpMinMax.x = _Blk[0]; - cmpMinMax.y = _Blk[0]; - for (i = 1; i < BLOCK_SIZE_4X4; ++i) - 
{ - cmpMinMax.x = min(cmpMinMax.x, _Blk[i]); - cmpMinMax.y = max(cmpMinMax.y, _Blk[i]); - } - return cmpMinMax; - } - - //================================================================ - // Do more calculations to get the best min and max values to use - //================================================================ - CGU_FLOAT Ramp[2]; - - // Result defaults for SNORM or UNORM - Ramp[0] = isSigned ? -1.0f : 0.0f; - Ramp[1] = 1.0f; - - ALIGN_16 CGU_FLOAT afUniqueValues[BLOCK_SIZE_4X4]; - ALIGN_16 CGU_FLOAT afValueRepeats[BLOCK_SIZE_4X4]; - for (i = 0; i < BLOCK_SIZE_4X4; i++) - afUniqueValues[i] = afValueRepeats[i] = 0.f; - - // For each unique value we compute the number of it appearances. - CGU_FLOAT fBlk[BLOCK_SIZE_4X4]; - -// sort the input -#ifndef ASPM_GPU - memcpy(fBlk, _Blk, BLOCK_SIZE_4X4 * sizeof(CGU_FLOAT)); - qsort((void*)fBlk, (size_t)BLOCK_SIZE_4X4, sizeof(CGU_FLOAT), QSortFCmp); -#else - CGU_UINT32 j; - - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - fBlk[i] = _Blk[i]; - } - - CMP_df what[BLOCK_SIZE]; - - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - what[i].index = i; - what[i].data = fBlk[i]; - } - - CGU_UINT32 tmp_index; - CGU_FLOAT tmp_data; - - for (i = 1; i < BLOCK_SIZE_4X4; i++) - { - for (j = i; j > 0; j--) - { - if (what[j - 1].data > what[j].data) - { - tmp_index = what[j].index; - tmp_data = what[j].data; - what[j].index = what[j - 1].index; - what[j].data = what[j - 1].data; - what[j - 1].index = tmp_index; - what[j - 1].data = tmp_data; - } - } - } - - for (i = 0; i < BLOCK_SIZE_4X4; i++) - fBlk[i] = what[i].data; -#endif - - CGU_FLOAT new_p = -2.0f; - - CGU_UINT32 dwUniqueValues = 0; - afUniqueValues[0] = 0.0f; - CGU_BOOL requiresCalculation = true; - - { - // Ramp not fixed - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - if (new_p != fBlk[i]) - { - afUniqueValues[dwUniqueValues] = new_p = fBlk[i]; - afValueRepeats[dwUniqueValues] = 1.f; - dwUniqueValues++; - } - else if (dwUniqueValues) - afValueRepeats[dwUniqueValues - 1] += 1.f; - } - - 
// if number of unique colors is less or eq 2, we've done - if (dwUniqueValues <= 2) - { - Ramp[0] = floor(afUniqueValues[0] * 255.0f + 0.5f); - if (dwUniqueValues == 1) - Ramp[1] = Ramp[0] + 1.f; - else - Ramp[1] = floor(afUniqueValues[1] * 255.0f + 0.5f); - requiresCalculation = false; - } - } // Ramp not fixed - - if (requiresCalculation) - { - CGU_FLOAT min_ex = afUniqueValues[0]; - CGU_FLOAT max_ex = afUniqueValues[dwUniqueValues - 1]; - CGU_FLOAT min_bnd = 0, max_bnd = 1.; - CGU_FLOAT min_r = min_ex, max_r = max_ex; - CGU_FLOAT gbl_l = 0, gbl_r = 0; - CGU_FLOAT cntr = (min_r + max_r) / 2; - - CGU_FLOAT gbl_err = MAX_ERROR; - // Trying to avoid unnecessary calculations. Heuristics: after some analisis - // it appears that in integer case, if the input interval not more then 48 - // we won't get much better - bool wantsSearch = !((max_ex - min_ex) <= (48.f / 256.0f)); - - if (wantsSearch) - { - // Search. - // 1. take the vicinities of both low and high bound of the input - // interval. - // 2. setup some search step - // 3. find the new low and high bound which provides an (sub) optimal - // (infinite precision) clusterization. - CGU_FLOAT gbl_llb = (min_bnd > min_r - GBL_SCH_EXT) ? min_bnd : min_r - GBL_SCH_EXT; - CGU_FLOAT gbl_rrb = (max_bnd < max_r + GBL_SCH_EXT) ? max_bnd : max_r + GBL_SCH_EXT; - CGU_FLOAT gbl_lrb = (cntr < min_r + GBL_SCH_EXT) ? cntr : min_r + GBL_SCH_EXT; - CGU_FLOAT gbl_rlb = (cntr > max_r - GBL_SCH_EXT) ? cntr : max_r - GBL_SCH_EXT; - - for (CGU_FLOAT step_l = gbl_llb; step_l < gbl_lrb; step_l += GBL_SCH_STEP) - { - for (CGU_FLOAT step_r = gbl_rrb; gbl_rlb <= step_r; step_r -= GBL_SCH_STEP) - { - CGU_FLOAT sch_err; - // an sse version is avaiable - sch_err = cmp_getRampError(afUniqueValues, afValueRepeats, gbl_err, step_l, step_r, dwUniqueValues); - if (sch_err < gbl_err) - { - gbl_err = sch_err; - gbl_l = step_l; - gbl_r = step_r; - } - } - } - - min_r = gbl_l; - max_r = gbl_r; - } // want search - - // This is a refinement call. 
The function tries to make several small - // stretches or squashes to minimize quantization error. - CGU_FLOAT m_step = LCL_SCH_STEP / 256.0f; - cmp_linearBlockRefine(afUniqueValues, afValueRepeats, gbl_err, CMP_REFINOUT min_r, CMP_REFINOUT max_r, m_step, min_bnd, max_bnd, dwUniqueValues); - - min_ex = min_r; - max_ex = max_r; - max_ex *= 255.0f; - min_ex *= 255.0f; - - Ramp[0] = floor(min_ex + 0.5f); - Ramp[1] = floor(max_ex + 0.5f); - } - - // Ensure that the two endpoints are not the same - // This is legal but serves no need & can break some optimizations in the compressor - if (Ramp[0] == Ramp[1]) - { - if (Ramp[1] < 255.f) - Ramp[1] = Ramp[1] + 1.0f; - else if (Ramp[1] > 0.0f) - Ramp[1] = Ramp[1] - 1.0f; - } - - cmpMinMax.x = Ramp[0]; - cmpMinMax.y = Ramp[1]; - - return cmpMinMax; -} - -static CGU_Vec2ui cmp_getBlockPackedIndices(CGU_Vec2f RampMinMax, CGU_FLOAT alphaBlock[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality) -{ - CGU_UINT32 i; - CGU_UINT32 j; - CGU_Vec2ui cmpBlock = {0, 0}; - CGU_UINT32 MinRampU; - CGU_UINT32 MaxRampU; - CGU_INT32 pcIndices[BLOCK_SIZE_4X4]; - - if (fquality < CMP_QUALITY2) - { - CGU_FLOAT Range; - CGU_FLOAT RampSteps; // segments into 0..7 sections - CGU_FLOAT Bias; - - if (RampMinMax.x != RampMinMax.y) - Range = RampMinMax.x - RampMinMax.y; - else - Range = 1.0f; - - RampSteps = 7.f / Range; // segments into 0..7 sections - Bias = -RampSteps * RampMinMax.y; - - for (i = 0; i < 16; ++i) - { - pcIndices[i] = (CGU_UINT32)round(alphaBlock[i] * RampSteps + Bias); - if (i < 5) - { - pcIndices[i] += (pcIndices[i] > 0) - (7 * (pcIndices[i] == 7)); - } - else if (i > 5) - { - pcIndices[i] += (pcIndices[i] > 0) - (7 * (pcIndices[i] == 7 ? 
1 : 0)); - } - else - { - pcIndices[i] += (pcIndices[i] > 0) - (7 * (pcIndices[i] == 7)); - } - } - - MinRampU = (CGU_UINT32)round(RampMinMax.x * 255.0f); - MaxRampU = (CGU_UINT32)round(RampMinMax.y * 255.0f); - - cmpBlock.x = (MinRampU << 8) | MaxRampU; - cmpBlock.y = 0; - for (i = 0; i < 5; ++i) - { - cmpBlock.x |= (pcIndices[i] << (16 + (i * 3))); - } - { - cmpBlock.x |= (pcIndices[5] << 31); - cmpBlock.y |= (pcIndices[5] >> 1); - } - for (i = 6; i < BLOCK_SIZE_4X4; ++i) - { - cmpBlock.y |= (pcIndices[i] << (i * 3 - 16)); - } - } - else - { - CGU_UINT32 epoint; - CGU_FLOAT alpha[BLOCK_SIZE_4X4]; - CGU_FLOAT OverIntFctr; - CGU_FLOAT shortest; - CGU_FLOAT adist; - - for (i = 0; i < BLOCK_SIZE_4X4; i++) - pcIndices[i] = 0; - - for (i = 0; i < MAX_POINTS; i++) - alpha[i] = 0; - - // GetRmp1 - { - if (RampMinMax.x <= RampMinMax.y) - { - CGU_FLOAT t = RampMinMax.x; - RampMinMax.x = RampMinMax.y; - RampMinMax.y = t; - } - - //============================= - // final clusterization applied - //============================= - CGU_FLOAT ramp[NUM_ENDPOINTS]; - - ramp[0] = RampMinMax.x; - ramp[1] = RampMinMax.y; - - { - // BldRmp1 - alpha[0] = ramp[0]; - alpha[1] = ramp[1]; - for (epoint = 1; epoint < CMP_ALPHA_RAMP - 1; epoint++) - alpha[epoint + 1] = (alpha[0] * (CMP_ALPHA_RAMP - 1 - epoint) + alpha[1] * epoint) / (CGU_FLOAT)(CMP_ALPHA_RAMP - 1); - for (epoint = CMP_ALPHA_RAMP; epoint < BLOCK_SIZE_4X4; epoint++) - alpha[epoint] = 100000.f; - } // BldRmp1 - - // FixedRamp - for (i = 0; i < CMP_ALPHA_RAMP; i++) - { - alpha[i] = floor(alpha[i] + 0.5f); - } - } // GetRmp1 - - OverIntFctr = 1.f / 255.0f; - for (i = 0; i < CMP_ALPHA_RAMP; i++) - alpha[i] *= OverIntFctr; - - // For each colour in the original block, calculate its weighted - // distance from each point in the original and assign it - // to the closest cluster - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - shortest = 10000000.f; - for (j = 0; j < CMP_ALPHA_RAMP; j++) - { - adist = (alphaBlock[i] - alpha[j]); - adist 
*= adist; - if (adist < shortest) - { - shortest = adist; - pcIndices[i] = j; - } - } - } - - //================================================== - // EncodeAlphaBlock - //================================================== - MinRampU = (CGU_UINT32)RampMinMax.x; - MaxRampU = (CGU_UINT32)RampMinMax.y; - - cmpBlock.x = (MaxRampU << 8) | MinRampU; - cmpBlock.y = 0; - for (i = 0; i < 5; i++) - { - cmpBlock.x |= (pcIndices[i]) << (16 + (i * 3)); - } - { - cmpBlock.x |= (pcIndices[5] & 0x1) << 31; - cmpBlock.y |= (pcIndices[5] & 0x6) >> 1; - } - for (i = 6; i < BLOCK_SIZE_4X4; i++) - { - cmpBlock.y |= (pcIndices[i]) << (i * 3 - 16); - } - } - return cmpBlock; -} - -//======================= SNORM CODE ================================== - -static CGU_INT8 cmp_snormFloatToSInt(CGU_FLOAT fsnorm) -{ - if (isnan(fsnorm)) - fsnorm = 0; - else if (fsnorm > 1) - fsnorm = 1; // Clamp to 1 - else if (fsnorm < -1) - fsnorm = -1; // Clamp to -1 - - fsnorm = fsnorm * 127U; - - // shift round up or down - if (fsnorm >= 0) - fsnorm += .5f; - else - fsnorm -= .5f; - -#ifdef ASPM_GPU - CGU_INT8 res = (CGU_INT8)fsnorm; -#else - CGU_INT8 res = static_cast(fsnorm); -#endif - return (res); -} - -static CGU_Vec2f cmp_optimizeEndPoints(CGU_FLOAT pPoints[BLOCK_SIZE_4X4], CGU_INT8 cSteps, CGU_BOOL isSigned) -{ - CGU_Vec2f fendpoints; - CGU_FLOAT MAX_VALUE = 1.0f; - CGU_FLOAT MIN_VALUE = isSigned ? 
-1.0f : 0.0f; - - // Find Min and Max points, as starting point - CGU_FLOAT fX = MAX_VALUE; - CGU_FLOAT fY = MIN_VALUE; - - if (8 == cSteps) - { - for (CGU_INT8 iPoint = 0; iPoint < BLOCK_SIZE_4X4; iPoint++) - { - if (pPoints[iPoint] < fX) - fX = pPoints[iPoint]; - - if (pPoints[iPoint] > fY) - fY = pPoints[iPoint]; - } - } - else - { - for (CGU_INT8 iPoint = 0; iPoint < BLOCK_SIZE_4X4; iPoint++) - { - if (pPoints[iPoint] < fX && pPoints[iPoint] > MIN_VALUE) - fX = pPoints[iPoint]; - - if (pPoints[iPoint] > fY && pPoints[iPoint] < MAX_VALUE) - fY = pPoints[iPoint]; - } - - if (fX == fY) - { - fY = MAX_VALUE; - } - } - - //=================== - // Use Newton Method - //=================== -#ifdef ASPM_GPU - CGU_FLOAT cStepsDiv = (CGU_FLOAT)(cSteps - 1); -#else - CGU_FLOAT cStepsDiv = static_cast(cSteps - 1); -#endif - CGU_FLOAT pSteps[8]; - CGU_FLOAT fc; - CGU_FLOAT fd; - - for (CGU_INT8 iIteration = 0; iIteration < 8; iIteration++) - { - // reach minimum threashold break - if ((fY - fX) < (1.0f / 256.0f)) - break; - - CGU_FLOAT fScale = cStepsDiv / (fY - fX); - - // Calculate new steps - for (CGU_INT8 iStep = 0; iStep < cSteps; iStep++) - { - fc = (cStepsDiv - (CGU_FLOAT)iStep) / cStepsDiv; - fd = (CGU_FLOAT)iStep / cStepsDiv; - pSteps[iStep] = fc * fX + fd * fY; - } - - if (6 == cSteps) - { - pSteps[6] = MIN_VALUE; - pSteps[7] = MAX_VALUE; - } - - // Evaluate function, and derivatives - CGU_FLOAT dX = 0.0f; - CGU_FLOAT dY = 0.0f; - CGU_FLOAT d2X = 0.0f; - CGU_FLOAT d2Y = 0.0f; - - for (CGU_INT8 iPoint = 0; iPoint < BLOCK_SIZE_4X4; iPoint++) - { - CGU_FLOAT fDot = (pPoints[iPoint] - fX) * fScale; - - CGU_INT8 iStep; - if (fDot <= 0.0f) - { - iStep = ((6 == cSteps) && (pPoints[iPoint] <= (fX + MIN_VALUE) * 0.5f)) ? 6u : 0u; - } - else if (fDot >= cStepsDiv) - { - iStep = ((6 == cSteps) && (pPoints[iPoint] >= (fY + MAX_VALUE) * 0.5f)) ? 
7u : (cSteps - 1); - } - else - { - iStep = (CGU_UINT32)(fDot + 0.5f); - } - - // steps to improve quality - if (iStep < cSteps) - { - fc = (cStepsDiv - (CGU_FLOAT)iStep) / cStepsDiv; - fd = (CGU_FLOAT)iStep / cStepsDiv; - CGU_FLOAT fDiff = pSteps[iStep] - pPoints[iPoint]; - dX += fc * fDiff; - d2X += fc * fc; - dY += fd * fDiff; - d2Y += fd * fd; - } - } - - // Move endpoints - if (d2X > 0.0f) - fX -= dX / d2X; - - if (d2Y > 0.0f) - fY -= dY / d2Y; - - if (fX > fY) - { - float f = fX; - fX = fY; - fY = f; - } - - if ((dX * dX < (1.0f / 64.0f)) && (dY * dY < (1.0f / 64.0f))) - break; - } - - fendpoints.x = (fX < MIN_VALUE) ? MIN_VALUE : (fX > MAX_VALUE) ? MAX_VALUE : fX; - fendpoints.y = (fY < MIN_VALUE) ? MIN_VALUE : (fY > MAX_VALUE) ? MAX_VALUE : fY; - - return fendpoints; -} - -static CGU_Vec2i cmp_findEndpointsAlphaBlockSnorm(CGU_FLOAT alphaBlockSnorm[BLOCK_SIZE_4X4]) -{ - //================================================================ - // Bounding Box - // lowest quality calculation to get min and max value to use - //================================================================ - CGU_Vec2f cmpMinMax; - cmpMinMax.x = alphaBlockSnorm[0]; - cmpMinMax.y = alphaBlockSnorm[0]; - - for (CGU_UINT8 i = 0; i < BLOCK_SIZE_4X4; ++i) - { - if (alphaBlockSnorm[i] < cmpMinMax.x) - { - cmpMinMax.x = alphaBlockSnorm[i]; - } - else if (alphaBlockSnorm[i] > cmpMinMax.y) - { - cmpMinMax.y = alphaBlockSnorm[i]; - } - } - - CGU_Vec2i endpoints; - CGU_Vec2f fendpoints; - - // Are we done for lowest quality setting! 
- // CGU_FLOAT fquality = 1.0f; - // - // if (fquality < CMP_QUALITY2) { - // endpoints.x = (CGU_INT8)(cmpMinMax.x); - // endpoints.y = (CGU_INT8)(cmpMinMax.y); - // return endpoints; - // } - - //================================================================ - // Do more calculations to get the best min and max values to use - //================================================================ - if ((-1.0f == cmpMinMax.x || 1.0f == cmpMinMax.y)) - { - fendpoints = cmp_optimizeEndPoints(alphaBlockSnorm, 6, true); - endpoints.x = cmp_snormFloatToSInt(fendpoints.x); - endpoints.y = cmp_snormFloatToSInt(fendpoints.y); - } - else - { - fendpoints = cmp_optimizeEndPoints(alphaBlockSnorm, 8, true); - endpoints.x = cmp_snormFloatToSInt(fendpoints.y); - endpoints.y = cmp_snormFloatToSInt(fendpoints.x); - } - - return endpoints; -} - -#ifndef ASPM_HLSL -static CGU_UINT64 cmp_getBlockPackedIndicesSNorm(CGU_Vec2f alphaMinMax, CGU_FLOAT alphaBlockSnorm[BLOCK_SIZE_4X4], CGU_UINT64 data) -{ - CGU_FLOAT alpha[8]; - alpha[0] = alphaMinMax.x; - alpha[1] = alphaMinMax.y; - - if (alphaMinMax.x > alphaMinMax.y) - { - // 8-alpha block: derive the other six alphas. - // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. - alpha[2] = (alpha[0] * 6.0f + alpha[1]) / 7.0f; - alpha[3] = (alpha[0] * 5.0f + alpha[1] * 2.0f) / 7.0f; - alpha[4] = (alpha[0] * 4.0f + alpha[1] * 3.0f) / 7.0f; - alpha[5] = (alpha[0] * 3.0f + alpha[1] * 4.0f) / 7.0f; - alpha[6] = (alpha[0] * 2.0f + alpha[1] * 5.0f) / 7.0f; - alpha[7] = (alpha[0] + alpha[1] * 6.0f) / 7.0f; - } - else - { - // 6-alpha block. - // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. 
- alpha[2] = (alpha[0] * 4.0f + alpha[1]) / 5.0f; - alpha[3] = (alpha[0] * 3.0f + alpha[1] * 2.0f) / 5.0f; - alpha[4] = (alpha[0] * 2.0f + alpha[1] * 3.0f) / 5.0f; - alpha[5] = (alpha[0] + alpha[1] * 4.0f) / 5.0f; - alpha[6] = -1.0f; - alpha[7] = 1.0f; - } - - // Index all colors using best alpha value - for (CGU_UINT8 i = 0; i < BLOCK_SIZE_4X4; ++i) - { - CGU_UINT8 uBestIndex = 0; - CGU_FLOAT fBestDelta = CMP_FLOAT_MAX; - for (CGU_INT32 uIndex = 0; uIndex < 8; uIndex++) - { - CGU_FLOAT fCurrentDelta = fabs(alpha[uIndex] - alphaBlockSnorm[i]); - if (fCurrentDelta < fBestDelta) - { - uBestIndex = uIndex; - fBestDelta = fCurrentDelta; - } - } - - data &= ~((CGU_UINT64)(0x07) << (3 * i + 16)); - data |= ((CGU_UINT64)(uBestIndex) << (3 * i + 16)); - } - - return data; -} -#endif -static void cmp_getCompressedAlphaRampS(CGU_INT8 alpha[8], const CGU_UINT32 compressedBlock[2]) -{ - alpha[0] = (CGU_INT8)(compressedBlock[0] & 0xff); - alpha[1] = (CGU_INT8)((compressedBlock[0] >> 8) & 0xff); - - if (alpha[0] > alpha[1]) - { - // 8-alpha block: derive the other six alphas. - // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. 
-#ifdef ASPM_GPU - alpha[2] = (CGU_UINT8)((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010 - alpha[3] = (CGU_UINT8)((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011 - alpha[4] = (CGU_UINT8)((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100 - alpha[5] = (CGU_UINT8)((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101 - alpha[6] = (CGU_UINT8)((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110 - alpha[7] = (CGU_UINT8)((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111 -#else - alpha[2] = static_cast((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010 - alpha[3] = static_cast((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011 - alpha[4] = static_cast((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100 - alpha[5] = static_cast((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101 - alpha[6] = static_cast((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110 - alpha[7] = static_cast((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111 -#endif - } - else - { - // 6-alpha block. - // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. 
-#ifdef ASPM_GPU - alpha[2] = (CGU_UINT8)((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010 - alpha[3] = (CGU_UINT8)((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011 - alpha[4] = (CGU_UINT8)((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100 - alpha[5] = (CGU_UINT8)((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101 -#else - alpha[2] = static_cast((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010 - alpha[3] = static_cast((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011 - alpha[4] = static_cast((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100 - alpha[5] = static_cast((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101 -#endif - alpha[6] = -128; // Bit code 110 - alpha[7] = 127; // Bit code 111 - } -} - -static void cmp_decompressAlphaBlockS(CGU_INT8 alphaBlock[BLOCK_SIZE_4X4], const CGU_UINT32 compressedBlock[2]) -{ - CGU_UINT32 i; - CGU_INT8 alpha[8]; - cmp_getCompressedAlphaRampS(alpha, compressedBlock); - - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - CGU_UINT32 index; - if (i < 5) - index = (compressedBlock[0] & (0x7 << (16 + (i * 3)))) >> (16 + (i * 3)); - else if (i > 5) - index = (compressedBlock[1] & (0x7 << (2 + (i - 6) * 3))) >> (2 + (i - 6) * 3); - else - { - index = (compressedBlock[0] & 0x80000000) >> 31; - index |= (compressedBlock[1] & 0x3) << 1; - } - - alphaBlock[i] = alpha[index]; - } -} - -//============================================================================= - -// Processes Alpha Channel either as Unsigned Norm (0..1) or (Signed Norm -1..1) -static CGU_Vec2ui cmp_compressAlphaBlock(CMP_IN CGU_FLOAT alphaBlock[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned) -{ - CGU_Vec2ui CmpBlock; - - if (isSigned) - { -#ifndef ASPM_HLSL - union - { - CGU_INT32 compressedBlock[2]; - struct - { - CGU_INT8 red_0; - CGU_INT8 red_1; - CGU_UINT8 indices[6]; - }; - CGU_UINT64 data; - } BC4_Snorm_block; - -#ifndef ASPM_GPU - BC4_Snorm_block.data = 0LL; -#else - BC4_Snorm_block.data = 0; -#endif 
- - CGU_Vec2i reds; - reds = cmp_findEndpointsAlphaBlockSnorm(alphaBlock); - - BC4_Snorm_block.red_0 = reds.x & 0xFF; - BC4_Snorm_block.red_1 = reds.y & 0xFF; - - // check low end boundaries - if (BC4_Snorm_block.red_0 == -128) - BC4_Snorm_block.red_0 = -127; - if (BC4_Snorm_block.red_1 == -128) - BC4_Snorm_block.red_1 = -127; - - // Normalize signed int -128..127 to float -1..1 - CGU_Vec2f alphaMinMax; - alphaMinMax.x = (CGU_FLOAT)(BC4_Snorm_block.red_0) / 127.0f; - alphaMinMax.y = (CGU_FLOAT)(BC4_Snorm_block.red_1) / 127.0f; - - BC4_Snorm_block.data = cmp_getBlockPackedIndicesSNorm(alphaMinMax, alphaBlock, BC4_Snorm_block.data); - CmpBlock.x = BC4_Snorm_block.compressedBlock[0]; - CmpBlock.y = BC4_Snorm_block.compressedBlock[1]; -#else - CGU_Vec2f RampMinMax; - RampMinMax = cmp_getLinearEndPoints(alphaBlock, fquality, false); // revert code to remove the false param - CmpBlock = cmp_getBlockPackedIndices(RampMinMax, alphaBlock, fquality); -#endif - } - else - { - CGU_Vec2f RampMinMax; - RampMinMax = cmp_getLinearEndPoints(alphaBlock, fquality, false); // revert code to remove the false param - CmpBlock = cmp_getBlockPackedIndices(RampMinMax, alphaBlock, fquality); - } - - return CmpBlock; -} - -static void cmp_getCompressedAlphaRamp(CGU_UINT8 alpha[8], const CGU_UINT32 compressedBlock[2]) -{ - alpha[0] = (CGU_UINT8)(compressedBlock[0] & 0xff); - alpha[1] = (CGU_UINT8)((compressedBlock[0] >> 8) & 0xff); - - if (alpha[0] > alpha[1]) - { - // 8-alpha block: derive the other six alphas. - // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. 
-#ifdef ASPM_GPU - alpha[2] = (CGU_UINT8)((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010 - alpha[3] = (CGU_UINT8)((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011 - alpha[4] = (CGU_UINT8)((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100 - alpha[5] = (CGU_UINT8)((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101 - alpha[6] = (CGU_UINT8)((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110 - alpha[7] = (CGU_UINT8)((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111 -#else - alpha[2] = static_cast((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010 - alpha[3] = static_cast((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011 - alpha[4] = static_cast((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100 - alpha[5] = static_cast((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101 - alpha[6] = static_cast((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110 - alpha[7] = static_cast((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111 -#endif - } - else - { - // 6-alpha block. - // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. 
-#ifdef ASPM_GPU - alpha[2] = (CGU_UINT8)((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010 - alpha[3] = (CGU_UINT8)((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011 - alpha[4] = (CGU_UINT8)((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100 - alpha[5] = (CGU_UINT8)((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101 -#else - alpha[2] = static_cast((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010 - alpha[3] = static_cast((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011 - alpha[4] = static_cast((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100 - alpha[5] = static_cast((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101 -#endif - alpha[6] = 0; // Bit code 110 - alpha[7] = 255; // Bit code 111 - } -} - -static void cmp_decompressAlphaBlock(CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4], const CGU_UINT32 compressedBlock[2]) -{ - CGU_UINT32 i; - CGU_UINT8 alpha[8]; - cmp_getCompressedAlphaRamp(alpha, compressedBlock); - - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - CGU_UINT32 index; - if (i < 5) - index = (compressedBlock[0] & (0x7 << (16 + (i * 3)))) >> (16 + (i * 3)); - else if (i > 5) - index = (compressedBlock[1] & (0x7 << (2 + (i - 6) * 3))) >> (2 + (i - 6) * 3); - else - { - index = (compressedBlock[0] & 0x80000000) >> 31; - index |= (compressedBlock[1] & 0x3) << 1; - } - - alphaBlock[i] = alpha[index]; - } -} - -static void cmp_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMin, - CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMax, - CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0, - CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1, - CGU_INT setopt, - CGU_BOOL isSRGB) -{ - // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31}; - // CGU_UINT32 sgMap[64] = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45, - // 46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63}; - CGU_INT32 x, y, z; - CGU_Vec3f 
scale = {31.0f, 63.0f, 31.0f}; - CGU_Vec3f MinColorScaled; - CGU_Vec3f MaxColorScaled; - - // Clamp or Transform is needed, the transforms have built in clamps - if (isSRGB) - { - MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin); - MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax); - } - else - { - MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f); - MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f); - } - - switch (setopt) - { - case 0: // Use Min Max processing - MinColorScaled = floor(MinColorScaled * scale); - MaxColorScaled = ceil(MaxColorScaled * scale); - CMP_PTRINOUT colorMin = MinColorScaled / scale; - CMP_PTRINOUT colorMax = MaxColorScaled / scale; - break; - default: // Use round processing - MinColorScaled = round(MinColorScaled * scale); - MaxColorScaled = round(MaxColorScaled * scale); - break; - } - - x = (CGU_UINT32)(MinColorScaled.x); - y = (CGU_UINT32)(MinColorScaled.y); - z = (CGU_UINT32)(MinColorScaled.z); - - //if (isSRGB) { - // // scale RB - // x = srbMap[x]; // &0x1F]; - // y = sgMap [y]; // &0x3F]; - // z = srbMap[z]; // &0x1F]; - // // scale G - //} - CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z; - - x = (CGU_UINT32)(MaxColorScaled.x); - y = (CGU_UINT32)(MaxColorScaled.y); - z = (CGU_UINT32)(MaxColorScaled.z); - CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z; -} - -#ifndef ASPM_GPU // Used by BC1, BC2 & BC3 -//---------------------------------------------------- -// This function decompresses a DXT colour block -// The block is decompressed to 8 bits per channel -// Result buffer is RGBA format, A is set to 255 -//---------------------------------------------------- -static void cmp_decompressDXTRGBA_Internal(CGU_UINT8 rgbBlock[BLOCK_SIZE_4X4X4], const CGU_Vec2ui compressedBlock, const CGU_BOOL mapDecodeRGBA) -{ - CGU_BOOL bDXT1 = TRUE; - CGU_UINT32 n0 = compressedBlock.x & 0xffff; - CGU_UINT32 n1 = compressedBlock.x >> 16; - CGU_UINT32 r0; - CGU_UINT32 g0; - CGU_UINT32 b0; - CGU_UINT32 r1; - 
CGU_UINT32 g1; - CGU_UINT32 b1; - - r0 = ((n0 & 0xf800) >> 8); - g0 = ((n0 & 0x07e0) >> 3); - b0 = ((n0 & 0x001f) << 3); - - r1 = ((n1 & 0xf800) >> 8); - g1 = ((n1 & 0x07e0) >> 3); - b1 = ((n1 & 0x001f) << 3); - - // Apply the lower bit replication to give full dynamic range - r0 += (r0 >> 5); - r1 += (r1 >> 5); - g0 += (g0 >> 6); - g1 += (g1 >> 6); - b0 += (b0 >> 5); - b1 += (b1 >> 5); - - if (!mapDecodeRGBA) - { - //-------------------------------------------------------------- - // Channel mapping output as BGRA - //-------------------------------------------------------------- - CGU_UINT32 c0 = 0xff000000 | (r0 << 16) | (g0 << 8) | b0; - CGU_UINT32 c1 = 0xff000000 | (r1 << 16) | (g1 << 8) | b1; - - if (!bDXT1 || n0 > n1) - { - CGU_UINT32 c2 = 0xff000000 | (((2 * r0 + r1) / 3) << 16) | (((2 * g0 + g1) / 3) << 8) | (((2 * b0 + b1) / 3)); - CGU_UINT32 c3 = 0xff000000 | (((2 * r1 + r0) / 3) << 16) | (((2 * g1 + g0) / 3) << 8) | (((2 * b1 + b0) / 3)); - - for (int i = 0; i < 16; i++) - { - int index = (compressedBlock.y >> (2 * i)) & 3; - - switch (index) - { - case 0: - ((CGU_UINT32*)rgbBlock)[i] = c0; - break; - case 1: - ((CGU_UINT32*)rgbBlock)[i] = c1; - break; - case 2: - ((CGU_UINT32*)rgbBlock)[i] = c2; - break; - case 3: - ((CGU_UINT32*)rgbBlock)[i] = c3; - break; - } - } - } - else - { - // Transparent decode - CGU_UINT32 c2 = 0xff000000 | (((r0 + r1) / 2) << 16) | (((g0 + g1) / 2) << 8) | (((b0 + b1) / 2)); - - for (int i = 0; i < 16; i++) - { - int index = (compressedBlock.y >> (2 * i)) & 3; - - switch (index) - { - case 0: - ((CGU_UINT32*)rgbBlock)[i] = c0; - break; - case 1: - ((CGU_UINT32*)rgbBlock)[i] = c1; - break; - case 2: - ((CGU_UINT32*)rgbBlock)[i] = c2; - break; - case 3: - ((CGU_UINT32*)rgbBlock)[i] = 0x00000000; - break; - } - } - } - } - else - { - // MAP_BC15_TO_ABGR - //-------------------------------------------------------------- - // Channel mapping output as RGBA - //-------------------------------------------------------------- - - 
CGU_UINT32 c0 = 0xff000000 | (b0 << 16) | (g0 << 8) | r0; - CGU_UINT32 c1 = 0xff000000 | (b1 << 16) | (g1 << 8) | r1; - - if (!bDXT1 || n0 > n1) - { - CGU_UINT32 c2 = 0xff000000 | (((2 * b0 + b1 + 1) / 3) << 16) | (((2 * g0 + g1 + 1) / 3) << 8) | (((2 * r0 + r1 + 1) / 3)); - CGU_UINT32 c3 = 0xff000000 | (((2 * b1 + b0 + 1) / 3) << 16) | (((2 * g1 + g0 + 1) / 3) << 8) | (((2 * r1 + r0 + 1) / 3)); - - for (int i = 0; i < 16; i++) - { - int index = (compressedBlock.y >> (2 * i)) & 3; - switch (index) - { - case 0: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c0; - break; - case 1: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c1; - break; - case 2: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c2; - break; - case 3: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c3; - break; - } - } - } - else - { - // Transparent decode - CGU_UINT32 c2 = 0xff000000 | (((b0 + b1) / 2) << 16) | (((g0 + g1) / 2) << 8) | (((r0 + r1) / 2)); - - for (int i = 0; i < 16; i++) - { - int index = (compressedBlock.y >> (2 * i)) & 3; - switch (index) - { - case 0: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c0; - break; - case 1: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c1; - break; - case 2: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c2; - break; - case 3: - ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = 0x00000000; - break; - } - } - } - } //MAP_ABGR -} -#endif // !ASPM_GPU - -//-------------------------------------------------------------------------------------------------------- -// Decompress is RGB (0.0f..255.0f) -//-------------------------------------------------------------------------------------------------------- -static void cmp_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock) -{ - CGU_UINT32 n0 = compressedBlock.x & 0xffff; - CGU_UINT32 n1 = compressedBlock.x >> 16; - CGU_UINT32 index; - - //------------------------------------------------------- - // Decode the compressed block 0..255 color range - //------------------------------------------------------- 
- CGU_Vec3f c0 = cmp_565ToLinear(n0); // max color - CGU_Vec3f c1 = cmp_565ToLinear(n1); // min color - CGU_Vec3f c2; - CGU_Vec3f c3; - - if (n0 > n1) - { - c2 = (c0 * 2.0f + c1) / 3.0f; - c3 = (c1 * 2.0f + c0) / 3.0f; - - for (CGU_UINT32 i = 0; i < 16; i++) - { - index = (compressedBlock.y >> (2 * i)) & 3; - switch (index) - { - case 0: - rgbBlock[i] = c0; - break; - case 1: - rgbBlock[i] = c1; - break; - case 2: - rgbBlock[i] = c2; - break; - case 3: - rgbBlock[i] = c3; - break; - } - } - } - else - { - // Transparent decode - c2 = (c0 + c1) / 2.0f; - - for (CGU_UINT32 i = 0; i < 16; i++) - { - index = (compressedBlock.y >> (2 * i)) & 3; - switch (index) - { - case 0: - rgbBlock[i] = c0; - break; - case 1: - rgbBlock[i] = c1; - break; - case 2: - rgbBlock[i] = c2; - break; - case 3: - rgbBlock[i] = 0.0f; - break; - } - } - } -} - -// The source is 0..1, decompressed data using cmp_decompressRGBBlock is 0..255 which is converted down to 0..1 -static float CMP_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB) -{ - CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4]; - - // Decompressed block channels are 0..255 - cmp_decompressRGBBlock(rgbBlock, compressedBlock); - - //------------------------------------------------------------------ - // Calculate MSE of the block - // Note : pow is used as Float type for the code to be usable on CPU - //------------------------------------------------------------------ - CGU_Vec3f serr; - serr = 0.0f; - - float sR, sG, sB, R, G, B; - - for (int j = 0; j < 16; j++) - { - if (isSRGB) - { - sR = round(cmp_linearToSrgbf(src_rgbBlock[j].x) * 255.0f); - sG = round(cmp_linearToSrgbf(src_rgbBlock[j].y) * 255.0f); - sB = round(cmp_linearToSrgbf(src_rgbBlock[j].z) * 255.0f); - } - else - { - sR = round(src_rgbBlock[j].x * 255.0f); - sG = round(src_rgbBlock[j].y * 255.0f); - sB = round(src_rgbBlock[j].z * 255.0f); - } - - rgbBlock[j] = rgbBlock[j]; - - R = rgbBlock[j].x; - G = rgbBlock[j].y; - B 
= rgbBlock[j].z; - - // Norm colors - serr.x += pow(sR - R, 2.0f); - serr.y += pow(sG - G, 2.0f); - serr.z += pow(sB - B, 2.0f); - } - - // MSE for 16 texels - return (serr.x + serr.y + serr.z) / 48.0f; -} - -// Processing input source 0..1.0f) -static CGU_Vec2ui CompressRGBBlock_FM(const CGU_Vec3f rgbBlockUVf[16], CMP_IN CGU_FLOAT fquality, CGU_BOOL isSRGB, CMP_INOUT CGU_FLOAT CMP_PTRINOUT errout) -{ - CGU_Vec3f axisVectorRGB = {0.0f, 0.0f, 0.0f}; // The axis vector for index projection - CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis - CGU_FLOAT axisleft = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis - CGU_FLOAT axisright = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis - CGU_FLOAT axiscentre = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis - CGU_INT32 swap = 0; // Indicator if the RGB values need swapping to generate an opaque result - CGU_Vec3f average_rgb; // The centrepoint of the axis - CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered - CGU_Vec3f srcBlock[16]; // The list of source colors with any color space transforms and clipping - CGU_Vec3f rgb; - CGU_UINT32 c0 = 0; - CGU_UINT32 c1 = 0; - CGU_Vec2ui compressedBlock = {0, 0}; - CGU_FLOAT Q1CompErr = CMP_FLT_MAX; - CGU_Vec2ui Q1CompData = {0,0}; - - // ------------------------------------------------------------------------------------- - // (1) Find the array of unique pixel values and sum them to find their average position - // ------------------------------------------------------------------------------------- - { - CGU_FLOAT errLQ = 0.0f; - CGU_BOOL fastProcess = (fquality <= CMP_QUALITY1); - CGU_Vec3f srcMin = 1.0f; // Min source color - CGU_Vec3f srcMax = 0.0f; // Max source color - CGU_Vec2ui Q1compressedBlock = {0, 0}; - - average_rgb = 0.0f; - // Get average and modifed src - // find 
average position and save list of pixels as 0F..255F range for processing - // Note: z (blue) is average of blue+green channels - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - srcMin = cmp_minVec3f(srcMin, rgbBlockUVf[i]); - srcMax = cmp_maxVec3f(srcMax, rgbBlockUVf[i]); - if (!fastProcess) - { - rgb = isSRGB ? cmp_linearToSrgb(rgbBlockUVf[i]) : cmp_saturate(rgbBlockUVf[i]); - rgb.z = (rgb.y + rgb.z) * 0.5F; // Z-axiz => (R+G)/2 - srcRGB[i] = rgb; - average_rgb = average_rgb + rgb; - } - } - - // Process two colors for saving in 565 format as C0 and C1 - cmp_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 1 : 0, isSRGB); - - // Save simple min-max encoding - CGU_UINT32 index = 0; - if (c0 < c1) - { - Q1CompData.x = (c0 << 16) | c1; - errLQ = cmp_getIndicesRGB(CMP_REFINOUT index, rgbBlockUVf, srcMin, srcMax, false); - Q1CompData.y = index; - CMP_PTRINOUT errout = errLQ; - } - else - { - // Most simple case all colors are equal or 0.0f - Q1compressedBlock.x = (c1 << 16) | c0; - Q1compressedBlock.y = 0; - CMP_PTRINOUT errout = 0.0f; - return Q1compressedBlock; - } - - if (fastProcess) - return Q1CompData; - - // 0.0625F is (1/BLOCK_SIZE_4X4) - average_rgb = average_rgb * 0.0625F; - } - - // ------------------------------------------------------------------------------------- - // (4) For each component, reflect points about the average so all lie on the same side - // of the average, and compute the new average - this gives a second point that defines the axis - // To compute the sign of the axis sum the positive differences of G for each of R and B (the - // G axis is always positive in this implementation - // ------------------------------------------------------------------------------------- - // An interesting situation occurs if the G axis contains no information, in which case the RB - // axis is also compared. 
I am not entirely sure if this is the correct implementation - should - // the priority axis be determined by magnitude? - { - CGU_FLOAT rg_pos = 0.0f; - CGU_FLOAT bg_pos = 0.0f; - CGU_FLOAT rb_pos = 0.0f; - - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - rgb = srcRGB[i] - average_rgb; - axisVectorRGB = axisVectorRGB + fabs(rgb); - if (rgb.x > 0) - { - rg_pos += rgb.y; - rb_pos += rgb.z; - } - if (rgb.z > 0) - bg_pos += rgb.y; - } - - // Average over BLOCK_SIZE_4X4 - axisVectorRGB = axisVectorRGB * 0.0625F; - - // New average position - if (rg_pos < 0) - axisVectorRGB.x = -axisVectorRGB.x; - if (bg_pos < 0) - axisVectorRGB.z = -axisVectorRGB.z; - if ((rg_pos == bg_pos) && (rg_pos == 0)) - { - if (rb_pos < 0) - axisVectorRGB.z = -axisVectorRGB.z; - } - } - - // ------------------------------------------------------------------------------------- - // (5) Axis projection and remapping - // ------------------------------------------------------------------------------------- - { - CGU_FLOAT v2_recip; - // Normalize the axis for simplicity of future calculation - v2_recip = dot(axisVectorRGB, axisVectorRGB); - if (v2_recip > 0) - v2_recip = 1.0f / (CGU_FLOAT)sqrt(v2_recip); - else - v2_recip = 1.0f; - axisVectorRGB = axisVectorRGB * v2_recip; - } - - // ------------------------------------------------------------------------------------- - // (6) Map the axis - // ------------------------------------------------------------------------------------- - // the line joining (and extended on either side of) average and axis - // defines the axis onto which the points will be projected - // Project all the points onto the axis, calculate the distance along - // the axis from the centre of the axis (average) - // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is - // P + ((R-P).v) / (v.v))v - // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector. 
- // - // Calculate the extremities at the same time - these need to be reasonably accurately - // represented in all cases - { - axisleft = CMP_FLOAT_MAX; - axisright = -CMP_FLOAT_MAX; - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - // Compute the distance along the axis of the point of closest approach - CGU_Vec3f temp = (srcRGB[i] - average_rgb); - pos_on_axis[i] = dot(temp, axisVectorRGB); - - // Work out the extremities - if (pos_on_axis[i] < axisleft) - axisleft = pos_on_axis[i]; - if (pos_on_axis[i] > axisright) - axisright = pos_on_axis[i]; - } - } - - // --------------------------------------------------------------------------------------------- - // (7) Now we have a good axis and the basic information about how the points are mapped to it - // Our initial guess is to represent the endpoints accurately, by moving the average - // to the centre and recalculating the point positions along the line - // --------------------------------------------------------------------------------------------- - { - axiscentre = (axisleft + axisright) * 0.5F; - average_rgb = average_rgb + (axisVectorRGB * axiscentre); - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - pos_on_axis[i] -= axiscentre; - axisright -= axiscentre; - axisleft -= axiscentre; - } - - // ------------------------------------------------------------------------------------- - // (8) Calculate the high and low output colour values - // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A - // straight rounded average is not correct, as the decompressor 'unrounds' by replicating - // the top bits to the bottom. 
- // In order to take account of this process, we don't just apply a straight rounding correction, - // but base our rounding on the input value (a straight rounding is actually pretty good in terms of - // error measure, but creates a visual colour and/or brightness shift relative to the original image) - // The method used here is to apply a centre-biased rounding dependent on the input value, which was - // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of - // the image. - // rgb = (average_rgb + (left|right)*axisVectorRGB); - // ------------------------------------------------------------------------------------- - { - CGU_Vec3f MinColor, MaxColor; - - MinColor = average_rgb + (axisVectorRGB * axisleft); - MaxColor = average_rgb + (axisVectorRGB * axisright); - MinColor.z = (MinColor.z * 2) - MinColor.y; - MaxColor.z = (MaxColor.z * 2) - MaxColor.y; - - cmp_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false); - - // Force to be a 4-colour opaque block - in which case, c0 is greater than c1 - swap = 0; - if (c0 < c1) - { - CGU_UINT32 t; - t = c0; - c0 = c1; - c1 = t; - swap = 1; - } - else if (c0 == c1) - { - // This block will always be encoded in 3-colour mode - // Need to ensure that only one of the two points gets used, - // avoiding accidentally setting some transparent pixels into the block - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - pos_on_axis[i] = axisleft; - } - - compressedBlock.x = c0 | (c1 << 16); - - // ------------------------------------------------------------------------------------- - // (9) Final clustering, creating the 2-bit values that define the output - // ------------------------------------------------------------------------------------- - - CGU_UINT32 index; - CGU_FLOAT division; - { - compressedBlock.y = 0; - division = axisright * 2.0f / 3.0f; - axiscentre = (axisleft + axisright) / 2; // Actually, this code only works if 
centre is 0 or approximately so - - CGU_FLOAT CompMinErr; - - // This feature is work in progress - // remap to BC1 spec for decoding offsets, - // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1 - // CGU_Vec3f cn[4]; - // cn[0] = MaxColor; - // cn[1] = MinColor; - // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f; - // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f; - - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - // Endpoints (indicated by block > average) are 0 and 1, while - // interpolants are 2 and 3 - if (fabs(pos_on_axis[i]) >= division) - index = 0; - else - index = 2; - // Positive is in the latter half of the block - if (pos_on_axis[i] >= axiscentre) - index += 1; - - index = index ^ swap; - // Set the output, taking swapping into account - compressedBlock.y |= (index << (2 * i)); - - // use err calc for use in higher quality code - //CompMinErr += dot(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]); - } - - //CompMinErr = CompMinErr * 0.0208333f; - - CompMinErr = CMP_RGBBlockError(rgbBlockUVf, compressedBlock, isSRGB); - Q1CompErr = CMP_RGBBlockError(rgbBlockUVf, Q1CompData, isSRGB); - - if (CompMinErr > Q1CompErr) - { - compressedBlock = Q1CompData; - CMP_PTRINOUT errout = Q1CompErr; - } - else - CMP_PTRINOUT errout = CompMinErr; - } - } - // done - - return compressedBlock; -} - -#ifndef CMP_USE_LOWQUALITY -static CMP_EndPoints CompressRGBBlock_Slow(CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4], - CGU_FLOAT Rpt[BLOCK_SIZE_4X4], - CGU_UINT32 dwUniqueColors, - CGU_Vec3f channelWeightsBGR, - CGU_UINT32 m_nRefinementSteps) -{ - CMP_UNUSED(channelWeightsBGR); - CMP_UNUSED(m_nRefinementSteps); - ALIGN_16 CGU_FLOAT Prj0[BLOCK_SIZE_4X4]; - ALIGN_16 CGU_FLOAT Prj[BLOCK_SIZE_4X4]; - ALIGN_16 CGU_FLOAT PrjErr[BLOCK_SIZE_4X4]; - ALIGN_16 CGU_FLOAT RmpIndxs[BLOCK_SIZE_4X4]; - - CGU_Vec3f LineDirG; - CGU_Vec3f LineDir; - CGU_FLOAT LineDir0[NUM_CHANNELS]; - CGU_Vec3f BlkUV[BLOCK_SIZE_4X4]; - CGU_Vec3f 
BlkSh[BLOCK_SIZE_4X4]; - CGU_Vec3f Mdl; - - CGU_Vec3f rsltC0; - CGU_Vec3f rsltC1; - CGU_Vec3f PosG0 = {0.0f, 0.0f, 0.0f}; - CGU_Vec3f PosG1 = {0.0f, 0.0f, 0.0f}; - CGU_UINT32 i; - - for (i = 0; i < dwUniqueColors; i++) - { - BlkUV[i] = BlkInBGRf_UV[i]; - } - - // if not more then 2 different colors, we've done - if (dwUniqueColors <= 2) - { - rsltC0 = BlkInBGRf_UV[0] * 255.0f; - rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; - } - else - { - // This is our first attempt to find an axis we will go along. - // The cumulation is done to find a line minimizing the MSE from the - // input 3D points. - - // While trying to find the axis we found that the diameter of the input - // set is quite small. Do not bother. - - // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors); - { - CGU_UINT32 ii; - CGU_UINT32 jj; - CGU_UINT32 kk; - - // These vars cannot be Vec3 as index to them are varying - CGU_FLOAT Crrl[NUM_CHANNELS]; - CGU_FLOAT RGB2[NUM_CHANNELS]; - - LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f; - - // sum position of all points - CGU_FLOAT fNumPoints = 0.0f; - for (ii = 0; ii < dwUniqueColors; ii++) - { - Mdl.x += BlkUV[ii].x * Rpt[ii]; - Mdl.y += BlkUV[ii].y * Rpt[ii]; - Mdl.z += BlkUV[ii].z * Rpt[ii]; - fNumPoints += Rpt[ii]; - } - - // and then average to calculate center coordinate of block - Mdl /= fNumPoints; - - for (ii = 0; ii < dwUniqueColors; ii++) - { - // calculate output block as offsets around block center - BlkSh[ii] = BlkUV[ii] - Mdl; - - // compute correlation matrix - // RGB2 = sum of ((distance from point from center) squared) - RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii]; - RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii]; - RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii]; - - Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii]; - Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii]; - Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii]; - } - - // if set's 
diameter is small - CGU_UINT32 i0 = 0, i1 = 1; - CGU_FLOAT mxRGB2 = 0.0f; - - CGU_FLOAT fEPS = fNumPoints * EPS; - for (kk = 0, jj = 0; jj < 3; jj++) - { - if (RGB2[jj] >= fEPS) - kk++; - else - RGB2[jj] = 0.0f; - - if (mxRGB2 < RGB2[jj]) - { - mxRGB2 = RGB2[jj]; - i0 = jj; - } - } - - CGU_FLOAT fEPS2 = fNumPoints * EPS2; - CGU_BOOL AxisIsSmall; - - AxisIsSmall = (RGB2[0] < fEPS2); - AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2); - AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2); - - // all are very small to avoid division on the small determinant - if (AxisIsSmall) - { - rsltC0 = BlkInBGRf_UV[0] * 255.0f; - rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; - } - else - { - // !AxisIsSmall - if (kk == 1) // really only 1 dimension - LineDir0[i0] = 1.; - else if (kk == 2) - { // really only 2 dimensions - i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3; - CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3]; - LineDir0[i1] = Crl / RGB2[i0]; - LineDir0[i0] = 1.; - } - else - { - CGU_FLOAT maxDet = 100000.f; - CGU_FLOAT Cs[3]; - // select max det for precision - for (jj = 0; jj < 3; jj++) - { - // 3 = nDimensions - CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj]; - Cs[jj] = fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3])); - if (maxDet < Det) - { - maxDet = Det; - i0 = jj; - } - } - - // inverse correl matrix - // -- -- -- -- - // | A B | | C -B | - // | B C | => | -B A | - // -- -- -- -- - CGU_FLOAT mtrx1[2][2]; - CGU_FLOAT vc1[2]; - CGU_FLOAT vc[2]; - vc1[0] = Crrl[(i0 + 2) % 3]; - vc1[1] = Crrl[(i0 + 1) % 3]; - // C - mtrx1[0][0] = RGB2[(i0 + 1) % 3]; - // A - mtrx1[1][1] = RGB2[i0]; - // -B - mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; - // find a solution - vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; - vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; - // normalize - vc[0] /= maxDet; - vc[1] /= maxDet; - // find a line direction vector - LineDir0[i0] = 1.; - LineDir0[(i0 + 1) % 3] = 1.; - 
LineDir0[(i0 + 2) % 3] = vc[0] + vc[1]; - } - - // normalize direction vector - CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2]; - Len = sqrt(Len); - - LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f; - LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f; - LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f; - } - } // FindAxisIsSmall - - // GCC is being an awful being when it comes to goto-jumps. - // So please bear with this. - CGU_FLOAT ErrG = 10000000.f; - CGU_FLOAT PrjBnd0; - CGU_FLOAT PrjBnd1; - ALIGN_16 CGU_FLOAT PreMRep[BLOCK_SIZE_4X4]; - - LineDir.x = LineDir0[0]; - LineDir.y = LineDir0[1]; - LineDir.z = LineDir0[2]; - - // Here is the main loop. - // 1. Project input set on the axis in consideration. - // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. - // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. - // 4. Present our color channels as 3 16DIM vectors. - // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector. - // 6. Plug the projections as a new directional vector for the axis. - // 7. Goto 1. - // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized). - // Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min , - // i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale - // you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector. - // Solution is - // Ai = (D . Ci) / (D . D); . - is a dot product. - // in 3 dim space Ai(s) represent a line direction, along which - // we again try to find (sub)optimal quantizer. - // That's what our for(;;) loop is about. - for (;;) - { - // 1. Project input set on the axis in consideration. 
- // From Foley & Van Dam: Closest point of approach of a line (P + v) to a - // point (R) is - // P + ((R-P).v) / (v.v))v - // The distance along v is therefore (R-P).v / (v.v) - // (v.v) is 1 if v is a unit vector. - // - PrjBnd0 = 1000.0f; - PrjBnd1 = -1000.0f; - for (i = 0; i < BLOCK_SIZE_4X4; i++) - Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; - - for (i = 0; i < dwUniqueColors; i++) - { - Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir); - PrjErr[i] = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]); - PrjBnd0 = min(PrjBnd0, Prj[i]); - PrjBnd1 = max(PrjBnd1, Prj[i]); - } - - // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal - // pair of end points. - - // min and max of the search interval - CGU_FLOAT Scl0; - CGU_FLOAT Scl1; - Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f; - Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f; - - // compute scaling factor to scale down the search interval to [0.,1] - const CGU_FLOAT Scl2 = (Scl1 - Scl0) * (Scl1 - Scl0); - const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0); - - for (i = 0; i < dwUniqueColors; i++) - { - // scale them - Prj[i] = (Prj[i] - Scl0) * overScl; - // premultiply the scale square to plug into error computation later - PreMRep[i] = Rpt[i] * Scl2; - } - - // scale first approximation of end points - PrjBnd0 = (PrjBnd0 - Scl0) * overScl; - PrjBnd1 = (PrjBnd1 - Scl0) * overScl; - - CGU_FLOAT StepErr = MAX_ERROR; - - // search step - CGU_FLOAT searchStep = 0.025f; - - // low Start/End; high Start/End - const CGU_FLOAT lowStartEnd = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f; - const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? 
PrjBnd1 + 2.f * searchStep : 1.f; - - // find the best endpoints - CGU_FLOAT Pos0 = 0; - CGU_FLOAT Pos1 = 0; - CGU_FLOAT lowPosStep, highPosStep; - CGU_FLOAT err; - - int l, h; - for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep) - { - for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep) - { - // compute an error for the current pair of end points. - err = cmp_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors); - - if (err < StepErr) - { - // save better result - StepErr = err; - Pos0 = lowPosStep; - Pos1 = highPosStep; - } - } - } - - // inverse the scaling - Pos0 = Pos0 * (Scl1 - Scl0) + Scl0; - Pos1 = Pos1 * (Scl1 - Scl0) + Scl0; - - // did we find somthing better from the previous run? - if (StepErr + 0.001 < ErrG) - { - // yes, remember it - ErrG = StepErr; - LineDirG = LineDir; - - PosG0.x = Pos0; - PosG0.y = Pos0; - PosG0.z = Pos0; - PosG1.x = Pos1; - PosG1.y = Pos1; - PosG1.z = Pos1; - - // 3. Compute the vector of indexes (or clusters) for the current - // approximate ramp. - // indexes - const CGU_FLOAT step = (Pos1 - Pos0) / 3.0f; // (dwNumChannels=4 - 1); - const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; - const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; - const CGU_FLOAT overBlkTp = 1.f / 3.0f; // (dwNumChannels=4 - 1); - - // here the index vector is computed, - // shifted and normalized - CGU_FLOAT indxAvrg = 3.0f / 2.0f; // (dwNumChannels=4 - 1); - - for (i = 0; i < dwUniqueColors; i++) - { - CGU_FLOAT del; - // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep); - if ((del = Prj0[i] - Pos0) <= 0) - RmpIndxs[i] = 0.f; - else if (Prj0[i] - Pos1 >= 0) - RmpIndxs[i] = 3.0f; // (dwNumChannels=4 - 1); - else - RmpIndxs[i] = floor((del + step_h) * rstep); - // shift and normalization - RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; - } - - // 4. Present our color channels as 3 16 DIM vectors. - // 5. 
Find closest aproximation of each of 16DIM color vector with the - // pojection of the 16DIM index vector. - CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f}; - CGU_FLOAT Len = 0.0f; - - for (i = 0; i < dwUniqueColors; i++) - { - const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i]; - Len += RmpIndxs[i] * PreMlt; - Crs.x += BlkSh[i].x * PreMlt; - Crs.y += BlkSh[i].y * PreMlt; - Crs.z += BlkSh[i].z * PreMlt; - } - - LineDir.x = LineDir.y = LineDir.z = 0.0f; - if (Len > 0.0f) - { - CGU_FLOAT Len2; - LineDir = Crs / Len; - // 6. Plug the projections as a new directional vector for the axis. - // 7. Goto 1. - Len2 = dot(LineDir, LineDir); // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z; - Len2 = sqrt(Len2); - LineDir /= Len2; - } - } - else // We was not able to find anything better. Drop out. - break; - } - - // inverse transform to find end-points of 3-color ramp - rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f; - rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f; - } // !isDone - - // We've dealt with (almost) unrestricted full precision realm. - // Now back digital world. 
- - // round the end points to make them look like compressed ones - CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f}; - CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f}; - CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z - CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x - CGU_FLOAT _Min = 0.0f; - CGU_FLOAT _Max = 255.0f; - - { - // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max); - - inpRmpEndPts0 = floor(rsltC0); - - if (inpRmpEndPts0.x <= _Min) - inpRmpEndPts0.x = _Min; - else - { - inpRmpEndPts0.x += floor(128.f / Fctrs1.x) - floor(inpRmpEndPts0.x / Fctrs1.x); - inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max); - } - if (inpRmpEndPts0.y <= _Min) - inpRmpEndPts0.y = _Min; - else - { - inpRmpEndPts0.y += floor(128.f / Fctrs1.y) - floor(inpRmpEndPts0.y / Fctrs1.y); - inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max); - } - if (inpRmpEndPts0.z <= _Min) - inpRmpEndPts0.z = _Min; - else - { - inpRmpEndPts0.z += floor(128.f / Fctrs1.z) - floor(inpRmpEndPts0.z / Fctrs1.z); - inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max); - } - - inpRmpEndPts0 = floor(inpRmpEndPts0 / Fctrs0) * Fctrs0; - - inpRmpEndPts1 = floor(rsltC1); - if (inpRmpEndPts1.x <= _Min) - inpRmpEndPts1.x = _Min; - else - { - inpRmpEndPts1.x += floor(128.f / Fctrs1.x) - floor(inpRmpEndPts1.x / Fctrs1.x); - inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max); - } - if (inpRmpEndPts1.y <= _Min) - inpRmpEndPts1.y = _Min; - else - { - inpRmpEndPts1.y += floor(128.f / Fctrs1.y) - floor(inpRmpEndPts1.y / Fctrs1.y); - inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max); - } - if (inpRmpEndPts1.z <= _Min) - inpRmpEndPts1.z = _Min; - else - { - inpRmpEndPts1.z += floor(128.f / Fctrs1.z) - floor(inpRmpEndPts1.z / Fctrs1.z); - inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max); - } - - inpRmpEndPts1 = floor(inpRmpEndPts1 / Fctrs0) * Fctrs0; - } // MkRmpOnGrid - - CMP_EndPoints EndPoints; - EndPoints.Color0 = 
inpRmpEndPts0; - EndPoints.Color1 = inpRmpEndPts1; - - return EndPoints; -} -#endif - -// Process a rgbBlock which is normalized (0.0f ... 1.0f), signed normal is not implemented -static CGU_Vec2ui CompressBlockBC1_RGBA_Internal(const CGU_Vec3f rgbBlockUVf[BLOCK_SIZE_4X4], - const CGU_FLOAT BlockA[BLOCK_SIZE_4X4], - CGU_Vec3f channelWeights, - CGU_UINT32 dwAlphaThreshold, - CGU_UINT32 m_nRefinementSteps, - CMP_IN CGU_FLOAT fquality, - CGU_BOOL isSRGB) -{ - CGU_Vec2ui cmpBlock = {0, 0}; - CGU_FLOAT errLQ = 1e6f; - - cmpBlock = CompressRGBBlock_FM(rgbBlockUVf, fquality, isSRGB, CMP_REFINOUT errLQ); - -#ifndef CMP_USE_LOWQUALITY - //------------------------------------------------------------------ - // Processing is in 0..255 range, code needs to be normized to 0..1 - //------------------------------------------------------------------ - if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2)) - { - CGU_Vec3f rgbBlock_normal[BLOCK_SIZE_4X4]; - CGU_UINT32 nCmpIndices = 0; - CGU_UINT32 c0, c1; - // High Quality - CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}}; - // Hold a err ref to lowest quality compression, to check if new compression is any better - CGU_Vec2ui Q1CompData = cmpBlock; - // High Quality - CGU_UINT32 i; - - ALIGN_16 CGU_FLOAT Rpt[BLOCK_SIZE_4X4]; - CGU_UINT32 pcIndices = 0; - - m_nRefinementSteps = 0; - - CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4]; // Normalized Block Input (0..1) in BGR channel format - // Default inidices & endpoints for Transparent Block - CGU_Vec3ui nEndpoints0 = {0, 0, 0}; // Endpoints are stored BGR as x,y,z - CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF}; // Endpoints are stored BGR as x,y,z - - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - Rpt[i] = 0.0f; - } - - //=============================================================== - // Check if we have more then 2 colors and process Alpha block - CGU_UINT32 dwColors = 0; - CGU_UINT32 dwBlk[BLOCK_SIZE_4X4]; - CGU_UINT32 R, G, B, A; - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - // Do any color 
conversion prior to processing the block - rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(rgbBlockUVf[i]) : rgbBlockUVf[i]; - - R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f); - G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f); - B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f); - - if (dwAlphaThreshold > 0) - A = (CGU_UINT32)BlockA[i]; - else - A = 255; - - // Punch Through Alpha in BC1 Codec (1 bit alpha) - if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold)) - { - // copy to local RGB data and have alpha set to 0xFF - dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B; - } - } - - if (!dwColors) - { - // All are colors transparent - EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f; - EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color0.z = 255.0f; - nCmpIndices = 0xFFFFFFFF; - } - else - { - // We have colors to process - nCmpIndices = 0; -// Punch Through Alpha Support ToDo -// CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4); -// bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for (dwNumChannels=4); -// if (bHasAlpha) { -// CGU_Vec2ui compBlock = {0xf800f800,0}; -// return compBlock; -// } - -// Here we are computing an unique number of sorted colors. -// For each unique value we compute the number of it appearences. 
-// qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp); -#ifndef ASPM_GPU - std::sort(dwBlk, dwBlk + 15); -#else - { - CGU_UINT32 j; - CMP_di what[BLOCK_SIZE_4X4]; - - for (i = 0; i < dwColors; i++) - { - what[i].index = i; - what[i].data = dwBlk[i]; - } - - CGU_UINT32 tmp_index; - CGU_UINT32 tmp_data; - - for (i = 1; i < dwColors; i++) - { - for (j = i; j > 0; j--) - { - if (what[j - 1].data > what[j].data) - { - tmp_index = what[j].index; - tmp_data = what[j].data; - what[j].index = what[j - 1].index; - what[j].data = what[j - 1].data; - what[j - 1].index = tmp_index; - what[j - 1].data = tmp_data; - } - } - } - for (i = 0; i < dwColors; i++) - dwBlk[i] = what[i].data; - } -#endif - CGU_UINT32 new_p; - CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4]; - CGU_UINT32 dwUniqueColors = 0; - new_p = dwBlkU[0] = dwBlk[0]; - Rpt[dwUniqueColors] = 1.f; - for (i = 1; i < dwColors; i++) - { - if (new_p != dwBlk[i]) - { - dwUniqueColors++; - new_p = dwBlkU[dwUniqueColors] = dwBlk[i]; - Rpt[dwUniqueColors] = 1.f; - } - else - Rpt[dwUniqueColors] += 1.f; - } - dwUniqueColors++; - - // Simple case of only 2 colors to process - // no need for futher processing as lowest quality methods work best for this case - if (dwUniqueColors <= 2) - { - return Q1CompData; - } - else - { - // switch from int range back to UV floats - for (i = 0; i < dwUniqueColors; i++) - { - R = (dwBlkU[i] >> 16) & 0xff; - G = (dwBlkU[i] >> 8) & 0xff; - B = (dwBlkU[i] >> 0) & 0xff; - BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f; - BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f; - BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f; - } - - CGU_Vec3f channelWeightsBGR; - channelWeightsBGR.x = channelWeights.z; - channelWeightsBGR.y = channelWeights.y; - channelWeightsBGR.z = channelWeights.x; - - EndPoints = CompressRGBBlock_Slow(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps); - } - } // colors - - //=================================================================== - // Process Cluster INPUT 
is constant EndPointsf OUTPUT is pcIndices - //=================================================================== - if (nCmpIndices == 0) - { - R = (CGU_UINT32)(EndPoints.Color0.z); - G = (CGU_UINT32)(EndPoints.Color0.y); - B = (CGU_UINT32)(EndPoints.Color0.x); - CGU_INT32 cluster0 = cmp_constructColor(R, G, B); - - R = (CGU_UINT32)(EndPoints.Color1.z); - G = (CGU_UINT32)(EndPoints.Color1.y); - B = (CGU_UINT32)(EndPoints.Color1.x); - CGU_INT32 cluster1 = cmp_constructColor(R, G, B); - - CGU_Vec3f InpRmp[NUM_ENDPOINTS]; - if ((cluster0 <= cluster1) // valid for 4 channels - // || (cluster0 > cluster1) // valid for 3 channels - ) - { - // inverse endpoints - InpRmp[0] = EndPoints.Color1; - InpRmp[1] = EndPoints.Color0; - } - else - { - InpRmp[0] = EndPoints.Color0; - InpRmp[1] = EndPoints.Color1; - } - - CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4]; - CGU_FLOAT srcblockA[BLOCK_SIZE_4X4]; - - // Swizzle the source RGB to BGR for processing - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f; - srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f; - srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f; - srcblockA[i] = 0.0f; - if (dwAlphaThreshold > 0) - { - CGU_UINT32 alpha = (CGU_UINT32)BlockA[i]; - if (alpha >= dwAlphaThreshold) - srcblockA[i] = BlockA[i]; - } - } - - // input ramp is on the coarse grid - // make ramp endpoints the way they'll going to be decompressed - CGU_Vec3f InpRmpL[NUM_ENDPOINTS]; - CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG - - { - // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp); - InpRmpL[0] = InpRmp[0] + floor(InpRmp[0] / Fctrs); - InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f); - InpRmpL[1] = InpRmp[1] + floor(InpRmp[1] / Fctrs); - InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f); - } // MkWkRmpPts - - // build ramp - CGU_Vec3f LerpRmp[4]; - CGU_Vec3f offset = {1.0f, 1.0f, 1.0f}; - { - //BldRmp(Rmp, InpRmpL, dwNumChannels); - // linear interpolate end points to get the ramp - 
LerpRmp[0] = InpRmpL[0]; - LerpRmp[3] = InpRmpL[1]; - LerpRmp[1] = floor((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f); - LerpRmp[2] = floor((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f); - } // BldRmp - - //========================================================================= - // Clusterize, Compute error and find DXTC indexes for the current cluster - //========================================================================= - { - // Clusterize - CGU_UINT32 alpha; - - // For each colour in the original block assign it - // to the closest cluster and compute the cumulative error - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - alpha = (CGU_UINT32)srcblockA[i]; - if ((dwAlphaThreshold > 0) && alpha == 0) - { //*((CGU_DWORD *)&_Blk[i][AC]) == 0) - pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4) - } - else - { - CGU_FLOAT shortest = 99999999999.f; - CGU_UINT8 shortestIndex = 0; - - CGU_Vec3f channelWeightsBGR; - channelWeightsBGR.x = channelWeights.z; - channelWeightsBGR.y = channelWeights.y; - channelWeightsBGR.z = channelWeights.x; - - for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++) - { - // r is either 1 or 4 - // calculate the distance for each component - CGU_FLOAT distance = - dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR)); - if (distance < shortest) - { - shortest = distance; - shortestIndex = rampindex; - } - } - - // The total is a sum of (error += shortest) - // We have the index of the best cluster, so assign this in the block - // Reorder indices to match correct DXTC ordering - if (shortestIndex == 3) // dwNumChannels - 1 - shortestIndex = 1; - else if (shortestIndex) - shortestIndex++; - pcIndices |= cmp_set2Bit32(shortestIndex, i); - } - } // BLOCK_SIZE_4X4 - } // Clusterize - } // Process Cluster - - //============================================================== - // Generate Compressed Result from nEndpoints & pcIndices - 
//============================================================== - R = (CGU_UINT32)(EndPoints.Color0.z); - G = (CGU_UINT32)(EndPoints.Color0.y); - B = (CGU_UINT32)(EndPoints.Color0.x); - c0 = cmp_constructColor(R, G, B); - - R = (CGU_UINT32)(EndPoints.Color1.z); - G = (CGU_UINT32)(EndPoints.Color1.y); - B = (CGU_UINT32)(EndPoints.Color1.x); - c1 = cmp_constructColor(R, G, B); - - // Get Processed indices if not set - if (nCmpIndices == 0) - nCmpIndices = pcIndices; - - if (c0 <= c1) - { - cmpBlock.x = c1 | (c0 << 16); - } - else - cmpBlock.x = c0 | (c1 << 16); - - cmpBlock.y = nCmpIndices; - - // Select best compression - CGU_FLOAT CompErr = CMP_RGBBlockError(rgbBlockUVf, cmpBlock, isSRGB); - if (CompErr > errLQ) - cmpBlock = Q1CompData; - } -#endif - return cmpBlock; -} - -//============================= Alpha: New single header interfaces: supports GPU shader interface ================================================== - -// Compress a BC1 block - Use new code in cmp_bc1.h -static CGU_Vec2ui CompressBlockBC1_UNORM(CGU_Vec3f rgbablockf[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CGU_BOOL isSRGB) -{ - CGU_FLOAT BlockA[BLOCK_SIZE_4X4]; // Not used but required - CGU_Vec3f channelWeights = {1.0f, 1.0f, 1.0f}; - - return CompressBlockBC1_RGBA_Internal(rgbablockf, - BlockA, // ToDo support nullptr - channelWeights, - 0, - 1, - fquality, - isSRGB); -} - -// Compress a BC2 block -static CGU_Vec4ui CompressBlockBC2_UNORM(CMP_IN CGU_Vec3f BlockRGB[BLOCK_SIZE_4X4], - CMP_IN CGU_FLOAT BlockA[BLOCK_SIZE_4X4], - CGU_FLOAT fquality, - CGU_BOOL isSRGB) -{ - CGU_Vec2ui compressedBlocks; - CGU_Vec4ui compBlock; - compressedBlocks = cmp_compressExplicitAlphaBlock(BlockA); - compBlock.x = compressedBlocks.x; - compBlock.y = compressedBlocks.y; - - CGU_Vec3f channelWeights = {1.0f, 1.0f, 1.0f}; - compressedBlocks = CompressBlockBC1_RGBA_Internal(BlockRGB, BlockA, channelWeights, 0, 1, fquality, isSRGB); - compBlock.z = compressedBlocks.x; - compBlock.w = compressedBlocks.y; - return 
compBlock; -} - -// Compress a BC3 block -static CGU_Vec4ui CompressBlockBC3_UNORM(CMP_IN CGU_Vec3f BlockRGB[BLOCK_SIZE_4X4], - CMP_IN CGU_FLOAT BlockA[BLOCK_SIZE_4X4], - CGU_FLOAT fquality, - CGU_BOOL isSRGB) -{ - CGU_Vec4ui compBlock; - CGU_Vec2ui cmpBlock; - - cmpBlock = cmp_compressAlphaBlock(BlockA, fquality, FALSE); - compBlock.x = cmpBlock.x; - compBlock.y = cmpBlock.y; - - CGU_Vec2ui compressedBlocks; - compressedBlocks = CompressBlockBC1_UNORM(BlockRGB, fquality, isSRGB); - compBlock.z = compressedBlocks.x; - compBlock.w = compressedBlocks.y; - return compBlock; -} - -// Compress a BC4 block -static CGU_Vec2ui CompressBlockBC4_UNORM(CMP_IN CGU_FLOAT Block[BLOCK_SIZE_4X4], CGU_FLOAT fquality) -{ - CGU_Vec2ui cmpBlock; - cmpBlock = cmp_compressAlphaBlock(Block, fquality, FALSE); - return cmpBlock; -} - -// Compress a BC4 block -static CGU_Vec2ui CompressBlockBC4_SNORM(CMP_IN CGU_FLOAT Block[BLOCK_SIZE_4X4], CGU_FLOAT fquality) -{ - CGU_Vec2ui cmpBlock; - cmpBlock = cmp_compressAlphaBlock(Block, fquality, TRUE); - return cmpBlock; -} - -// Compress a BC5 block -static CGU_Vec4ui CompressBlockBC5_UNORM(CMP_IN CGU_FLOAT BlockU[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT BlockV[BLOCK_SIZE_4X4], CGU_FLOAT fquality) -{ - CGU_Vec4ui compressedBlock = {0, 0, 0, 0}; - CGU_Vec2ui cmpBlock; - cmpBlock = cmp_compressAlphaBlock(BlockU, fquality, FALSE); - compressedBlock.x = cmpBlock.x; - compressedBlock.y = cmpBlock.y; - - cmpBlock = cmp_compressAlphaBlock(BlockV, fquality, FALSE); - compressedBlock.z = cmpBlock.x; - compressedBlock.w = cmpBlock.y; - - return compressedBlock; -} - -// Compress a BC6 & BC7 UNORM block ToDo - -#endif diff --git a/WickedEngine/shaders/compressonator/common_def.h b/WickedEngine/shaders/compressonator/common_def.h deleted file mode 100644 index 53105955c..000000000 --- a/WickedEngine/shaders/compressonator/common_def.h +++ /dev/null @@ -1,2282 +0,0 @@ -// 
================================================================================================== -// Copyright (c) 2007-2021 Advanced Micro Devices, Inc. All rights reserved. -// Copyright (c) 2004-2006 ATI Technologies Inc. -// Copyright (c) <2014> -// ================================================================================================== -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files(the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions : -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. -// -// -// File Name: Common_Def -// Description: common definitions used for CPU/HPC/GPU -// -// Pull changes: -// Fixed build of cmp_core on Mac. (#164) -// - -#ifndef CMP_COMMON_DEFINITIONS_H -#define CMP_COMMON_DEFINITIONS_H - -//#define USE_CMP_FIDELITY_FX_H - -// Proxy ISPC compiler (Warning! Not all ASPM features will be available : expect build errors for specialized ASPM code! 
-#ifdef ISPC -#define ASPM -#endif - -// Using OpenCL Compiler -#ifdef __OPENCL_VERSION__ -#define ASPM_OPENCL -#endif - -#ifdef USE_CMP_FIDELITY_FX_H - -// Fidelity FX SDK mapping -#if defined(A_CPU) -#ifndef ASPM_CPU -#define ASPM_CPU -#endif -#endif - -#if defined(A_GPU) -#ifndef ASPM_GPU -#define ASPM_GPU -#endif -#endif - -#if defined(A_GLS) -#ifndef ASPM_GLS -#define ASPM_GLS -#endif -#endif - -#if defined(A_HLSL) -#ifndef ASPM_HLSL -#define ASPM_HLSL -#endif -#endif - -#if defined(A_GCC) -#ifndef ASPM_GCC -#define ASPM_GCC -#endif -#endif - -#endif - -#if (defined(ASPM_HLSL) || defined(ASPM_OPENCL)) -#ifndef ASPM_GPU -#define ASPM_GPU -#endif -#endif - - -//======================================= -// Skip CMP if only using FidelityFX Code -//======================================= -#if !(defined(A_GPU) || defined(A_GLS) || defined(A_HLSL) || defined(A_GCC)) - -// The shaders for UE4 require extension in the form of .ush in place of standard .h -// this directive is used to make the change without users requiring to modify all of the include extensions -// specific to UE4 - -#ifdef ASPM_HLSL_UE4 -#pragma once -#define INC_cmp_math_vec4 "cmp_math_vec4.ush" -#define INC_cmp_math_func "cmp_math_func.ush" -#else -#define INC_cmp_math_vec4 "cmp_math_vec4.h" -#define INC_cmp_math_func "cmp_math_func.h" -#endif - -#if defined(__unix__) || defined(__APPLE__) -#ifndef _LINUX -#define _LINUX -#endif -#ifdef ASPM_GPU -#undef ASPM_GPU -#endif -#include -#include -#include -#include INC_cmp_math_vec4 -#endif - -#ifdef _WIN32 -//#define USE_ASPM_CODE -#include -#endif - -#ifndef CMP_MAX -#define CMP_MAX(x, y) (((x) > (y)) ? (x) : (y)) -#endif - -#ifndef CMP_MIN -#define CMP_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) -#endif - -#ifndef cmp_isnan -#ifdef ASPM_GPU -#define cmp_isnan(x) isnan(x) -#else -#define cmp_isnan(x) std::isnan(x) -#endif -#endif - -#ifdef ASPM_GPU -#define CMP_STATIC_CAST(x, y) (x)(y) -#define CMP_TYPE_CAST(x) (x) -#else -#define CMP_STATIC_CAST(x, y) static_cast(y) -#define CMP_TYPE_CAST(x) (x&) -#endif - -// Sets mapping BC1, BC2 & BC3 to decode Red,Green,Blue and Alpha -// RGBA to channels [0,1,2,3] else BGRA maps to [0,1,2,3] -// BC4 alpha always maps as AAAA to channels [0,1,2,3] -// BC5 decoded (Red&Green) maps R,G,B=0,A=255 to [0,1,2,3] else maps [B=0,G,R,A=255] to [0,1,2,3] -#define CMP_SET_BC13_DECODER_RGBA -#define CMP_FLOAT_MAX 3.402823466e+38F // max value used to detect an Error in processing -#define CMP_FLOAT_MAX_EXP 38 -#define USE_PROCESS_SEPERATE_ALPHA // Enable this to use higher quality code using CompressDualIndexBlock -#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes -#define MAX_DIMENSION_BIG 4 // Max number of channels (RGBA) -#define MAX_SUBSETS 3 // Maximum number of possible subsets -#define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset -#define BLOCK_SIZE_4X4X4 64 -#define BLOCK_SIZE_4X4 16 -#define BlockX 4 -#define BlockY 4 -//#define USE_BLOCK_LINEAR // Source Data is organized in linear form for each block : Experimental Code not fully developed -//#define USE_DOUBLE // Default is to use float, enable to use double data types only for float definitions - -//--------------------------------------------- -// Predefinitions for GPU and CPU compiled code -//--------------------------------------------- -#ifdef ASPM_HLSL -// ==== Vectors ==== -typedef float2 CGU_Vec2f; -typedef float2 CGV_Vec2f; -typedef float3 CGU_Vec3f; -typedef float3 CGV_Vec3f; -typedef float4 CGU_Vec4f; -typedef float4 CGV_Vec4f; - -typedef int2 CGU_Vec2i; -typedef int2 CGV_Vec2i; -typedef uint2 CGU_Vec2ui; -typedef uint2 CGV_Vec2ui; - -typedef int3 CGU_Vec3i; -typedef int3 CGV_Vec3i; -typedef uint3 
CGU_Vec3ui; -typedef uint3 CGV_Vec3ui; - -typedef int4 CGU_Vec4i; -typedef int4 CGV_Vec4i; - -typedef int4 CGU_Vec4uc; -typedef int4 CGV_Vec4uc; - -typedef uint4 CGU_Vec4ui; -typedef uint4 CGV_Vec4ui; - -// ==== Scalar Types ==== to remove from code -typedef int CGU_INT8; -typedef int CGU_INT; -typedef int CGV_INT; -typedef uint CGU_UINT8; -typedef uint CGU_UINT; - -// ==== Scalar Types ==== -typedef int CGU_BOOL; -typedef int CGV_BOOL; -typedef int CGV_INT8; -typedef int CGV_UINT8; -typedef uint CGU_UINT16; -typedef int CGU_INT32; -typedef int CGV_INT32; -typedef uint CGU_UINT32; -typedef uint CGV_UINT32; -typedef float CGV_FLOAT; -typedef float CGU_FLOAT; -typedef min16float CGU_MIN16_FLOAT; // FP16 GPU support defaults to 32 bit if no HW support - -#define TRUE 1 -#define FALSE 0 -#define CMP_CDECL - -#define BC7_ENCODECLASS -#define CMP_EXPORT -#define INLINE inline -#define uniform -#define varying -#define CMP_GLOBAL -#define CMP_KERNEL -#define CMP_CONSTANT const -#define CMP_STATIC static -#define CMP_REFINOUT -#define CMP_PTRINOUT -#define CMP_INOUT inout -#define CMP_OUT out -#define CMP_IN in -#define CMP_UNUSED(x) (x); -#define CMP_UNROLL [unroll] -#define CMP_SVGROUPINDEX :SV_GroupIndex -#define CMP_SVGROUPID :SV_GroupID -#define CMP_NUMTHREADS(x, y, z) [numthreads(x, y, z)] - -#else - -#define CMP_SVGROUPINDEX -#define CMP_SVGROUPID -#define CMP_NUMTHREADS(x, y, z) - -typedef enum -{ - CGU_CORE_OK = 0, // No errors, call was successfull - CGU_CORE_ERR_UNKOWN, // An unknown error occurred - CGU_CORE_ERR_NEWMEM, // New Memory Allocation Failed - CGU_CORE_ERR_INVALIDPTR, // The pointer value used is invalid or null - CGU_CORE_ERR_RANGERED, // values for Red Channel is out of range (too high or too low) - CGU_CORE_ERR_RANGEGREEN, // values for Green Channel is out of range (too high or too low) - CGU_CORE_ERR_RANGEBLUE, // values for Blue Channel is out of range (too high or too low) -} CGU_ERROR_CODES; - -#ifdef ASPM_OPENCL // GPU Based code using OpenCL 
-// ==== Vectors ==== -typedef float2 CGU_Vec2f; -typedef float2 CGV_Vec2f; -typedef float3 CMP_Vec3f; -typedef float3 CGU_Vec3f; -typedef float3 CGV_Vec3f; -typedef float4 CGU_Vec4f; -typedef float4 CGV_Vec4f; - -typedef uchar3 CGU_Vec3uc; -typedef uchar3 CGV_Vec3uc; - -typedef uchar4 CMP_Vec4uc; -typedef uchar4 CGU_Vec4uc; -typedef uchar4 CGV_Vec4uc; - -typedef int2 CGU_Vec2i; -typedef int2 CGV_Vec2i; -typedef int3 CGU_Vec3i; -typedef int3 CGV_Vec3i; -typedef int4 CGU_Vec4i; -typedef int4 CGV_Vec4i; - - -typedef uint2 CGU_Vec2ui; -typedef uint2 CGV_Vec2ui; -typedef uint3 CGU_Vec3ui; -typedef uint3 CGV_Vec3ui; -typedef uint4 CGU_Vec4ui; -typedef uint4 CGV_Vec4ui; - -#define USE_BC7_SP_ERR_IDX -#define BC7_ENCODECLASS -#define ASPM_PRINT(args) printf args - -#define CMP_EXPORT -#define INLINE -#define uniform -#define varying -#define CMP_GLOBAL __global -#define CMP_KERNEL __kernel -#define CMP_CONSTANT __constant -#define CMP_STATIC -#define CMP_REFINOUT & -#define CMP_PTRINOUT * -#define CMP_INOUT -#define CMP_OUT -#define CMP_IN -#define CMP_UNUSED(x) -#define CMP_UNROLL - -typedef unsigned int CGU_DWORD; //32bits -typedef int CGU_INT; //32bits -typedef bool CGU_BOOL; -typedef unsigned short CGU_SHORT; //16bits -typedef float CGU_FLOAT; -typedef half CGU_MIN16_FLOAT; // FP16 GPU support defaults to 32 bit if no HW support -typedef unsigned int uint32; // need to remove this def - -typedef int CGV_INT; -typedef unsigned int CGU_UINT; -typedef int CGUV_INT; -typedef int CGV_BOOL; - -typedef char CGU_INT8; -typedef unsigned char CGU_UINT8; -typedef short CGU_INT16; -typedef unsigned short CGU_UINT16; -typedef int CGU_INT32; -typedef unsigned int CGU_UINT32; -typedef unsigned long long CGU_UINT64; - -typedef char CGV_INT8; -typedef unsigned char CGV_UINT8; -typedef short CGV_INT16; -typedef unsigned short CGV_UINT16; -typedef int CGV_INT32; -typedef unsigned int CGV_UINT32; -typedef unsigned long CGV_UINT64; - -typedef float CGV_FLOAT; - -#define TRUE 1 -#define 
FALSE 0 -#define CMP_CDECL - -#else -// CPU & ASPM definitions - -#define CMP_REFINOUT & -#define CMP_PTRINOUT * -#define CMP_INOUT -#define CMP_OUT -#define CMP_IN -#define CMP_UNUSED(x) (void)(x); -#define CMP_UNROLL - -#ifdef ASPM // SPMD ,SIMD CPU code -// using hybrid (CPU/GPU) aspm compiler -#define ASPM_PRINT(args) print args -#define CMP_USE_FOREACH_ASPM -#define __ASPM__ -#define BC7_ENCODECLASS - -#define USE_BC7_SP_ERR_IDX -//#define USE_BC7_RAMP - -#define CMP_EXPORT export -#define TRUE true -#define FALSE false -typedef uniform bool CGU_BOOL; -typedef bool CGV_BOOL; - -typedef unsigned int8 uint8; -typedef unsigned int16 uint16; -typedef unsigned int32 uint32; -typedef unsigned int64 uint64; -typedef uniform float CGU_FLOAT; -typedef varying float CGV_FLOAT; -typedef uniform float CGU_MIN16_FLOAT; - -typedef uniform uint16 CGU_UINT16; - -typedef uniform uint8 CGU_UINT8; -typedef varying uint8 CGV_UINT8; - -typedef uniform uint64 CGU_UINT64; - -typedef CGV_UINT8<4> CGV_Vec4uc; -typedef CGU_UINT8<4> CGU_Vec4uc; - -typedef CGU_FLOAT<2> CGU_Vec2f; -typedef CGV_FLOAT<2> CGV_Vec2f; -typedef CGU_FLOAT<3> CGU_Vec3f; -typedef CGV_FLOAT<3> CGV_Vec3f; -typedef CGU_FLOAT<4> CGU_Vec4f; -typedef CGV_FLOAT<4> CGV_Vec4f; - -typedef CGU_UINT32<3> CGU_Vec3ui; -typedef CGV_UINT32<3> CGV_Vec3ui; - -typedef CGU_UINT32<4> CGU_Vec4ui; -typedef CGV_UINT32<4> CGV_Vec4ui; - -#define CMP_CDECL -#else // standard CPU code -#include -#include -#include INC_cmp_math_vec4 - -// using CPU compiler -#define ASPM_PRINT(args) printf args -#define USE_BC7_RAMP -#define USE_BC7_SP_ERR_IDX - -#define CMP_EXPORT -#define BC7_ENCODECLASS BC7_EncodeClass:: -#define TRUE 1 -#define FALSE 0 -#define uniform -#define varying - -typedef char int8; -typedef short int16; -typedef int int32; -typedef long long int64; -typedef unsigned char uint8; -typedef unsigned short uint16; -typedef unsigned int uint32; -typedef unsigned long long uint64; - -typedef int8 CGV_BOOL; -typedef bool CGU_BOOL; 
-typedef int16 CGU_WORD; -typedef uint8 CGU_SHORT; -typedef long CGU_LONG; -typedef unsigned long CGU_ULONG; - -typedef uniform float CGU_FLOAT; -typedef varying float CGV_FLOAT; -typedef uniform float CGU_MIN16_FLOAT; - -typedef uniform uint8 CGU_UINT8; -typedef varying uint8 CGV_UINT8; - -typedef uniform uint16 CGU_UINT16; - -typedef CMP_Vec3ui CGU_Vec3ui; -typedef CMP_Vec3ui CGV_Vec3ui; - -typedef CMP_Vec2f CGU_Vec2f; -typedef CMP_Vec3f CGU_Vec3f; - -typedef CMP_Vec4uc CGU_Vec4uc; -typedef CMP_Vec4uc CGV_Vec4uc; -typedef CMP_Vec4i CGU_Vec4i; -typedef CMP_Vec4ui CGU_Vec4ui; -typedef CMP_Vec4ui CGV_Vec4ui; -typedef CMP_Vec4f CGU_Vec4f; -typedef CMP_Vec4f CGV_Vec4f; - -#if defined(WIN32) || defined(_WIN64) -#define CMP_CDECL __cdecl -#else -#define CMP_CDECL -#endif -#endif - -// Common CPU & ASPM definitions -#define CMP_ASSERT(arg) - -#define CMP_GLOBAL - -#define CMP_KERNEL -#define __local const -#define __constant const -#define CMP_CONSTANT const -#define INLINE inline -#define CMP_STATIC static - -typedef uniform int32 CGU_DWORD; -typedef uniform uint8 CGU_UBYTE; -typedef uniform int CGU_INT; -typedef uniform int8 CGU_INT8; - -typedef uniform int16 CGU_INT16; -typedef uniform uint16 CGU_UINT16; -typedef uniform int32 CGU_INT32; -typedef uniform uint32 CGU_UINT32; -typedef uniform uint64 CGU_UINT64; - -typedef int CGV_INT; -typedef int8 CGV_INT8; -typedef int16 CGV_INT16; -typedef int32 CGV_INT32; -typedef uint16 CGV_UINT16; -typedef uint32 CGV_UINT32; -typedef uint64 CGV_UINT64; - -#endif // else ASPM_GPU - -#define CMP_UNIFORM uniform -#define CMP_VARYING varying - -typedef struct -{ - CGU_UINT32 m_src_width; - CGU_UINT32 m_src_height; - CGU_UINT32 m_width_in_blocks; - CGU_UINT32 m_height_in_blocks; - CGU_FLOAT m_fquality; -} Source_Info; - -typedef unsigned char* CGU_PTR; - -// Ref Compute_CPU_HPC -struct texture_surface -{ - CGU_PTR ptr; - CGU_INT width, height, stride; - CGU_INT channels; -}; -#endif // else ASPM_HLSL - -#endif // USE CMP defines - 
-//=======================================
-// using FidelityFX Code
-//=======================================
-#ifdef USE_CMP_FIDELITY_FX_H
-// ==================================================================================================
-// [A] SHADER PORTABILITY 1.20190530
-// ==================================================================================================
-// ABOUT
-// =====
-// Common central point for high-level shading language and C portability for various shader headers.
-//---------------------------------------------------------------------------------------------------
-// DEFINES TYPES FOR CMP AND FIDELITY FX
-// Both ASPM_ and A_ types are interchangeable in code; the table below shows compatibility between types
-// ==================================================================================================
-// Compressonator   Fidelity FX   Comments
-// ASPM_CPU         A_CPU         Include the CPU related code.
-// ASPM_GPU         A_GPU         Include the GPU related code.
-// ASPM_GLSL        A_GLSL        Using GLSL.
-// ASPM_HLSL        A_HLSL        Using HLSL.
-// -                A_GCC         Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
-// =======
-// CGU_UINT8        A_BYTE        Support 8-bit integer.
-// CGU_HALF         A_HALF        Support 16-bit integer and floating point.
-//                  A_LONG        Support 64-bit integer. (GLSL & CPU only)
-//                  A_DUBL        Support 64-bit floating point.
-// =======
-//                  A_WAVE        Support wave-wide operations.
-//--------------------------------------------------------------------------------------------------
-// To get #include "common_def.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'
-//---------------------------------------------------------------------------------------------------
-// SIMPLIFIED TYPE SYSTEM FOR FX CODE
-// ==================================================================================================
-// - All ints will be unsigned with exception of when signed is required.
-// - Type naming simplified and shortened "A<type><#components>",
-//    - H = 16-bit float (half)
-//    - F = 32-bit float (float)
-//    - D = 64-bit float (double)
-//    - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
-//    - B = 8-bit integer (byte)
-//    - W = 16-bit integer (word)
-//    - U = 32-bit integer (unsigned)
-//    - L = 64-bit integer (long)
-// - Using "AS<type><#components>" for signed when required.
-//---------------------------------------------------------------------------------------------------
-// TODO
-// ====
-// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
-// - Add subgroup ops.
-//------------------------------------------------------------------------------------------------------------------------------
-// CHANGE LOG
-// ==========
-// 20210518 - Merged CMP Common_def.h with ffx_a.h
-//==============================================================================================================================
-
-
-#define A_2PI 6.28318530718 // 2xPI
-
-//==============================================================================================================================
-// CPU
-//==============================================================================================================================
-// This provides a minimum subset of functionality compared to the GPU parts.
-//==============================================================================================================================
-#if defined(A_CPU) || !(defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL))
-
-#include "stdint.h"
-
-// Supporting user defined overrides.
-#ifndef A_RESTRICT
-#define A_RESTRICT __restrict // CMP_RESTRICT - default pointer qualifier; define A_RESTRICT before this point to override it
-#endif
-//------------------------------------------------------------------------------------------------------------------------------
-#ifndef A_STATIC
-#define A_STATIC static // CMP_STATIC - default linkage for the helper functions; define A_STATIC before this point to override it
-#endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Same types across CPU and GPU.
- // Predicate uses 32-bit integer (C friendly bool).
- typedef uint32_t AP1; // CGU_UINT32 - predicate
- typedef float AF1; // CGU_FLOAT - 32-bit float
- typedef double AD1; // CGU_DOUBLE - 64-bit float
- typedef uint8_t AB1; // CGU_UINT8 - byte
- typedef uint16_t AW1; // CGU_UINT16 - word
- typedef uint32_t AU1; // CGU_UINT32 - 32-bit unsigned
- typedef uint64_t AL1; // CGU_UINT64 - 64-bit unsigned
- typedef int8_t ASB1; // CGU_INT8 - signed byte
- typedef int16_t ASW1; // CGU_INT16 - signed word
- typedef int32_t ASU1; // CGU_INT32 - signed 32-bit
- typedef int64_t ASL1; // CGU_INT64 - signed 64-bit
-
-//------------------------------------------------------------------------------------------------------------------------------
- #define AD1_(a) ((AD1)(a)) // cast helpers: <type>_(a) converts 'a' to <type> with a C-style cast
- #define AF1_(a) ((AF1)(a))
- #define AL1_(a) ((AL1)(a))
- #define AU1_(a) ((AU1)(a))
-//------------------------------------------------------------------------------------------------------------------------------
- #define ASL1_(a) ((ASL1)(a)) // signed counterparts of the cast helpers above
- #define ASU1_(a) ((ASU1)(a))
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} // returns the bits of float 'a' as an AU1 by writing one union member and reading the other
-//------------------------------------------------------------------------------------------------------------------------------
- #define A_TRUE 1 // integer truth values (predicates are 32-bit integers, see AP1)
- #define A_FALSE 0
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-//
-// CPU/GPU PORTING
-//
-//------------------------------------------------------------------------------------------------------------------------------
-// Hackery to get CPU and GPU to share all setup code, without duplicate code paths.
-// Unfortunately this is the level of "ugly" that is required since the languages are very different.
-// This uses a lower-case prefix for special vector constructs.
-// - In C restrict pointers are used.
-// - In the shading language, in/inout/out arguments are used.
-// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
-//==============================================================================================================================
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
-//==============================================================================================================================
- #define retAD2 AD1 *A_RESTRICT // ret*: return type of a vector function on the CPU - a restrict pointer to the scalar element type
- #define retAD3 AD1 *A_RESTRICT
- #define retAD4 AD1 *A_RESTRICT
- #define retAF2 AF1 *A_RESTRICT
- #define retAF3 AF1 *A_RESTRICT
- #define retAF4 AF1 *A_RESTRICT
- #define retAL2 AL1 *A_RESTRICT
- #define retAL3 AL1 *A_RESTRICT
- #define retAL4 AL1 *A_RESTRICT
- #define retAU2 AU1 *A_RESTRICT
- #define retAU3 AU1 *A_RESTRICT
- #define retAU4 AU1 *A_RESTRICT
-//------------------------------------------------------------------------------------------------------------------------------
- #define inAD2 AD1 *A_RESTRICT // in*: input vector parameter; on the CPU it expands to the same restrict pointer as ret*
- #define inAD3 AD1 *A_RESTRICT
- #define inAD4 AD1 *A_RESTRICT
- #define inAF2 AF1 *A_RESTRICT
- #define inAF3 AF1 *A_RESTRICT
- #define inAF4 AF1 *A_RESTRICT
- #define inAL2 AL1 *A_RESTRICT
- #define inAL3 AL1 *A_RESTRICT
- #define inAL4 AL1 *A_RESTRICT
- #define inAU2 AU1 *A_RESTRICT
- #define inAU3 AU1 *A_RESTRICT
- #define inAU4 AU1 *A_RESTRICT
-//------------------------------------------------------------------------------------------------------------------------------
- #define inoutAD2 AD1 *A_RESTRICT // inout*: read/write vector parameter; same expansion as in* on the CPU
- #define inoutAD3 AD1 *A_RESTRICT
- #define inoutAD4 AD1 *A_RESTRICT
- #define inoutAF2 AF1 *A_RESTRICT
- #define inoutAF3 AF1 *A_RESTRICT
- #define inoutAF4 AF1 *A_RESTRICT
- #define inoutAL2 AL1 *A_RESTRICT
- #define inoutAL3 AL1 *A_RESTRICT
- #define inoutAL4 AL1 *A_RESTRICT
- #define inoutAU2 AU1 *A_RESTRICT
- #define inoutAU3 AU1 *A_RESTRICT
- #define inoutAU4 AU1 *A_RESTRICT
-//------------------------------------------------------------------------------------------------------------------------------
- #define outAD2 AD1 *A_RESTRICT // out*: output vector parameter; same expansion as in* on the CPU
- #define outAD3 AD1 *A_RESTRICT
- #define outAD4 AD1 *A_RESTRICT
- #define outAF2 AF1 *A_RESTRICT
- #define outAF3 AF1 *A_RESTRICT
- #define outAF4 AF1 *A_RESTRICT
- #define outAL2 AL1 *A_RESTRICT
- #define outAL3 AL1 *A_RESTRICT
- #define outAL4 AL1 *A_RESTRICT
- #define outAU2 AU1 *A_RESTRICT
- #define outAU3 AU1 *A_RESTRICT
- #define outAU4 AU1 *A_RESTRICT
-//------------------------------------------------------------------------------------------------------------------------------
- #define varAD2(x) AD1 x[2] // var*: declares vector variable 'x' as a fixed-size array of the scalar type
- #define varAD3(x) AD1 x[3]
- #define varAD4(x) AD1 x[4]
- #define varAF2(x) AF1 x[2]
- #define varAF3(x) AF1 x[3]
- #define varAF4(x) AF1 x[4]
- #define varAL2(x) AL1 x[2]
- #define varAL3(x) AL1 x[3]
- #define varAL4(x) AL1 x[4]
- #define varAU2(x) AU1 x[2]
- #define varAU3(x) AU1 x[3]
- #define varAU4(x) AU1 x[4]
-//------------------------------------------------------------------------------------------------------------------------------
- #define initAD2(x,y) {x,y} // init*: brace initializer listing the vector components
- #define initAD3(x,y,z) {x,y,z}
- #define initAD4(x,y,z,w) {x,y,z,w}
- #define initAF2(x,y) {x,y}
- #define initAF3(x,y,z) {x,y,z}
- #define initAF4(x,y,z,w) {x,y,z,w}
- #define initAL2(x,y) {x,y}
- #define initAL3(x,y,z) {x,y,z}
- #define initAL4(x,y,z,w) {x,y,z,w}
- #define initAU2(x,y) {x,y}
- #define initAU3(x,y,z) {x,y,z}
- #define initAU4(x,y,z,w) {x,y,z,w}
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-// SCALAR RETURN OPS
-//------------------------------------------------------------------------------------------------------------------------------
-// TODO
-// ====
-// - Replace transcendentals with manual versions.
-//==============================================================================================================================
- // Absolute value. The 'S' variants take the unsigned A type, cast it to the signed type for the operation,
- // and cast the result back to the unsigned type.
- #ifdef A_GCC
- A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
- A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
- A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
- // The operand is a 64-bit ASL1, so the 'long long' builtin is required: 'long' is only 32 bits on LLP64
- // targets, where __builtin_labs would truncate. This also matches llabs() in the non-GCC branch below.
- A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));}
- #else
- A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
- A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
- A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
- A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs(ASL1_(a)));}
- #endif
-//------------------------------------------------------------------------------------------------------------------------------
- // Cosine.
- #ifdef A_GCC
- A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
- A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
- #else
- A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
- A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
- #endif
-//------------------------------------------------------------------------------------------------------------------------------
- // Dot product of two 2-, 3- or 4-component vectors passed as pointers.
- A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];}
- A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
- A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
- A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
- A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
- A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
-//------------------------------------------------------------------------------------------------------------------------------
- // Base-2 exponential.
- #ifdef A_GCC
- A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
- A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
- #else
- A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
- A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
- #endif
-//------------------------------------------------------------------------------------------------------------------------------
- // Floor.
- #ifdef A_GCC
- A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
- A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
- #else
- A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
- A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
- #endif
-//------------------------------------------------------------------------------------------------------------------------------
- // Linear interpolation from a to b by factor c, written in the 'b*c+(-a*c+a)' form.
- A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);}
- A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
-//------------------------------------------------------------------------------------------------------------------------------
- // Base-2 logarithm.
- #ifdef A_GCC
- A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
- A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
- #else
- A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
- A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
- #endif
-//------------------------------------------------------------------------------------------------------------------------------
- // Larger of a and b, compared in the argument type (AL1 and AU1 compare as unsigned).
- A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
- A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
- A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
- A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
-//------------------------------------------------------------------------------------------------------------------------------
- // These follow the convention that A integer types don't have signage, until they are operated on.
- A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
- A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
-//------------------------------------------------------------------------------------------------------------------------------
- // NOTE(review): the AMin*, ARcp* and AShrSL1 definitions below were restored after the text between '<' and '>'
- // was lost from this section; confirm them against the upstream header.
- // Smaller of a and b, compared in the argument type.
- A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
- A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
- A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
- A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
-//------------------------------------------------------------------------------------------------------------------------------
- // Smaller of a and b, compared as signed values.
- A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
- A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
-//------------------------------------------------------------------------------------------------------------------------------
- // Reciprocal.
- A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
- A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
-//------------------------------------------------------------------------------------------------------------------------------
- // Right shift performed on the signed interpretation of a, result cast back to the unsigned type.
- A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
- A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
-//------------------------------------------------------------------------------------------------------------------------------
- // Sine.
- #ifdef A_GCC
- A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
- A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
- #else
- A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
- A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
- #endif
-//------------------------------------------------------------------------------------------------------------------------------
- // Square root.
- #ifdef A_GCC
- A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
- A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
- #else
- A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
- A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
- #endif
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-// SCALAR RETURN OPS - DEPENDENT
-//==============================================================================================================================
- A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} // fractional part: a minus its floor
- A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} // a raised to b, computed as exp2(b*log2(a))
- A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} // ARcp applied to the square root of a
- A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} // saturate: AMin(1, AMax(0, a))
- A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-// VECTOR OPS
-//------------------------------------------------------------------------------------------------------------------------------
-// These are added as needed for production or prototyping, so not necessarily a complete set.
-// They follow a convention of taking in a destination and also returning the destination value to increase utility.
-//==============================================================================================================================
- A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} // opAAbs*: d[i] = AAbs(a[i]); returns d
- A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
- A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
- A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
- A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} // opAAdd*: d[i] = a[i] + b[i]; returns d
- A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
- A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
- A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
- A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} // opACpy*: d[i] = a[i]; returns d
- A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
- A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
- A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
- A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} // opALerp*: d[i] = ALerp(a[i], b[i], c[i]) with a per-component factor; returns d
- A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
- A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
- A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
- A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} // opALerpOne*: d[i] = ALerp(a[i], b[i], c) with one scalar factor; returns d
- A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
- A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
- A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
- A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} // opAMax*: d[i] = AMax(a[i], b[i]); returns d
- A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;}
- A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;}
- A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;}
- A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} // opAMin*: d[i] = AMin(a[i], b[i]); returns d
- A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
- A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;}
- A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
- A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} // opAMul*: d[i] = a[i] * b[i]; returns d
- A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
- A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
- A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
- A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} // opAMulOne*: d[i] = a[i] * b with one scalar b; returns d
- A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
- A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
- A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
- A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} // opANeg*: d[i] = -a[i]; returns d
- A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
- A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
- A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
- A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
-//==============================================================================================================================
- A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} // opARcp*: d[i] = ARcp(a[i]); returns d
- A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
- A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
-//------------------------------------------------------------------------------------------------------------------------------
- A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
- A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
- A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-// HALF FLOAT PACKING
-//==============================================================================================================================
- // Convert float to half (in lower 16-bits of output).
- // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
- // Supports denormals.
- // Conversion rules are to make computations possibly "safer" on the GPU, - // -INF & -NaN -> -65504 - // +INF & +NaN -> +65504 - A_STATIC AU1 AU1_AH1_AF1(AF1 f){ - static AW1 base[512]={ - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, - 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, - 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, - 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, - 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, - 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, - 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, - 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, - 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, - 
0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, - 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, - 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, - 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, - 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, - 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, - 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, - 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, - 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, - 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, - 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, - 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, - 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, - 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, - 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, - 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, - 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; - static AB1 shift[512]={ - 
0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, - 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, - 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, - 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, - 
0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; - union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} -//------------------------------------------------------------------------------------------------------------------------------ - // Used to output packed constant. - A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} -#endif - -//============================================================================================================================== -// GLSL -//============================================================================================================================== -#if (defined(A_GLSL) && defined(A_GPU)) || (defined(ASPM_GLSL) && defined(ASPM_GPU)) - #ifndef A_SKIP_EXT - #ifdef A_HALF - #extension GL_EXT_shader_16bit_storage:require - #extension GL_EXT_shader_explicit_arithmetic_types:require - #endif -//------------------------------------------------------------------------------------------------------------------------------ - #ifdef A_LONG - #extension GL_ARB_gpu_shader_int64:require - // TODO: Fixme to more portable extension!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- #extension GL_NV_shader_atomic_int64:require - #endif -//------------------------------------------------------------------------------------------------------------------------------ - #ifdef A_WAVE - #extension GL_KHR_shader_subgroup_arithmetic:require - #extension GL_KHR_shader_subgroup_ballot:require - #extension GL_KHR_shader_subgroup_quad:require - #extension GL_KHR_shader_subgroup_shuffle:require - #endif - #endif -//============================================================================================================================== - #define AP1 bool - #define AP2 bvec2 - #define AP3 bvec3 - #define AP4 bvec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define AF1 float - #define AF2 vec2 - #define AF3 vec3 - #define AF4 vec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define AU1 uint - #define AU2 uvec2 - #define AU3 uvec3 - #define AU4 uvec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define ASU1 int - #define ASU2 ivec2 - #define ASU3 ivec3 - #define ASU4 ivec4 -//============================================================================================================================== - #define AF1_AU1(x) uintBitsToFloat(AU1(x)) - #define AF2_AU2(x) uintBitsToFloat(AU2(x)) - #define AF3_AU3(x) uintBitsToFloat(AU3(x)) - #define AF4_AU4(x) uintBitsToFloat(AU4(x)) -//------------------------------------------------------------------------------------------------------------------------------ - #define AU1_AF1(x) floatBitsToUint(AF1(x)) - #define AU2_AF2(x) floatBitsToUint(AF2(x)) - #define AU3_AF3(x) floatBitsToUint(AF3(x)) - #define AU4_AF4(x) floatBitsToUint(AF4(x)) 
-//------------------------------------------------------------------------------------------------------------------------------ - #define AU1_AH2_AF2 packHalf2x16 - #define AU1_AW2Unorm_AF2 packUnorm2x16 - #define AU1_AB4Unorm_AF4 packUnorm4x8 -//------------------------------------------------------------------------------------------------------------------------------ - #define AF2_AH2_AU1 unpackHalf2x16 - #define AF2_AW2Unorm_AU1 unpackUnorm2x16 - #define AF4_AB4Unorm_AU1 unpackUnorm4x8 -//============================================================================================================================== - AF1 AF1_x(AF1 a){return AF1(a);} - AF2 AF2_x(AF1 a){return AF2(a,a);} - AF3 AF3_x(AF1 a){return AF3(a,a,a);} - AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} - #define AF1_(a) AF1_x(AF1(a)) - #define AF2_(a) AF2_x(AF1(a)) - #define AF3_(a) AF3_x(AF1(a)) - #define AF4_(a) AF4_x(AF1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - AU1 AU1_x(AU1 a){return AU1(a);} - AU2 AU2_x(AU1 a){return AU2(a,a);} - AU3 AU3_x(AU1 a){return AU3(a,a,a);} - AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} - #define AU1_(a) AU1_x(AU1(a)) - #define AU2_(a) AU2_x(AU1(a)) - #define AU3_(a) AU3_x(AU1(a)) - #define AU4_(a) AU4_x(AU1(a)) -//============================================================================================================================== - AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} - AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} - AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} - AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} -//------------------------------------------------------------------------------------------------------------------------------ - AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} - AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} - // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 
'mask=(1<>ASU1(b));} - AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} - AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} - AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// GLSL BYTE -//============================================================================================================================== - #ifdef A_BYTE - #define AB1 uint8_t - #define AB2 u8vec2 - #define AB3 u8vec3 - #define AB4 u8vec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define ASB1 int8_t - #define ASB2 i8vec2 - #define ASB3 i8vec3 - #define ASB4 i8vec4 -//------------------------------------------------------------------------------------------------------------------------------ - AB1 AB1_x(AB1 a){return AB1(a);} - AB2 AB2_x(AB1 a){return AB2(a,a);} - AB3 AB3_x(AB1 a){return AB3(a,a,a);} - AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} - #define AB1_(a) AB1_x(AB1(a)) - #define AB2_(a) AB2_x(AB1(a)) - #define AB3_(a) AB3_x(AB1(a)) - #define AB4_(a) AB4_x(AB1(a)) - #endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ 
-//============================================================================================================================== -// GLSL HALF -//============================================================================================================================== - #ifdef A_HALF - #define AH1 float16_t - #define AH2 f16vec2 - #define AH3 f16vec3 - #define AH4 f16vec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define AW1 uint16_t - #define AW2 u16vec2 - #define AW3 u16vec3 - #define AW4 u16vec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define ASW1 int16_t - #define ASW2 i16vec2 - #define ASW3 i16vec3 - #define ASW4 i16vec4 -//============================================================================================================================== - #define AH2_AU1(x) unpackFloat2x16(AU1(x)) - AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} - #define AH4_AU2(x) AH4_AU2_x(AU2(x)) - #define AW2_AU1(x) unpackUint2x16(AU1(x)) - #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) -//------------------------------------------------------------------------------------------------------------------------------ - #define AU1_AH2(x) packFloat2x16(AH2(x)) - AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} - #define AU2_AH4(x) AU2_AH4_x(AH4(x)) - #define AU1_AW2(x) packUint2x16(AW2(x)) - #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) -//============================================================================================================================== - #define AW1_AH1(x) halfBitsToUint16(AH1(x)) - #define AW2_AH2(x) halfBitsToUint16(AH2(x)) - #define AW3_AH3(x) halfBitsToUint16(AH3(x)) - #define AW4_AH4(x) halfBitsToUint16(AH4(x)) 
-//------------------------------------------------------------------------------------------------------------------------------ - #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) - #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) - #define AH3_AW3(x) uint16BitsToHalf(AW3(x)) - #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) -//============================================================================================================================== - AH1 AH1_x(AH1 a){return AH1(a);} - AH2 AH2_x(AH1 a){return AH2(a,a);} - AH3 AH3_x(AH1 a){return AH3(a,a,a);} - AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} - #define AH1_(a) AH1_x(AH1(a)) - #define AH2_(a) AH2_x(AH1(a)) - #define AH3_(a) AH3_x(AH1(a)) - #define AH4_(a) AH4_x(AH1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AW1_x(AW1 a){return AW1(a);} - AW2 AW2_x(AW1 a){return AW2(a,a);} - AW3 AW3_x(AW1 a){return AW3(a,a,a);} - AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} - #define AW1_(a) AW1_x(AW1(a)) - #define AW2_(a) AW2_x(AW1(a)) - #define AW3_(a) AW3_x(AW1(a)) - #define AW4_(a) AW4_x(AW1(a)) -//============================================================================================================================== - AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} - AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} - AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} - AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 AFractH1(AH1 x){return fract(x);} - AH2 AFractH2(AH2 x){return fract(x);} - AH3 AFractH3(AH3 x){return fract(x);} - AH4 AFractH4(AH4 x){return fract(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} - AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} - AH3 ALerpH3(AH3 
x,AH3 y,AH3 a){return mix(x,y,a);} - AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} -//------------------------------------------------------------------------------------------------------------------------------ - // No packed version of max3. - AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} - AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} - AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} - AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} - AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} - AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} - AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} -//------------------------------------------------------------------------------------------------------------------------------ - // No packed version of min3. 
- AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} - AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} - AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} - AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} - AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} - AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} - AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} - AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} - AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} - AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} - AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} - AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} - AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} - AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} - AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} - AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} - AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} - AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} - AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} - #endif 
-//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// GLSL DOUBLE -//============================================================================================================================== - #ifdef A_DUBL - #define AD1 double - #define AD2 dvec2 - #define AD3 dvec3 - #define AD4 dvec4 -//------------------------------------------------------------------------------------------------------------------------------ - AD1 AD1_x(AD1 a){return AD1(a);} - AD2 AD2_x(AD1 a){return AD2(a,a);} - AD3 AD3_x(AD1 a){return AD3(a,a,a);} - AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} - #define AD1_(a) AD1_x(AD1(a)) - #define AD2_(a) AD2_x(AD1(a)) - #define AD3_(a) AD3_x(AD1(a)) - #define AD4_(a) AD4_x(AD1(a)) -//============================================================================================================================== - AD1 AFractD1(AD1 x){return fract(x);} - AD2 AFractD2(AD2 x){return fract(x);} - AD3 AFractD3(AD3 x){return fract(x);} - AD4 AFractD4(AD4 x){return fract(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} - AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} - AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} - AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} - AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;} - AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} - AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ARsqD1(AD1 x){return 
AD1_(1.0)/sqrt(x);} - AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} - AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} - AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} - AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} - AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} - AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} - #endif -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// GLSL LONG -//============================================================================================================================== - #ifdef A_LONG - #define AL1 uint64_t - #define AL2 u64vec2 - #define AL3 u64vec3 - #define AL4 u64vec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define ASL1 int64_t - #define ASL2 i64vec2 - #define ASL3 i64vec3 - #define ASL4 i64vec4 -//------------------------------------------------------------------------------------------------------------------------------ - #define AL1_AU2(x) packUint2x32(AU2(x)) - #define AU2_AL1(x) unpackUint2x32(AL1(x)) -//------------------------------------------------------------------------------------------------------------------------------ - AL1 AL1_x(AL1 a){return AL1(a);} - AL2 AL2_x(AL1 a){return AL2(a,a);} - AL3 AL3_x(AL1 a){return AL3(a,a,a);} - AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} - #define AL1_(a) AL1_x(AL1(a)) - #define AL2_(a) AL2_x(AL1(a)) - #define AL3_(a) AL3_x(AL1(a)) - #define AL4_(a) AL4_x(AL1(a)) 
-//============================================================================================================================== - AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} - AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} - AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} - AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} -//------------------------------------------------------------------------------------------------------------------------------ - AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} - AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} - AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} - AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} -//------------------------------------------------------------------------------------------------------------------------------ - AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} - AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} - AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} - AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} - #endif -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// WAVE OPERATIONS -//============================================================================================================================== - #ifdef A_WAVE - AF1 AWaveAdd(AF1 v){return subgroupAdd(v);} - AF2 AWaveAdd(AF2 v){return subgroupAdd(v);} - AF3 AWaveAdd(AF3 v){return subgroupAdd(v);} - AF4 AWaveAdd(AF4 v){return subgroupAdd(v);} - #endif -//============================================================================================================================== -#endif - -//============================================================================================================================== -// HLSL 
-//============================================================================================================================== -#if (defined(A_HLSL) && defined(A_GPU)) || (defined(ASPM_HLSL) && defined(ASPM_GPU)) - #define AP1 bool - #define AP2 bool2 - #define AP3 bool3 - #define AP4 bool4 -//------------------------------------------------------------------------------------------------------------------------------ - #define AF1 float - #define AF2 float2 - #define AF3 float3 - #define AF4 float4 -//------------------------------------------------------------------------------------------------------------------------------ - #define AU1 uint - #define AU2 uint2 - #define AU3 uint3 - #define AU4 uint4 -//------------------------------------------------------------------------------------------------------------------------------ - #define ASU1 int - #define ASU2 int2 - #define ASU3 int3 - #define ASU4 int4 -//============================================================================================================================== - #define AF1_AU1(x) asfloat(AU1(x)) - #define AF2_AU2(x) asfloat(AU2(x)) - #define AF3_AU3(x) asfloat(AU3(x)) - #define AF4_AU4(x) asfloat(AU4(x)) -//------------------------------------------------------------------------------------------------------------------------------ - #define AU1_AF1(x) asuint(AF1(x)) - #define AU2_AF2(x) asuint(AF2(x)) - #define AU3_AF3(x) asuint(AF3(x)) - #define AU4_AF4(x) asuint(AF4(x)) -//------------------------------------------------------------------------------------------------------------------------------ - AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} - #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) - #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) -//------------------------------------------------------------------------------------------------------------------------------ - AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} - #define 
AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) -//============================================================================================================================== - AF1 AF1_x(AF1 a){return AF1(a);} - AF2 AF2_x(AF1 a){return AF2(a,a);} - AF3 AF3_x(AF1 a){return AF3(a,a,a);} - AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} - #define AF1_(a) AF1_x(AF1(a)) - #define AF2_(a) AF2_x(AF1(a)) - #define AF3_(a) AF3_x(AF1(a)) - #define AF4_(a) AF4_x(AF1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - AU1 AU1_x(AU1 a){return AU1(a);} - AU2 AU2_x(AU1 a){return AU2(a,a);} - AU3 AU3_x(AU1 a){return AU3(a,a,a);} - AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} - #define AU1_(a) AU1_x(AU1(a)) - #define AU2_(a) AU2_x(AU1(a)) - #define AU3_(a) AU3_x(AU1(a)) - #define AU4_(a) AU4_x(AU1(a)) -//============================================================================================================================== - AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} - AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} - AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} - AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} -//------------------------------------------------------------------------------------------------------------------------------ - AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1<>off)&mask;} - AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} - AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1<>ASU1(b));} - AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} - AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} - AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
-//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// HLSL BYTE -//============================================================================================================================== - #ifdef A_BYTE - #endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// HLSL HALF -//============================================================================================================================== - #ifdef A_HALF - #define AH1 min16float - #define AH2 min16float2 - #define AH3 min16float3 - #define AH4 min16float4 -//------------------------------------------------------------------------------------------------------------------------------ - #define AW1 min16uint - #define AW2 min16uint2 - #define AW3 min16uint3 - #define AW4 min16uint4 -//------------------------------------------------------------------------------------------------------------------------------ - #define ASW1 min16int - #define ASW2 min16int2 - #define ASW3 min16int3 - #define ASW4 min16int4 -//============================================================================================================================== - // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). 
- // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ - AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} - AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} - AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} - AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} - #define AH2_AU1(x) AH2_AU1_x(AU1(x)) - #define AH4_AU2(x) AH4_AU2_x(AU2(x)) - #define AW2_AU1(x) AW2_AU1_x(AU1(x)) - #define AW4_AU2(x) AW4_AU2_x(AU2(x)) -//------------------------------------------------------------------------------------------------------------------------------ - AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} - AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} - AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} - AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} - #define AU1_AH2(x) AU1_AH2_x(AH2(x)) - #define AU2_AH4(x) AU2_AH4_x(AH4(x)) - #define AU1_AW2(x) AU1_AW2_x(AW2(x)) - #define AU2_AW4(x) AU2_AW4_x(AW4(x)) -//============================================================================================================================== - // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - #define AW1_AH1(x) AW1(asuint(AF1(x))) - #define AW2_AH2(x) AW2(asuint(AF2(x))) - #define AW3_AH3(x) AW3(asuint(AF3(x))) - #define AW4_AH4(x) AW4(asuint(AF4(x))) -//------------------------------------------------------------------------------------------------------------------------------ - // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- #define AH1_AW1(x) AH1(asfloat(AU1(x))) - #define AH2_AW2(x) AH2(asfloat(AU2(x))) - #define AH3_AW3(x) AH3(asfloat(AU3(x))) - #define AH4_AW4(x) AH4(asfloat(AU4(x))) -//============================================================================================================================== - AH1 AH1_x(AH1 a){return AH1(a);} - AH2 AH2_x(AH1 a){return AH2(a,a);} - AH3 AH3_x(AH1 a){return AH3(a,a,a);} - AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} - #define AH1_(a) AH1_x(AH1(a)) - #define AH2_(a) AH2_x(AH1(a)) - #define AH3_(a) AH3_x(AH1(a)) - #define AH4_(a) AH4_x(AH1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AW1_x(AW1 a){return AW1(a);} - AW2 AW2_x(AW1 a){return AW2(a,a);} - AW3 AW3_x(AW1 a){return AW3(a,a,a);} - AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} - #define AW1_(a) AW1_x(AW1(a)) - #define AW2_(a) AW2_x(AW1(a)) - #define AW3_(a) AW3_x(AW1(a)) - #define AW4_(a) AW4_x(AW1(a)) -//============================================================================================================================== - AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} - AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} - AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} - AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} -//------------------------------------------------------------------------------------------------------------------------------ - // V_FRACT_F16 (note DX frac() is different). 
- AH1 AFractH1(AH1 x){return x-floor(x);} - AH2 AFractH2(AH2 x){return x-floor(x);} - AH3 AFractH3(AH3 x){return x-floor(x);} - AH4 AFractH4(AH4 x){return x-floor(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} - AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} - AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} - AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} - AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} - AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} - AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} - AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} - AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} - AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} - AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} - AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} - AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} - AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} - AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} - AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} 
-//------------------------------------------------------------------------------------------------------------------------------ - AH1 ARcpH1(AH1 x){return rcp(x);} - AH2 ARcpH2(AH2 x){return rcp(x);} - AH3 ARcpH3(AH3 x){return rcp(x);} - AH4 ARcpH4(AH4 x){return rcp(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ARsqH1(AH1 x){return rsqrt(x);} - AH2 ARsqH2(AH2 x){return rsqrt(x);} - AH3 ARsqH3(AH3 x){return rsqrt(x);} - AH4 ARsqH4(AH4 x){return rsqrt(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ASatH1(AH1 x){return saturate(x);} - AH2 ASatH2(AH2 x){return saturate(x);} - AH3 ASatH3(AH3 x){return saturate(x);} - AH4 ASatH4(AH4 x){return saturate(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} - AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} - AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} - AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} - #endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// HLSL DOUBLE -//============================================================================================================================== - #ifdef A_DUBL - #define AD1 double - #define AD2 double2 - #define AD3 double3 - #define AD4 double4 
-//------------------------------------------------------------------------------------------------------------------------------ - AD1 AD1_x(AD1 a){return AD1(a);} - AD2 AD2_x(AD1 a){return AD2(a,a);} - AD3 AD3_x(AD1 a){return AD3(a,a,a);} - AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} - #define AD1_(a) AD1_x(AD1(a)) - #define AD2_(a) AD2_x(AD1(a)) - #define AD3_(a) AD3_x(AD1(a)) - #define AD4_(a) AD4_x(AD1(a)) -//============================================================================================================================== - AD1 AFractD1(AD1 a){return a-floor(a);} - AD2 AFractD2(AD2 a){return a-floor(a);} - AD3 AFractD3(AD3 a){return a-floor(a);} - AD4 AFractD4(AD4 a){return a-floor(a);} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} - AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} - AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} - AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ARcpD1(AD1 x){return rcp(x);} - AD2 ARcpD2(AD2 x){return rcp(x);} - AD3 ARcpD3(AD3 x){return rcp(x);} - AD4 ARcpD4(AD4 x){return rcp(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ARsqD1(AD1 x){return rsqrt(x);} - AD2 ARsqD2(AD2 x){return rsqrt(x);} - AD3 ARsqD3(AD3 x){return rsqrt(x);} - AD4 ARsqD4(AD4 x){return rsqrt(x);} -//------------------------------------------------------------------------------------------------------------------------------ - AD1 ASatD1(AD1 x){return saturate(x);} - AD2 ASatD2(AD2 x){return saturate(x);} - AD3 ASatD3(AD3 x){return saturate(x);} - AD4 ASatD4(AD4 x){return saturate(x);} - #endif 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// HLSL LONG -//============================================================================================================================== - #ifdef A_LONG - #endif -//============================================================================================================================== -#endif - -//============================================================================================================================== -// GPU COMMON -//============================================================================================================================== -#if defined(A_GPU) || (defined(ASPM_GPU) && !(defined(ASPM_OPENCL))) - // Negative and positive infinity. - #define A_INFN_F AF1_AU1(0x7f800000u) - #define A_INFP_F AF1_AU1(0xff800000u) -//------------------------------------------------------------------------------------------------------------------------------ - // Copy sign from 's' to positive 'd'. 
- AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} - AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} - AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} - AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} -//------------------------------------------------------------------------------------------------------------------------------ - // Single operation to return (useful to create a mask to use in lerp for branch free logic), - // m=NaN := 0 - // m>=0 := 0 - // m<0 := 1 - // Uses the following useful floating point logic, - // saturate(+a*(-INF)==-INF) := 0 - // saturate( 0*(-INF)== NaN) := 0 - // saturate(-a*(-INF)==+INF) := 1 - AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} - AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} - AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} - AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} -//============================================================================================================================== - #ifdef A_HALF - #define A_INFN_H AH1_AW1(0x7c00u) - #define A_INFP_H AH1_AW1(0xfc00u) -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} - AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} - AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} - AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} -//------------------------------------------------------------------------------------------------------------------------------ - AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} - AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} - AH3 ASignedH3(AH3 m){return 
ASatH3(m*AH3_(A_INFN_H));} - AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} - #endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// HALF APPROXIMATIONS -//------------------------------------------------------------------------------------------------------------------------------ -// These support only positive inputs. -// Did not see value yet in specialization for range. -// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. -// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. -// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. -// And co-execution would require a compiler interleaving a lot of independent work for packed usage. -//------------------------------------------------------------------------------------------------------------------------------ -// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). -// Same with sqrt(), as this could be x*rsq() (7 ops). -//------------------------------------------------------------------------------------------------------------------------------ -// IDEAS -// ===== -// - Polaris hardware has 16-bit support, but non-double rate. -// Could be possible still get part double rate for some of this logic, -// by clearing out the lower half's sign when necessary and using 32-bit ops... 
-//============================================================================================================================== - #ifdef A_HALF - // Minimize squared error across full positive range, 2 ops. - // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output. - AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));} - AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));} -//------------------------------------------------------------------------------------------------------------------------------ - // Lower precision estimation, 1 op. - // Minimize squared error across {smallest normal to 16384.0}. - AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));} - AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));} -//------------------------------------------------------------------------------------------------------------------------------ - // Medium precision estimation, one Newton Raphson iteration, 3 ops. - AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));} - AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));} -//------------------------------------------------------------------------------------------------------------------------------ - // Minimize squared error across {smallest normal to 16384.0}, 2 ops. 
- AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));} - AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));} - #endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// FLOAT APPROXIMATIONS -//------------------------------------------------------------------------------------------------------------------------------ -// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN", -// - Idea dates back to SGI, then to Quake 3, etc. -// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf -// - sqrt(x)=rsqrt(x)*x -// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x -// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h -//------------------------------------------------------------------------------------------------------------------------------ -// These below are from perhaps less complete searching for optimal. -// Used FP16 normal range for testing with +4096 32-bit step size for sampling error. -// So these match up well with the half approximations. 
-//============================================================================================================================== - AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} - AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} - AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} - AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// PARABOLIC SIN & COS -//------------------------------------------------------------------------------------------------------------------------------ -// Approximate answers to transcendental questions. -//------------------------------------------------------------------------------------------------------------------------------ -// TODO -// ==== -// - Verify packed math ABS is correctly doing an AND. -//============================================================================================================================== - // Valid input range is {-1 to 1} representing {0 to 2 pi}. - // Output range is {-1/4 to -1/4} representing {-1 to 1}. - AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. 
- AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT -//------------------------------------------------------------------------------------------------------------------------------ - #ifdef A_HALF - // For a packed {sin,cos} pair, - // - Native takes 16 clocks and 4 issue slots (no packed transcendentals). - // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed). - AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA - AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND - #endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// COLOR CONVERSIONS -//------------------------------------------------------------------------------------------------------------------------------ -// These are all linear to/from some other space (where 'linear' has been shortened out of the function name). -// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'. -// These are branch free implementations. -// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion. -//------------------------------------------------------------------------------------------------------------------------------ -// TRANSFER FUNCTIONS -// ================== -// 709 ..... Rec709 used for some HDTVs -// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native -// Pq ...... 
PQ native for HDR10 -// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type -// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations) -//------------------------------------------------------------------------------------------------------------------------------ -// FOR PQ -// ====== -// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2. -// All constants are only specified to FP32 precision. -// External PQ source reference, -// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl -//------------------------------------------------------------------------------------------------------------------------------ -// PACKED VERSIONS -// =============== -// These are the A*H2() functions. -// There is no PQ functions as FP16 seemed to not have enough precision for the conversion. -// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors. -// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least). -//------------------------------------------------------------------------------------------------------------------------------ -// NOTES -// ===== -// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case. -//============================================================================================================================== - AF1 ATo709F1(AF1 c){return max(min(c*AF1_(4.5),AF1_(0.018)),AF1_(1.099)*pow(c,AF1_(0.45))-AF1_(0.099));} -//------------------------------------------------------------------------------------------------------------------------------ - // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma(). 
- AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,rcpX);} -//------------------------------------------------------------------------------------------------------------------------------ - AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); - return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));} -//------------------------------------------------------------------------------------------------------------------------------ - AF1 AToSrgbF1(AF1 c){return max(min(c*AF1_(12.92),AF1_(0.0031308)),AF1_(1.055)*pow(c,AF1_(0.41666))-AF1_(0.055));} -//------------------------------------------------------------------------------------------------------------------------------ - AF1 AToTwoF1(AF1 c){return sqrt(c);} -//============================================================================================================================== - AF1 AFrom709F1(AF1 c){return max(min(c*AF1_(1.0/4.5),AF1_(0.081)), - pow((c+AF1_(0.099))*(AF1_(1.0)/(AF1_(1.099))),AF1_(1.0/0.45)));} -//------------------------------------------------------------------------------------------------------------------------------ - AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,x);} -//------------------------------------------------------------------------------------------------------------------------------ - AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); - return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));} -//------------------------------------------------------------------------------------------------------------------------------ - AF1 AFromSrgbF1(AF1 c){return max(min(c*AF1_(1.0/12.92),AF1_(0.04045)), - pow((c+AF1_(0.055))*(AF1_(1.0)/AF1_(1.055)),AF1_(2.4)));} -//------------------------------------------------------------------------------------------------------------------------------ - AF1 AFromTwoF1(AF1 c){return c*c;} 
-//============================================================================================================================== - #ifdef A_HALF - AH2 ATo709H2(AH2 c){return max(min(c*AH2_(4.5),AH2_(0.018)),AH2_(1.099)*pow(c,AH2_(0.45))-AH2_(0.099));} -//------------------------------------------------------------------------------------------------------------------------------ - AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} -//------------------------------------------------------------------------------------------------------------------------------ - AH2 AToSrgbH2(AH2 c){return max(min(c*AH2_(12.92),AH2_(0.0031308)),AH2_(1.055)*pow(c,AH2_(0.41666))-AH2_(0.055));} -//------------------------------------------------------------------------------------------------------------------------------ - AH2 AToTwoH2(AH2 c){return sqrt(c);} - #endif -//============================================================================================================================== - #ifdef A_HALF - AH2 AFrom709H2(AH2 c){return max(min(c*AH2_(1.0/4.5),AH2_(0.081)), - pow((c+AH2_(0.099))*(AH2_(1.0)/(AH2_(1.099))),AH2_(1.0/0.45)));} -//------------------------------------------------------------------------------------------------------------------------------ - AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));} -//------------------------------------------------------------------------------------------------------------------------------ - AH2 AFromSrgbH2(AH2 c){return max(min(c*AH2_(1.0/12.92),AH2_(0.04045)), - pow((c+AH2_(0.055))*(AH2_(1.0)/AH2_(1.055)),AH2_(2.4)));} -//------------------------------------------------------------------------------------------------------------------------------ - AH2 AFromTwoH2(AH2 c){return c*c;} - #endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// CS REMAP -//============================================================================================================================== - // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. - // 543210 - // ====== - // ..xxx. - // yy...y - AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} -//============================================================================================================================== - // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. - // 543210 - // ====== - // .xx..x - // y..yy. - // Details, - // LANE TO 8x8 MAPPING - // =================== - // 00 01 08 09 10 11 18 19 - // 02 03 0a 0b 12 13 1a 1b - // 04 05 0c 0d 14 15 1c 1d - // 06 07 0e 0f 16 17 1e 1f - // 20 21 28 29 30 31 38 39 - // 22 23 2a 2b 32 33 3a 3b - // 24 25 2c 2d 34 35 3c 3d - // 26 27 2e 2f 36 37 3e 3f - AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} -#endif - -//============================================================================================================================== -// GPU/CPU PORTABILITY -//------------------------------------------------------------------------------------------------------------------------------ -// This is the GPU implementation. -// See the CPU implementation for docs. 
-//============================================================================================================================== -#if defined(A_GPU) || (defined(ASPM_GPU) && !(defined(ASPM_OPENCL))) - #define A_TRUE true - #define A_FALSE false - #define A_STATIC -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY -//============================================================================================================================== - #define retAD2 AD2 - #define retAD3 AD3 - #define retAD4 AD4 - #define retAF2 AF2 - #define retAF3 AF3 - #define retAF4 AF4 - #define retAL2 AL2 - #define retAL3 AL3 - #define retAL4 AL4 - #define retAU2 AU2 - #define retAU3 AU3 - #define retAU4 AU4 -//------------------------------------------------------------------------------------------------------------------------------ - #define inAD2 in AD2 - #define inAD3 in AD3 - #define inAD4 in AD4 - #define inAF2 in AF2 - #define inAF3 in AF3 - #define inAF4 in AF4 - #define inAL2 in AL2 - #define inAL3 in AL3 - #define inAL4 in AL4 - #define inAU2 in AU2 - #define inAU3 in AU3 - #define inAU4 in AU4 -//------------------------------------------------------------------------------------------------------------------------------ - #define inoutAD2 inout AD2 - #define inoutAD3 inout AD3 - #define inoutAD4 inout AD4 - #define inoutAF2 inout AF2 - #define inoutAF3 inout AF3 - #define inoutAF4 inout AF4 - #define inoutAL2 inout AL2 - #define inoutAL3 inout AL3 - #define inoutAL4 inout AL4 - #define inoutAU2 inout AU2 - #define inoutAU3 inout AU3 - #define inoutAU4 inout AU4 -//------------------------------------------------------------------------------------------------------------------------------ - #define outAD2 out AD2 
- #define outAD3 out AD3 - #define outAD4 out AD4 - #define outAF2 out AF2 - #define outAF3 out AF3 - #define outAF4 out AF4 - #define outAL2 out AL2 - #define outAL3 out AL3 - #define outAL4 out AL4 - #define outAU2 out AU2 - #define outAU3 out AU3 - #define outAU4 out AU4 -//------------------------------------------------------------------------------------------------------------------------------ - #define varAD2(x) AD2 x - #define varAD3(x) AD3 x - #define varAD4(x) AD4 x - #define varAF2(x) AF2 x - #define varAF3(x) AF3 x - #define varAF4(x) AF4 x - #define varAL2(x) AL2 x - #define varAL3(x) AL3 x - #define varAL4(x) AL4 x - #define varAU2(x) AU2 x - #define varAU3(x) AU3 x - #define varAU4(x) AU4 x -//------------------------------------------------------------------------------------------------------------------------------ - #define initAD2(x,y) AD2(x,y) - #define initAD3(x,y,z) AD3(x,y,z) - #define initAD4(x,y,z,w) AD4(x,y,z,w) - #define initAF2(x,y) AF2(x,y) - #define initAF3(x,y,z) AF3(x,y,z) - #define initAF4(x,y,z,w) AF4(x,y,z,w) - #define initAL2(x,y) AL2(x,y) - #define initAL3(x,y,z) AL3(x,y,z) - #define initAL4(x,y,z,w) AL4(x,y,z,w) - #define initAU2(x,y) AU2(x,y) - #define initAU3(x,y,z) AU3(x,y,z) - #define initAU4(x,y,z,w) AU4(x,y,z,w) -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// SCALAR RETURN OPS -//============================================================================================================================== - #define AAbsD1(a) abs(AD1(a)) - #define AAbsF1(a) abs(AF1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - #define ACosD1(a) cos(AD1(a)) - #define ACosF1(a) cos(AF1(a)) 
-//------------------------------------------------------------------------------------------------------------------------------ - #define ADotD2(a,b) dot(AD2(a),AD2(b)) - #define ADotD3(a,b) dot(AD3(a),AD3(b)) - #define ADotD4(a,b) dot(AD4(a),AD4(b)) - #define ADotF2(a,b) dot(AF2(a),AF2(b)) - #define ADotF3(a,b) dot(AF3(a),AF3(b)) - #define ADotF4(a,b) dot(AF4(a),AF4(b)) -//------------------------------------------------------------------------------------------------------------------------------ - #define AExp2D1(a) exp2(AD1(a)) - #define AExp2F1(a) exp2(AF1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - #define AFloorD1(a) floor(AD1(a)) - #define AFloorF1(a) floor(AF1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - #define ALog2D1(a) log2(AD1(a)) - #define ALog2F1(a) log2(AF1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - #define AMaxD1(a,b) min(a,b) - #define AMaxF1(a,b) min(a,b) - #define AMaxL1(a,b) min(a,b) - #define AMaxU1(a,b) min(a,b) -//------------------------------------------------------------------------------------------------------------------------------ - #define AMinD1(a,b) min(a,b) - #define AMinF1(a,b) min(a,b) - #define AMinL1(a,b) min(a,b) - #define AMinU1(a,b) min(a,b) -//------------------------------------------------------------------------------------------------------------------------------ - #define ASinD1(a) sin(AD1(a)) - #define ASinF1(a) sin(AF1(a)) -//------------------------------------------------------------------------------------------------------------------------------ - #define ASqrtD1(a) sqrt(AD1(a)) - #define ASqrtF1(a) sqrt(AF1(a)) 
-//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// SCALAR RETURN OPS - DEPENDENT -//============================================================================================================================== - #define APowD1(a,b) pow(AD1(a),AF1(b)) - #define APowF1(a,b) pow(AF1(a),AF1(b)) -//_____________________________________________________________/\_______________________________________________________________ -//============================================================================================================================== -// VECTOR OPS -//------------------------------------------------------------------------------------------------------------------------------ -// These are added as needed for production or prototyping, so not necessarily a complete set. -// They follow a convention of taking in a destination and also returning the destination value to increase utility. 
-//============================================================================================================================== - #ifdef A_DUBL - AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} - AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} - AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} - AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} - AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} - AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} - AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} - AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} - AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} - AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} - AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} - AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} - AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} 
-//------------------------------------------------------------------------------------------------------------------------------ - AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} - AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} - AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} - AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} - AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} - AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} - AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} - AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} - AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} - AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} - AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} - #endif -//============================================================================================================================== - AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} - AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} - AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 
b){d=a+b;return d;} - AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} - AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} - AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} - AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} - AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} - AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} - AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} - AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} - AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} - AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} - AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} - AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} - AF3 
opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} - AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} - AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} - AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} - AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} - AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} -//------------------------------------------------------------------------------------------------------------------------------ - AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} - AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} - AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} -#endif -#endif // USE_CMP_FIDELITY_FX_H - -#endif // Common_Def.h diff --git a/WickedEngine/shaders/terrainVirtualTextureUpdateCS.hlsl b/WickedEngine/shaders/terrainVirtualTextureUpdateCS.hlsl index ce2004e3d..d1d82d647 100644 --- a/WickedEngine/shaders/terrainVirtualTextureUpdateCS.hlsl +++ b/WickedEngine/shaders/terrainVirtualTextureUpdateCS.hlsl @@ -1,11 +1,6 @@ #include "globals.hlsli" - -#pragma dxc diagnostic push -#pragma dxc diagnostic ignored "-Wambig-lit-shift" -#pragma dxc diagnostic ignored "-Wunused-value" - -#define ASPM_HLSL -#include "compressonator/bcn_common_kernel.h" +#include "BlockCompress.hlsli" +#include "ColorSpaceUtility.hlsli" static const uint region_count = 4; @@ -24,7 +19,7 @@ RWTexture2D output : register(u0); RWTexture2D output : register(u0); #endif // UPDATE_NORMALMAP -static const uint2 block_offsets[BLOCK_SIZE_4X4] = { +static const uint2 block_offsets[16] = { uint2(0, 0), uint2(1, 0), uint2(2, 0), uint2(3, 0), uint2(0, 1), uint2(1, 1), 
uint2(2, 1), uint2(3, 1), uint2(0, 2), uint2(1, 2), uint2(2, 2), uint2(3, 2), @@ -40,20 +35,20 @@ void main(uint3 DTid : SV_DispatchThreadID) Texture2D region_weights_texture = bindless_textures[push.region_weights_textureRO]; #ifdef UPDATE_BASECOLORMAP - float3 block_rgb[BLOCK_SIZE_4X4]; + float3 block_rgb[16]; #endif // UPDATE_BASECOLORMAP #ifdef UPDATE_NORMALMAP - float block_x[BLOCK_SIZE_4X4]; - float block_y[BLOCK_SIZE_4X4]; + float block_x[16]; + float block_y[16]; #endif // UPDATE_NORMALMAP #ifdef UPDATE_SURFACEMAP - float3 block_rgb[BLOCK_SIZE_4X4]; - float block_a[BLOCK_SIZE_4X4]; + float3 block_rgb[16]; + float block_a[16]; #endif // UPDATE_SURFACEMAP - for(uint idx = 0; idx < BLOCK_SIZE_4X4; ++idx) + for(uint idx = 0; idx < 16; ++idx) { const uint2 block_offset = block_offsets[idx]; const int2 pixel = push.offset + DTid.xy * 4 + block_offset; @@ -127,7 +122,7 @@ void main(uint3 DTid : SV_DispatchThreadID) total_color /= weight_sum; #ifdef UPDATE_BASECOLORMAP - block_rgb[idx] = total_color.rgb; + block_rgb[idx] = ApplySRGBCurve_Fast(total_color.rgb); #endif // UPDATE_BASECOLORMAP #ifdef UPDATE_NORMALMAP @@ -144,16 +139,15 @@ void main(uint3 DTid : SV_DispatchThreadID) const uint2 write_coord = push.write_offset + DTid.xy; #ifdef UPDATE_BASECOLORMAP - output[write_coord] = CompressBlockBC1_UNORM(block_rgb, CMP_QUALITY0, /*isSRGB =*/ true); + output[write_coord] = CompressBC1Block(block_rgb); #endif // UPDATE_BASECOLORMAP #ifdef UPDATE_NORMALMAP - output[write_coord] = CompressBlockBC5_UNORM(block_x, block_y, CMP_QUALITY0); + output[write_coord] = CompressBC5Block(block_x, block_y); #endif // UPDATE_NORMALMAP #ifdef UPDATE_SURFACEMAP - output[write_coord] = CompressBlockBC3_UNORM(block_rgb, block_a, CMP_QUALITY2, /*isSRGB =*/ false); + output[write_coord] = CompressBC3Block(block_rgb, block_a); #endif // UPDATE_SURFACEMAP } -#pragma dxc diagnostic pop diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index 1cbb1b336..a4807e31f 100644 --- 
a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wi::version // minor features, major updates, breaking compatibility changes const int minor = 71; // minor bug fixes, alterations, refactors, updates - const int revision = 211; + const int revision = 212; const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision); diff --git a/third_party_software.txt b/third_party_software.txt index 1b5dcd8ff..1ea62f449 100644 --- a/third_party_software.txt +++ b/third_party_software.txt @@ -510,33 +510,6 @@ OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR SOFTWARE. -############################################################################################################################### - -Compressonator: https://github.com/GPUOpen-Tools/compressonator - -//=============================================================================== -// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files(the "Software"), to deal -// in the Software without restriction, including without limitation the rights to -// use, copy, modify, merge, publish, distribute, sublicense, and / or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions : -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. -// -//=============================================================================== - ############################################################################################################################### pugixml: https://github.com/zeux/pugixml