From 4fcf2020af6047efeb085f6d40ee709fa1e2aa51 Mon Sep 17 00:00:00 2001 From: Turanszki Janos Date: Sat, 13 Jun 2020 19:38:49 +0100 Subject: [PATCH] added linux build --- WickedEngine.sln | 27 + WickedEngine/CommonInclude.h | 16 + WickedEngine/LUA/luac.c | 449 - WickedEngine/MainComponent.cpp | 18 +- WickedEngine/Utility/D3D12MemAlloc.cpp | 4 + WickedEngine/Utility/DirectXCollision.h | 362 + WickedEngine/Utility/DirectXCollision.inl | 4787 +++++ WickedEngine/Utility/DirectXColors.h | 166 + WickedEngine/Utility/DirectXMath.h | 2270 +++ WickedEngine/Utility/DirectXMathCommon.h | 75 + WickedEngine/Utility/DirectXMathConvert.inl | 2181 +++ WickedEngine/Utility/DirectXMathMatrix.inl | 3317 ++++ WickedEngine/Utility/DirectXMathMisc.inl | 2516 +++ WickedEngine/Utility/DirectXMathVector.inl | 14643 ++++++++++++++++ WickedEngine/Utility/DirectXPackedVector.h | 1203 ++ WickedEngine/Utility/DirectXPackedVector.inl | 4621 +++++ WickedEngine/Utility/sal.h | 2961 ++++ WickedEngine/WickedEngine_Linux.vcxproj | 211 + WickedEngine/WickedEngine_SOURCE.vcxitems | 16 +- .../WickedEngine_SOURCE.vcxitems.filters | 42 +- WickedEngine/WickedEngine_Windows.vcxproj | 2 + WickedEngine/wiAudio.cpp | 30 + WickedEngine/wiBackLog.cpp | 8 +- WickedEngine/wiGraphics.h | 36 +- WickedEngine/wiGraphicsDevice.cpp | 2 + WickedEngine/wiGraphicsDevice_DX11.cpp | 5 + WickedEngine/wiGraphicsDevice_DX11.h | 8 + WickedEngine/wiGraphicsDevice_DX12.cpp | 5 + WickedEngine/wiGraphicsDevice_DX12.h | 8 + WickedEngine/wiHelper.cpp | 106 +- WickedEngine/wiInput.cpp | 13 +- WickedEngine/wiLua.cpp | 52 +- WickedEngine/wiLua.h | 12 +- WickedEngine/wiLuna.h | 3 +- WickedEngine/wiNetwork_Linux.cpp | 45 + WickedEngine/wiNetwork_UWP.cpp | 4 +- WickedEngine/wiNetwork_Windows.cpp | 4 +- WickedEngine/wiPlatform.h | 6 + WickedEngine/wiRectPacker.cpp | 3 +- WickedEngine/wiRectPacker.h | 2 + WickedEngine/wiScene.cpp | 1 - WickedEngine/wiScene_Serializers.cpp | 2 +- WickedEngine/wiTimer.cpp | 10 +- WickedEngine/wiTimer.h | 3 - 
WickedEngine/wiVersion.cpp | 2 +- WickedEngine/wiWidget.cpp | 2 - 46 files changed, 39647 insertions(+), 612 deletions(-) delete mode 100644 WickedEngine/LUA/luac.c create mode 100644 WickedEngine/Utility/DirectXCollision.h create mode 100644 WickedEngine/Utility/DirectXCollision.inl create mode 100644 WickedEngine/Utility/DirectXColors.h create mode 100644 WickedEngine/Utility/DirectXMath.h create mode 100644 WickedEngine/Utility/DirectXMathCommon.h create mode 100644 WickedEngine/Utility/DirectXMathConvert.inl create mode 100644 WickedEngine/Utility/DirectXMathMatrix.inl create mode 100644 WickedEngine/Utility/DirectXMathMisc.inl create mode 100644 WickedEngine/Utility/DirectXMathVector.inl create mode 100644 WickedEngine/Utility/DirectXPackedVector.h create mode 100644 WickedEngine/Utility/DirectXPackedVector.inl create mode 100644 WickedEngine/Utility/sal.h create mode 100644 WickedEngine/WickedEngine_Linux.vcxproj create mode 100644 WickedEngine/wiNetwork_Linux.cpp diff --git a/WickedEngine.sln b/WickedEngine.sln index d24b8e278..4f0324a0c 100644 --- a/WickedEngine.sln +++ b/WickedEngine.sln @@ -31,6 +31,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WickedEngine_SHADERS", "Wic EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Content", "Content.vcxitems", "{C48F6BFF-F91B-4DB5-98B5-15287DFB7C95}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WickedEngine_Linux", "WickedEngine\WickedEngine_Linux.vcxproj", "{D294C41D-D886-4B95-9FD6-EE13EEE8D976}" +EndProject Global GlobalSection(SharedMSBuildProjectFiles) = preSolution WickedEngine\WickedEngine_SHADERS.vcxitems*{06163dcb-b183-4ed9-9c62-13ef1658e049}*SharedItemsImports = 4 @@ -42,6 +44,7 @@ Global WickedEngine\WickedEngine_SHADERS.vcxitems*{92e86448-0724-4387-abac-96e63edf4190}*SharedItemsImports = 9 WickedEngine\WickedEngine_SHADERS.vcxitems*{c222218b-b6d1-406b-b2c0-8c1ced4a8d19}*SharedItemsImports = 4 
Content.vcxitems*{c48f6bff-f91b-4db5-98b5-15287dfb7c95}*SharedItemsImports = 9 + WickedEngine\WickedEngine_SOURCE.vcxitems*{d294c41d-d886-4b95-9fd6-ee13eee8d976}*SharedItemsImports = 4 Content.vcxitems*{fa78bfad-4b23-4a6b-92fa-a48ce56bed03}*SharedItemsImports = 4 Editor\Editor_SOURCE.vcxitems*{fa78bfad-4b23-4a6b-92fa-a48ce56bed03}*SharedItemsImports = 4 WickedEngine\WickedEngine_SHADERS.vcxitems*{fa78bfad-4b23-4a6b-92fa-a48ce56bed03}*SharedItemsImports = 4 @@ -169,6 +172,30 @@ Global {60DA258F-E95F-4CF4-A46B-17D80644464B}.Release|Win32.Build.0 = Release|Win32 {60DA258F-E95F-4CF4-A46B-17D80644464B}.Release|x64.ActiveCfg = Release|x64 {60DA258F-E95F-4CF4-A46B-17D80644464B}.Release|x64.Build.0 = Release|x64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|ARM.ActiveCfg = Debug|ARM + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|ARM.Build.0 = Debug|ARM + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|ARM.Deploy.0 = Debug|ARM + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|ARM64.Build.0 = Debug|ARM64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|ARM64.Deploy.0 = Debug|ARM64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|Win32.ActiveCfg = Debug|x86 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|Win32.Build.0 = Debug|x86 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|Win32.Deploy.0 = Debug|x86 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|x64.ActiveCfg = Debug|x64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|x64.Build.0 = Debug|x64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Debug|x64.Deploy.0 = Debug|x64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|ARM.ActiveCfg = Release|ARM + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|ARM.Build.0 = Release|ARM + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|ARM.Deploy.0 = Release|ARM + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|ARM64.ActiveCfg = Release|ARM64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|ARM64.Build.0 = Release|ARM64 + 
{D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|ARM64.Deploy.0 = Release|ARM64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|Win32.ActiveCfg = Release|x86 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|Win32.Build.0 = Release|x86 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|Win32.Deploy.0 = Release|x86 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|x64.ActiveCfg = Release|x64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|x64.Build.0 = Release|x64 + {D294C41D-D886-4B95-9FD6-EE13EEE8D976}.Release|x64.Deploy.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/WickedEngine/CommonInclude.h b/WickedEngine/CommonInclude.h index 2ba3250e2..3ba866345 100644 --- a/WickedEngine/CommonInclude.h +++ b/WickedEngine/CommonInclude.h @@ -4,10 +4,26 @@ // This is a helper include file pasted into all engine headers try to keep it minimal! // Do not include engine features in this file! +#include +#include #include +#if __has_include("DirectXMath.h") +// In this case, DirectXMath is coming from Windows SDK. 
+// It is better to use this on Windows as some Windows libraries could depend on the same +// DirectXMath headers #include #include +#include +#else +// In this case, DirectXMath is coming from supplied source code +// On platforms that don't have Windows SDK, the source code for DirectXMath is provided +// as part of the engine utilities +#include "Utility/DirectXMath.h" +#include "Utility/DirectXPackedVector.h" +#include "Utility/DirectXCollision.h" +#endif + using namespace DirectX; using namespace DirectX::PackedVector; static const XMFLOAT4X4 IDENTITYMATRIX = XMFLOAT4X4(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1); diff --git a/WickedEngine/LUA/luac.c b/WickedEngine/LUA/luac.c deleted file mode 100644 index c0c91d017..000000000 --- a/WickedEngine/LUA/luac.c +++ /dev/null @@ -1,449 +0,0 @@ -/* -** $Id: luac.c,v 1.75 2015/03/12 01:58:27 lhf Exp $ -** Lua compiler (saves bytecodes to files; also lists bytecodes) -** See Copyright Notice in lua.h -*/ - -#define luac_c -#define LUA_CORE - -#include "lprefix.h" - -#include -#include -#include -#include -#include - -#include "lua.h" -#include "lauxlib.h" - -#include "lobject.h" -#include "lstate.h" -#include "lundump.h" - -static void PrintFunction(const Proto* f, int full); -#define luaU_print PrintFunction - -#define PROGNAME "luac" /* default program name */ -#define OUTPUT PROGNAME ".out" /* default output file */ - -static int listing=0; /* list bytecodes? */ -static int dumping=1; /* dump bytecodes? */ -static int stripping=0; /* strip debug information? 
*/ -static char Output[]={ OUTPUT }; /* default output file name */ -static const char* output=Output; /* actual output file name */ -static const char* progname=PROGNAME; /* actual program name */ - -static void fatal(const char* message) -{ - fprintf(stderr,"%s: %s\n",progname,message); - exit(EXIT_FAILURE); -} - -static void cannot(const char* what) -{ - fprintf(stderr,"%s: cannot %s %s: %s\n",progname,what,output,strerror(errno)); - exit(EXIT_FAILURE); -} - -static void usage(const char* message) -{ - if (*message=='-') - fprintf(stderr,"%s: unrecognized option '%s'\n",progname,message); - else - fprintf(stderr,"%s: %s\n",progname,message); - fprintf(stderr, - "usage: %s [options] [filenames]\n" - "Available options are:\n" - " -l list (use -l -l for full listing)\n" - " -o name output to file 'name' (default is \"%s\")\n" - " -p parse only\n" - " -s strip debug information\n" - " -v show version information\n" - " -- stop handling options\n" - " - stop handling options and process stdin\n" - ,progname,Output); - exit(EXIT_FAILURE); -} - -#define IS(s) (strcmp(argv[i],s)==0) - -static int doargs(int argc, char* argv[]) -{ - int i; - int version=0; - if (argv[0]!=NULL && *argv[0]!=0) progname=argv[0]; - for (i=1; itop+(i)) - -static const Proto* combine(lua_State* L, int n) -{ - if (n==1) - return toproto(L,-1); - else - { - Proto* f; - int i=n; - if (lua_load(L,reader,&i,"=(" PROGNAME ")",NULL)!=LUA_OK) fatal(lua_tostring(L,-1)); - f=toproto(L,-1); - for (i=0; ip[i]=toproto(L,i-n-1); - if (f->p[i]->sizeupvalues>0) f->p[i]->upvalues[0].instack=0; - } - f->sizelineinfo=0; - return f; - } -} - -static int writer(lua_State* L, const void* p, size_t size, void* u) -{ - UNUSED(L); - return (fwrite(p,size,1,(FILE*)u)!=1) && (size!=0); -} - -static int pmain(lua_State* L) -{ - int argc=(int)lua_tointeger(L,1); - char** argv=(char**)lua_touserdata(L,2); - const Proto* f; - int i; - if (!lua_checkstack(L,argc)) fatal("too many input files"); - for (i=0; i1); - if 
(dumping) - { - FILE* D= (output==NULL) ? stdout : fopen(output,"wb"); - if (D==NULL) cannot("open"); - lua_lock(L); - luaU_dump(L,f,writer,D,stripping); - lua_unlock(L); - if (ferror(D)) cannot("write"); - if (fclose(D)) cannot("close"); - } - return 0; -} - -int main(int argc, char* argv[]) -{ - lua_State* L; - int i=doargs(argc,argv); - argc-=i; argv+=i; - if (argc<=0) usage("no input files given"); - L=luaL_newstate(); - if (L==NULL) fatal("cannot create state: not enough memory"); - lua_pushcfunction(L,&pmain); - lua_pushinteger(L,argc); - lua_pushlightuserdata(L,argv); - if (lua_pcall(L,2,0,0)!=LUA_OK) fatal(lua_tostring(L,-1)); - lua_close(L); - return EXIT_SUCCESS; -} - -/* -** $Id: luac.c,v 1.75 2015/03/12 01:58:27 lhf Exp $ -** print bytecodes -** See Copyright Notice in lua.h -*/ - -#include -#include - -#define luac_c -#define LUA_CORE - -#include "ldebug.h" -#include "lobject.h" -#include "lopcodes.h" - -#define VOID(p) ((const void*)(p)) - -static void PrintString(const TString* ts) -{ - const char* s=getstr(ts); - size_t i,n=tsslen(ts); - printf("%c",'"'); - for (i=0; ik[i]; - switch (ttype(o)) - { - case LUA_TNIL: - printf("nil"); - break; - case LUA_TBOOLEAN: - printf(bvalue(o) ? "true" : "false"); - break; - case LUA_TNUMFLT: - { - char buff[100]; - sprintf(buff,LUA_NUMBER_FMT,fltvalue(o)); - printf("%s",buff); - if (buff[strspn(buff,"-0123456789")]=='\0') printf(".0"); - break; - } - case LUA_TNUMINT: - printf(LUA_INTEGER_FMT,ivalue(o)); - break; - case LUA_TSHRSTR: case LUA_TLNGSTR: - PrintString(tsvalue(o)); - break; - default: /* cannot happen */ - printf("? type=%d",ttype(o)); - break; - } -} - -#define UPVALNAME(x) ((f->upvalues[x].name) ? 
getstr(f->upvalues[x].name) : "-") -#define MYK(x) (-1-(x)) - -static void PrintCode(const Proto* f) -{ - const Instruction* code=f->code; - int pc,n=f->sizecode; - for (pc=0; pc0) printf("[%d]\t",line); else printf("[-]\t"); - printf("%-9s\t",luaP_opnames[o]); - switch (getOpMode(o)) - { - case iABC: - printf("%d",a); - if (getBMode(o)!=OpArgN) printf(" %d",ISK(b) ? (MYK(INDEXK(b))) : b); - if (getCMode(o)!=OpArgN) printf(" %d",ISK(c) ? (MYK(INDEXK(c))) : c); - break; - case iABx: - printf("%d",a); - if (getBMode(o)==OpArgK) printf(" %d",MYK(bx)); - if (getBMode(o)==OpArgU) printf(" %d",bx); - break; - case iAsBx: - printf("%d %d",a,sbx); - break; - case iAx: - printf("%d",MYK(ax)); - break; - } - switch (o) - { - case OP_LOADK: - printf("\t; "); PrintConstant(f,bx); - break; - case OP_GETUPVAL: - case OP_SETUPVAL: - printf("\t; %s",UPVALNAME(b)); - break; - case OP_GETTABUP: - printf("\t; %s",UPVALNAME(b)); - if (ISK(c)) { printf(" "); PrintConstant(f,INDEXK(c)); } - break; - case OP_SETTABUP: - printf("\t; %s",UPVALNAME(a)); - if (ISK(b)) { printf(" "); PrintConstant(f,INDEXK(b)); } - if (ISK(c)) { printf(" "); PrintConstant(f,INDEXK(c)); } - break; - case OP_GETTABLE: - case OP_SELF: - if (ISK(c)) { printf("\t; "); PrintConstant(f,INDEXK(c)); } - break; - case OP_SETTABLE: - case OP_ADD: - case OP_SUB: - case OP_MUL: - case OP_POW: - case OP_DIV: - case OP_IDIV: - case OP_BAND: - case OP_BOR: - case OP_BXOR: - case OP_SHL: - case OP_SHR: - case OP_EQ: - case OP_LT: - case OP_LE: - if (ISK(b) || ISK(c)) - { - printf("\t; "); - if (ISK(b)) PrintConstant(f,INDEXK(b)); else printf("-"); - printf(" "); - if (ISK(c)) PrintConstant(f,INDEXK(c)); else printf("-"); - } - break; - case OP_JMP: - case OP_FORLOOP: - case OP_FORPREP: - case OP_TFORLOOP: - printf("\t; to %d",sbx+pc+2); - break; - case OP_CLOSURE: - printf("\t; %p",VOID(f->p[bx])); - break; - case OP_SETLIST: - if (c==0) printf("\t; %d",(int)code[++pc]); else printf("\t; %d",c); - break; - case OP_EXTRAARG: - 
printf("\t; "); PrintConstant(f,ax); - break; - default: - break; - } - printf("\n"); - } -} - -#define SS(x) ((x==1)?"":"s") -#define S(x) (int)(x),SS(x) - -static void PrintHeader(const Proto* f) -{ - const char* s=f->source ? getstr(f->source) : "=?"; - if (*s=='@' || *s=='=') - s++; - else if (*s==LUA_SIGNATURE[0]) - s="(bstring)"; - else - s="(string)"; - printf("\n%s <%s:%d,%d> (%d instruction%s at %p)\n", - (f->linedefined==0)?"main":"function",s, - f->linedefined,f->lastlinedefined, - S(f->sizecode),VOID(f)); - printf("%d%s param%s, %d slot%s, %d upvalue%s, ", - (int)(f->numparams),f->is_vararg?"+":"",SS(f->numparams), - S(f->maxstacksize),S(f->sizeupvalues)); - printf("%d local%s, %d constant%s, %d function%s\n", - S(f->sizelocvars),S(f->sizek),S(f->sizep)); -} - -static void PrintDebug(const Proto* f) -{ - int i,n; - n=f->sizek; - printf("constants (%d) for %p:\n",n,VOID(f)); - for (i=0; isizelocvars; - printf("locals (%d) for %p:\n",n,VOID(f)); - for (i=0; ilocvars[i].varname),f->locvars[i].startpc+1,f->locvars[i].endpc+1); - } - n=f->sizeupvalues; - printf("upvalues (%d) for %p:\n",n,VOID(f)); - for (i=0; iupvalues[i].instack,f->upvalues[i].idx); - } -} - -static void PrintFunction(const Proto* f, int full) -{ - int i,n=f->sizep; - PrintHeader(f); - PrintCode(f); - if (full) PrintDebug(f); - for (i=0; ip[i],full); -} diff --git a/WickedEngine/MainComponent.cpp b/WickedEngine/MainComponent.cpp index 97331b2ea..d5b98bd30 100644 --- a/WickedEngine/MainComponent.cpp +++ b/WickedEngine/MainComponent.cpp @@ -50,22 +50,30 @@ void MainComponent::Initialize() wiRenderer::SetShaderPath(wiRenderer::GetShaderPath() + "spirv/"); wiRenderer::SetDevice(std::make_shared(window, fullscreen, debugdevice)); #else - wiHelper::messageBox("Vulkan SDK not found during building the application! Vulkan API disabled!", "Error"); + wiHelper::messageBox("Vulkan SDK not found during build! 
Vulkan API disabled!", "Error"); #endif } else if (wiStartupArguments::HasArgument("dx12")) { +#ifdef WICKEDENGINE_BUILD_DX12 if (wiStartupArguments::HasArgument("hlsl6")) { wiRenderer::SetShaderPath(wiRenderer::GetShaderPath() + "hlsl6/"); } wiRenderer::SetDevice(std::make_shared(window, fullscreen, debugdevice)); +#else + wiHelper::messageBox("DirectX 12 not found during build! DirectX 12 API disabled!", "Error"); +#endif } // default graphics device: if (wiRenderer::GetDevice() == nullptr) { +#ifdef WICKEDENGINE_BUILD_DX11 wiRenderer::SetDevice(std::make_shared(window, fullscreen, debugdevice)); +#else + wiHelper::messageBox("DirectX 11 not found during build! DirectX 11 API disabled!", "Error"); +#endif } } @@ -266,16 +274,20 @@ void MainComponent::Compose(CommandList cmd) ss << "[UWP]"; #endif +#ifdef WICKEDENGINE_BUILD_DX11 if (dynamic_cast(wiRenderer::GetDevice())) { ss << "[DX11]"; } - else if (dynamic_cast(wiRenderer::GetDevice())) +#endif +#ifdef WICKEDENGINE_BUILD_DX12 + if (dynamic_cast(wiRenderer::GetDevice())) { ss << "[DX12]"; } +#endif #ifdef WICKEDENGINE_BUILD_VULKAN - else if (dynamic_cast(wiRenderer::GetDevice())) + if (dynamic_cast(wiRenderer::GetDevice())) { ss << "[Vulkan]"; } diff --git a/WickedEngine/Utility/D3D12MemAlloc.cpp b/WickedEngine/Utility/D3D12MemAlloc.cpp index 952b56396..25a8ea4e2 100644 --- a/WickedEngine/Utility/D3D12MemAlloc.cpp +++ b/WickedEngine/Utility/D3D12MemAlloc.cpp @@ -20,6 +20,8 @@ // THE SOFTWARE. 
// +#ifdef _WIN32 + #include "D3D12MemAlloc.h" #include @@ -4191,3 +4193,5 @@ HRESULT CreateAllocator(const ALLOCATOR_DESC* pDesc, Allocator** ppAllocator) } } // namespace D3D12MA + +#endif // _WIN32 diff --git a/WickedEngine/Utility/DirectXCollision.h b/WickedEngine/Utility/DirectXCollision.h new file mode 100644 index 000000000..58c5094f7 --- /dev/null +++ b/WickedEngine/Utility/DirectXCollision.h @@ -0,0 +1,362 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.h -- C++ Collision Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMathCommon.h" +#include "DirectXMath.h" + +namespace DirectX +{ + +enum ContainmentType +{ + DISJOINT = 0, + INTERSECTS = 1, + CONTAINS = 2 +}; + +enum PlaneIntersectionType +{ + FRONT = 0, + INTERSECTING = 1, + BACK = 2 +}; + +struct BoundingBox; +struct BoundingOrientedBox; +struct BoundingFrustum; + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4324 4820) +// C4324: alignment padding warnings +// C4820: Off by default noise +#endif + +//------------------------------------------------------------------------------------- +// Bounding sphere +//------------------------------------------------------------------------------------- +struct BoundingSphere +{ + XMFLOAT3 Center; // Center of the sphere. + float Radius; // Radius of the sphere. 
+ + // Creators + BoundingSphere() noexcept : Center(0, 0, 0), Radius(1.f) {} + + BoundingSphere(const BoundingSphere&) = default; + BoundingSphere& operator=(const BoundingSphere&) = default; + + BoundingSphere(BoundingSphere&&) = default; + BoundingSphere& operator=(BoundingSphere&&) = default; + + XM_CONSTEXPR BoundingSphere( _In_ const XMFLOAT3& center, _In_ float radius ) + : Center(center), Radius(radius) {} + + // Methods + void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + // Transform the sphere + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-sphere test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-sphere test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-sphere test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test sphere against six planes (see BoundingFrustum::GetPlanes) + + // Static 
methods + static void CreateMerged( _Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2 ); + + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingBox& box ); + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box ); + + static void CreateFromPoints( _Out_ BoundingSphere& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); + + static void CreateFromFrustum( _Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr ); +}; + +//------------------------------------------------------------------------------------- +// Axis-aligned bounding box +//------------------------------------------------------------------------------------- +struct BoundingBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + + // Creators + BoundingBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f) {} + + BoundingBox(const BoundingBox&) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + BoundingBox(BoundingBox&&) = default; + BoundingBox& operator=(BoundingBox&&) = default; + + XM_CONSTEXPR BoundingBox( _In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents ) + : Center(center), Extents(extents) {} + + // Methods + void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( 
_In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-Box test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-box test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Box test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test box against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged( _Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2 ); + + static void CreateFromSphere( _Out_ BoundingBox& Out, _In_ const BoundingSphere& sh ); + + static void XM_CALLCONV CreateFromPoints( _Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2 ); + static void CreateFromPoints( _Out_ BoundingBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Oriented bounding box +//------------------------------------------------------------------------------------- +struct BoundingOrientedBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. 
+ XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). + + // Creators + BoundingOrientedBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f), Orientation(0, 0, 0, 1.f) {} + + BoundingOrientedBox(const BoundingOrientedBox&) = default; + BoundingOrientedBox& operator=(const BoundingOrientedBox&) = default; + + BoundingOrientedBox(BoundingOrientedBox&&) = default; + BoundingOrientedBox& operator=(BoundingOrientedBox&&) = default; + + XM_CONSTEXPR BoundingOrientedBox( _In_ const XMFLOAT3& _Center, _In_ const XMFLOAT3& _Extents, _In_ const XMFLOAT4& _Orientation ) + : Center(_Center), Extents(_Extents), Orientation(_Orientation) {} + + // Methods + void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-OrientedBox test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-OrientedBox test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ 
FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-OrientedBox test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateFromBoundingBox( _Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box ); + + static void CreateFromPoints( _Out_ BoundingOrientedBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Bounding frustum +//------------------------------------------------------------------------------------- +struct BoundingFrustum +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Origin; // Origin of the frustum (and projection). + XMFLOAT4 Orientation; // Quaternion representing rotation. + + float RightSlope; // Positive X (X/Z) + float LeftSlope; // Negative X + float TopSlope; // Positive Y (Y/Z) + float BottomSlope; // Negative Y + float Near, Far; // Z of the near plane and far plane. 
+ + // Creators + BoundingFrustum() noexcept : + Origin(0, 0, 0), Orientation(0, 0, 0, 1.f), RightSlope(1.f), LeftSlope(-1.f), + TopSlope(1.f), BottomSlope(-1.f), Near(0), Far(1.f) {} + + BoundingFrustum(const BoundingFrustum&) = default; + BoundingFrustum& operator=(const BoundingFrustum&) = default; + + BoundingFrustum(BoundingFrustum&&) = default; + BoundingFrustum& operator=(BoundingFrustum&&) = default; + + XM_CONSTEXPR BoundingFrustum( _In_ const XMFLOAT3& _Origin, _In_ const XMFLOAT4& _Orientation, + _In_ float _RightSlope, _In_ float _LeftSlope, _In_ float _TopSlope, _In_ float _BottomSlope, + _In_ float _Near, _In_ float _Far ) + : Origin(_Origin), Orientation(_Orientation), + RightSlope(_RightSlope), LeftSlope(_LeftSlope), TopSlope(_TopSlope), BottomSlope(_BottomSlope), + Near(_Near), Far(_Far) {} + BoundingFrustum( _In_ CXMMATRIX Projection ); + + // Methods + void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the frustum + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sp ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + // Frustum-Frustum test + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ 
FXMVECTOR V2 ) const; + // Triangle-Frustum test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-Frustum test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Frustum test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test frustum against six planes (see BoundingFrustum::GetPlanes) + + void GetPlanes( _Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, + _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane ) const; + // Create 6 Planes representation of Frustum + + // Static methods + static void XM_CALLCONV CreateFromMatrix( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection ); +}; + +//----------------------------------------------------------------------------- +// Triangle intersection testing routines. 
+//----------------------------------------------------------------------------- +namespace TriangleTests +{ + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist ); + // Ray-Triangle + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2 ); + // Triangle-Triangle + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane ); + // Plane-Triangle + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, + _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, + _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ); + // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) +} + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4068 4365 4616 6001) +// C4068/4616: ignore unknown pragmas +// C4365: Off by default noise +// C6001: False positives + +# ifdef _PREFAST_ +# pragma prefast(push) +# pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +# pragma prefast(disable : 26495, "Union initialization confuses /analyze") +# endif +#endif + +#include "DirectXCollision.inl" + +#ifdef _MSC_VER +# ifdef _PREFAST_ +# pragma prefast(pop) +# endif + +# pragma warning(pop) +#endif + +} // namespace DirectX + diff --git a/WickedEngine/Utility/DirectXCollision.inl b/WickedEngine/Utility/DirectXCollision.inl new file mode 100644 index 000000000..a1e58b160 --- /dev/null +++ b/WickedEngine/Utility/DirectXCollision.inl @@ -0,0 +1,4787 @@ 
+//-------------------------------------------------------------------------------------
+// DirectXCollision.inl -- C++ Collision Math library
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = // +/-1 sign pattern per axis for 8 box corners; multiplied by box extents where it is used in this file
+{
+    { -1.0f, -1.0f,  1.0f, 0.0f },
+    {  1.0f, -1.0f,  1.0f, 0.0f },
+    {  1.0f,  1.0f,  1.0f, 0.0f },
+    { -1.0f,  1.0f,  1.0f, 0.0f },
+    { -1.0f, -1.0f, -1.0f, 0.0f },
+    {  1.0f, -1.0f, -1.0f, 0.0f },
+    {  1.0f,  1.0f, -1.0f, 0.0f },
+    { -1.0f,  1.0f, -1.0f, 0.0f },
+};
+
+XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { 1e-20f, 1e-20f, 1e-20f, 1e-20f };
+XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { -1e-20f, -1e-20f, -1e-20f, -1e-20f };
+XMGLOBALCONST XMVECTORF32 g_FltMin = { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX }; // NOTE: holds -FLT_MAX (most negative finite float), not FLT_MIN
+XMGLOBALCONST XMVECTORF32 g_FltMax = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
+
+namespace Internal
+{
+
+//-----------------------------------------------------------------------------
+// Return true if any of the elements of a 3 vector are equal to 0xffffffff.
+// Slightly more efficient than using XMVector3EqualInt.
+//-----------------------------------------------------------------------------
+inline bool XMVector3AnyTrue( _In_ FXMVECTOR V )
+{
+    // Duplicate the fourth element from the first element (X, Y, Z, X) so the 4-wide compare below only reflects x, y and z.
+    XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+    return XMComparisonAnyTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Return true if all of the elements of a 3 vector are equal to 0xffffffff.
+// Slightly more efficient than using XMVector3EqualInt.
+//-----------------------------------------------------------------------------
+inline bool XMVector3AllTrue( _In_ FXMVECTOR V )
+{
+    // Duplicate the fourth element from the first element (X, Y, Z, X) so the 4-wide compare below only reflects x, y and z.
+    XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>( V );
+
+    return XMComparisonAllTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) );
+}
+
+#if defined(_PREFAST_) || !defined(NDEBUG) // unit-length checks below exist only in debug / static-analysis builds
+
+XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+
+//-----------------------------------------------------------------------------
+// Return true if the vector is a unit vector (length == 1).
+//-----------------------------------------------------------------------------
+inline bool XMVector3IsUnit( _In_ FXMVECTOR V )
+{
+    XMVECTOR Difference = XMVectorSubtract( XMVector3Length( V ), XMVectorSplatOne() );
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitVectorEpsilon ); // |length - 1| < 1e-4
+}
+
+//-----------------------------------------------------------------------------
+// Return true if the quaternion is a unit quaternion.
+//-----------------------------------------------------------------------------
+inline bool XMQuaternionIsUnit( _In_ FXMVECTOR Q )
+{
+    XMVECTOR Difference = XMVectorSubtract( XMVector4Length( Q ), XMVectorSplatOne() );
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitQuaternionEpsilon ); // |4D length - 1| < 1e-4
+}
+
+//-----------------------------------------------------------------------------
+// Return true if the plane is a unit plane.
+//-----------------------------------------------------------------------------
+inline bool XMPlaneIsUnit( _In_ FXMVECTOR Plane )
+{
+    XMVECTOR Difference = XMVectorSubtract( XMVector3Length( Plane ), XMVectorSplatOne() ); // only the xyz part is measured; w does not take part
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitPlaneEpsilon );
+}
+
+#endif // _PREFAST_ || !NDEBUG
+
+//-----------------------------------------------------------------------------
+inline XMVECTOR XMPlaneTransform( _In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) // returns (rotated xyz, Plane.w - dot(rotated xyz, Translation))
+{
+    XMVECTOR vNormal = XMVector3Rotate( Plane, Rotation );
+    XMVECTOR vD = XMVectorSubtract( XMVectorSplatW( Plane ), XMVector3Dot( vNormal, Translation ) );
+
+    return XMVectorInsert<0, 0, 0, 0, 1>( vNormal, vD ); // keep xyz from vNormal, take w from vD
+}
+
+//-----------------------------------------------------------------------------
+// Return the point on the line segment (S1, S2) nearest the point P.
+//-----------------------------------------------------------------------------
+inline XMVECTOR PointOnLineSegmentNearestPoint( _In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P )
+{
+    XMVECTOR Dir = XMVectorSubtract( S2, S1 );
+    XMVECTOR Projection = XMVectorSubtract( XMVector3Dot( P, Dir ), XMVector3Dot( S1, Dir ) ); // dot(P - S1, Dir)
+    XMVECTOR LengthSq = XMVector3Dot( Dir, Dir );
+
+    XMVECTOR t = XMVectorMultiply( Projection, XMVectorReciprocal( LengthSq ) ); // NOTE(review): LengthSq is zero when S1 == S2; presumably callers pass non-degenerate segments - confirm
+    XMVECTOR Point = XMVectorMultiplyAdd( t, Dir, S1 );
+
+    // t < 0: the projection falls before S1, so use S1 itself
+    XMVECTOR SelectS1 = XMVectorLess( Projection, XMVectorZero() );
+    Point = XMVectorSelect( Point, S1, SelectS1 );
+
+    // t > 1 (tested as Projection > LengthSq, which avoids using the divided value): use S2 itself
+    XMVECTOR SelectS2 = XMVectorGreater( Projection, LengthSq );
+    Point = XMVectorSelect( Point, S2, SelectS2 );
+
+    return Point;
+}
+
+//-----------------------------------------------------------------------------
+// Test if the point (P) on the plane of the triangle is inside the triangle
+// (V0, V1, V2).
+//-----------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle( _In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2 )
+{
+    // Compute the triangle normal.
+    XMVECTOR N = XMVector3Cross( XMVectorSubtract( V2, V0 ), XMVectorSubtract( V1, V0 ) );
+
+    // Compute the cross products of the vector from the base of each edge to
+    // the point with each edge vector.
+    XMVECTOR C0 = XMVector3Cross( XMVectorSubtract( P, V0 ), XMVectorSubtract( V1, V0 ) );
+    XMVECTOR C1 = XMVector3Cross( XMVectorSubtract( P, V1 ), XMVectorSubtract( V2, V1 ) );
+    XMVECTOR C2 = XMVector3Cross( XMVectorSubtract( P, V2 ), XMVectorSubtract( V0, V2 ) );
+
+    // If the cross product points in the same direction as the normal then the
+    // point is inside the edge (it is zero if it is on the edge).
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Inside0 = XMVectorGreaterOrEqual( XMVector3Dot( C0, N ), Zero );
+    XMVECTOR Inside1 = XMVectorGreaterOrEqual( XMVector3Dot( C1, N ), Zero );
+    XMVECTOR Inside2 = XMVectorGreaterOrEqual( XMVector3Dot( C2, N ), Zero );
+
+    // If the point is inside all of the edges it is inside (the result is a comparison mask vector, not a bool).
+    return XMVectorAndInt( XMVectorAndInt( Inside0, Inside1 ), Inside2 );
+}
+
+//-----------------------------------------------------------------------------
+inline bool SolveCubic( _In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v ) // roots of x^3 + e*x^2 + f*x + g; false (outputs zeroed) unless all three are real
+{
+    float p, q, h, rc, d, theta, costh3, sinth3;
+
+    p = f - e * e / 3.0f; // p and q: coefficients after substituting x = y - e/3
+    q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f;
+    h = q * q / 4.0f + p * p * p / 27.0f;
+
+    if( h > 0 )
+    {
+        *t = *u = *v = 0.f;
+        return false; // only one real root
+    }
+
+    if( ( h == 0 ) && ( q == 0 ) ) // all the same root
+    {
+        *t = - e / 3;
+        *u = - e / 3;
+        *v = - e / 3;
+
+        return true;
+    }
+
+    d = sqrtf( q * q / 4.0f - h );
+    if( d < 0 ) // NOTE(review): sqrtf never returns a negative value, so this branch looks unreachable
+        rc = -powf( -d, 1.0f / 3.0f );
+    else
+        rc = powf( d, 1.0f / 3.0f );
+
+    theta = XMScalarACos( -q / ( 2.0f * d ) );
+    costh3 = XMScalarCos( theta / 3.0f );
+    sinth3 = sqrtf( 3.0f ) * XMScalarSin( theta / 3.0f );
+    *t = 2.0f * rc * costh3 - e / 3.0f;
+    *u = -rc * ( costh3 + sinth3 ) - e / 3.0f;
+    *v = -rc * ( costh3 - sinth3 ) - e / 3.0f;
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+inline XMVECTOR CalculateEigenVector( _In_ float m11, _In_ float m12, _In_ float m13,
+                                      _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e ) // normalized candidate eigenvector of the symmetric matrix (m11..m33) for eigenvalue e
+{
+    float fTmp[3];
+    fTmp[0] = m12 * m23 - m13 * ( m22 - e ); // fTmp = cross product of the first two rows of (M - e*I)
+    fTmp[1] = m13 * m12 - m23 * ( m11 - e );
+    fTmp[2] = ( m11 - e ) * ( m22 - e ) - m12 * m12;
+
+    XMVECTOR vTmp = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>(fTmp) ); // XMLoadFloat3 takes an XMFLOAT3 pointer; fTmp is three contiguous floats
+
+    if( XMVector3Equal( vTmp, XMVectorZero() ) ) // planar or linear
+    {
+        float f1, f2, f3;
+
+        // we only have one equation - find a valid one
+        if( ( m11 - e != 0 ) || ( m12 != 0 ) || ( m13 != 0 ) )
+        {
+            f1 = m11 - e; f2 = m12; f3 = m13;
+        }
+        else if( ( m12 != 0 ) || ( m22 - e != 0 ) || ( m23 != 0 ) )
+        {
+            f1 = m12; f2 = m22 - e; f3 = m23;
+        }
+        else if( ( m13 != 0 ) || ( m23 != 0 ) || ( m33 - e != 0 ) )
+        {
+            f1 = m13; f2 = m23; f3 = m33 - e;
+        }
+        else
+        {
+            // error, we'll just make something up - we have NO context
+            f1 = 1.0f; f2 = 0.0f; f3 = 0.0f;
+        }
+
+        if( f1 == 0 )
+            vTmp = XMVectorSetX( vTmp, 0.0f );
+        else
+            vTmp = XMVectorSetX( vTmp, 1.0f );
+
+        if( f2 == 0 )
+            vTmp = XMVectorSetY( vTmp, 0.0f );
+        else
+            vTmp = XMVectorSetY( vTmp, 1.0f );
+
+        if( f3 == 0 )
+        {
+            vTmp = XMVectorSetZ( vTmp, 0.0f );
+            // recalculate y to make equation work
+            if( m12 != 0 ) // NOTE(review): the guard tests m12 but the divisor is f2; they are the same value only in the first branch above - confirm
+                vTmp = XMVectorSetY( vTmp, -f1 / f2 );
+        }
+        else
+        {
+            vTmp = XMVectorSetZ( vTmp, ( f2 - f1 ) / f3 );
+        }
+    }
+
+    if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) > 1e-5f )
+    {
+        return XMVector3Normalize( vTmp );
+    }
+    else
+    {
+        // Multiply by a value large enough to make the vector non-zero.
+        vTmp = XMVectorScale( vTmp, 1e5f );
+        return XMVector3Normalize( vTmp );
+    }
+}
+
+//-----------------------------------------------------------------------------
+inline bool CalculateEigenVectors( _In_ float m11, _In_ float m12, _In_ float m13,
+                                   _In_ float m22, _In_ float m23, _In_ float m33,
+                                   _In_ float e1, _In_ float e2, _In_ float e3,
+                                   _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) // one vector per eigenvalue, then repairs zero / non-orthogonal results; every path returns true
+{
+    *pV1 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e1 );
+    *pV2 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e2 );
+    *pV3 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e3 );
+
+    bool v1z = false; // vNz: eigenvector N came back as the zero vector
+    bool v2z = false;
+    bool v3z = false;
+
+    XMVECTOR Zero = XMVectorZero();
+
+    if ( XMVector3Equal( *pV1, Zero ) )
+        v1z = true;
+
+    if ( XMVector3Equal( *pV2, Zero ) )
+        v2z = true;
+
+    if ( XMVector3Equal( *pV3, Zero ))
+        v3z = true;
+
+    bool e12 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV2 ) ) ) > 0.1f ); // check for non-orthogonal vectors
+    bool e13 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV3 ) ) ) > 0.1f );
+    bool e23 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV2, *pV3 ) ) ) > 0.1f );
+
+    if( ( v1z && v2z && v3z ) || ( e12 && e13 && e23 ) ||
+        ( e12 && v3z ) || ( e13 && v2z ) || ( e23 && v1z ) ) // all eigenvectors are 0- any basis set
+    {
+        *pV1 = g_XMIdentityR0.v;
+        *pV2 = g_XMIdentityR1.v;
+        *pV3 = g_XMIdentityR2.v;
+        return true;
+    }
+
+    if( v1z && v2z ) // only V3 is usable: build V1 perpendicular to it, then V2 = V3 x V1
+    {
+        XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV3 );
+        if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+        {
+            vTmp = XMVector3Cross( g_XMIdentityR0, *pV3 );
+        }
+        *pV1 = XMVector3Normalize( vTmp );
+        *pV2 = XMVector3Cross( *pV3, *pV1 );
+        return true;
+    }
+
+    if( v3z && v1z ) // only V2 is usable
+    {
+        XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV2 );
+        if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+        {
+            vTmp = XMVector3Cross( g_XMIdentityR0, *pV2 );
+        }
+        *pV3 = XMVector3Normalize( vTmp );
+        *pV1 = XMVector3Cross( *pV2, *pV3 );
+        return true;
+    }
+
+    if( v2z && v3z ) // only V1 is usable
+    {
+        XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV1 );
+        if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+        {
+            vTmp = XMVector3Cross( g_XMIdentityR0, *pV1 );
+        }
+        *pV2 = XMVector3Normalize( vTmp );
+        *pV3 = XMVector3Cross( *pV1, *pV2 );
+        return true;
+    }
+
+    if( ( v1z ) || e12 ) // replace a single bad vector with the cross product of the other two
+    {
+        *pV1 = XMVector3Cross( *pV2, *pV3 );
+        return true;
+    }
+
+    if( ( v2z ) || e23 )
+    {
+        *pV2 = XMVector3Cross( *pV3, *pV1 );
+        return true;
+    }
+
+    if( ( v3z ) || e13 )
+    {
+        *pV3 = XMVector3Cross( *pV1, *pV2 );
+        return true;
+    }
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+inline bool CalculateEigenVectorsFromCovarianceMatrix( _In_ float Cxx, _In_ float Cyy, _In_ float Czz,
+                                                       _In_ float Cxy, _In_ float Cxz, _In_ float Cyz,
+                                                       _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) // false (outputs = identity axes) when SolveCubic fails
+{
+    // Calculate the eigenvalues by solving a cubic equation (e, f, g are the coefficients handed to SolveCubic).
+    float e = -( Cxx + Cyy + Czz );
+    float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz;
+    float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz;
+
+    float ev1, ev2, ev3;
+    if( !DirectX::Internal::SolveCubic( e, f, g, &ev1, &ev2, &ev3 ) )
+    {
+        // set them to arbitrary orthonormal basis set
+        *pV1 = g_XMIdentityR0.v;
+        *pV2 = g_XMIdentityR1.v;
+        *pV3 = g_XMIdentityR2.v;
+        return false;
+    }
+
+    return DirectX::Internal::CalculateEigenVectors( Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3 );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectTrianglePlane( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane,
+                                                    XMVECTOR& Outside, XMVECTOR& Inside ) // writes comparison masks into Outside / Inside
+{
+    // 4D dot of each vertex with the plane (presumably callers set w of each vertex to 1 - confirm at call sites)
+    XMVECTOR Dist0 = XMVector4Dot( V0, Plane );
+    XMVECTOR Dist1 = XMVector4Dot( V1, Plane );
+    XMVECTOR Dist2 = XMVector4Dot( V2, Plane );
+
+    XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 );
+    MinDist = XMVectorMin( MinDist, Dist2 );
+
+    XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 );
+    MaxDist = XMVectorMax( MaxDist, Dist2 );
+
+    XMVECTOR Zero = XMVectorZero();
+
+    // Outside the plane? (even the smallest vertex distance is positive)
+    Outside = XMVectorGreater( MinDist, Zero );
+
+    // Fully inside the plane? (even the largest vertex distance is negative)
+    Inside = XMVectorLess( MaxDist, Zero );
+}
+
+//-----------------------------------------------------------------------------
+inline void FastIntersectSpherePlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane,
+                                      _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) // writes comparison masks into Outside / Inside
+{
+    XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+    // Outside the plane?
+    Outside = XMVectorGreater( Dist, Radius );
+
+    // Fully inside the plane?
+    Inside = XMVectorLess( Dist, XMVectorNegate( Radius ) );
+}
+
+//-----------------------------------------------------------------------------
+inline void FastIntersectAxisAlignedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane,
+                                              _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) // writes comparison masks into Outside / Inside
+{
+    // Compute the distance to the center of the box.
+    XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+    // Project the axes of the box onto the normal of the plane.  Half the
+    // length of the projection (sometimes called the "radius") is equal to
+    // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+    // where h(i) are extents of the box, n is the plane normal, and b(i) are the
+    // axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)].
+    XMVECTOR Radius = XMVector3Dot( Extents, XMVectorAbs( Plane ) );
+
+    // Outside the plane?
+    Outside = XMVectorGreater( Dist, Radius );
+
+    // Fully inside the plane?
+    Inside = XMVectorLess( Dist, XMVectorNegate( Radius ) );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectOrientedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, _In_ GXMVECTOR Axis1,
+                                                       _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) // writes comparison masks into Outside / Inside
+{
+    // Compute the distance to the center of the box.
+    XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+    // Project the axes of the box onto the normal of the plane.  Half the
+    // length of the projection (sometimes called the "radius") is equal to
+    // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+    // where h(i) are extents of the box, n is the plane normal, and b(i) are the
+    // axes of the box.
+    XMVECTOR Radius = XMVector3Dot( Plane, Axis0 );
+    Radius = XMVectorInsert<0, 0, 1, 0, 0>( Radius, XMVector3Dot( Plane, Axis1 ) ); // y <- n dot Axis1
+    Radius = XMVectorInsert<0, 0, 0, 1, 0>( Radius, XMVector3Dot( Plane, Axis2 ) ); // z <- n dot Axis2
+    Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
+
+    // Outside the plane?
+    Outside = XMVectorGreater( Dist, Radius );
+
+    // Fully inside the plane?
+    Inside = XMVectorLess( Dist, XMVectorNegate( Radius ) );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectFrustumPlane( _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, _In_ GXMVECTOR Point3,
+                                                   _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5, _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7,
+                                                   _In_ CXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) // writes comparison masks into Outside / Inside
+{
+    // Find the min/max projection of the frustum onto the plane normal.
+    XMVECTOR Min, Max, Dist;
+
+    Min = Max = XMVector3Dot( Plane, Point0 );
+
+    Dist = XMVector3Dot( Plane, Point1 );
+    Min = XMVectorMin( Min, Dist );
+    Max = XMVectorMax( Max, Dist );
+
+    Dist = XMVector3Dot( Plane, Point2 );
+    Min = XMVectorMin( Min, Dist );
+    Max = XMVectorMax( Max, Dist );
+
+    Dist = XMVector3Dot( Plane, Point3 );
+    Min = XMVectorMin( Min, Dist );
+    Max = XMVectorMax( Max, Dist );
+
+    Dist = XMVector3Dot( Plane, Point4 );
+    Min = XMVectorMin( Min, Dist );
+    Max = XMVectorMax( Max, Dist );
+
+    Dist = XMVector3Dot( Plane, Point5 );
+    Min = XMVectorMin( Min, Dist );
+    Max = XMVectorMax( Max, Dist );
+
+    Dist = XMVector3Dot( Plane, Point6 );
+    Min = XMVectorMin( Min, Dist );
+    Max = XMVectorMax( Max, Dist );
+
+    Dist = XMVector3Dot( Plane, Point7 );
+    Min = XMVectorMin( Min, Dist );
+    Max = XMVectorMax( Max, Dist );
+
+    XMVECTOR PlaneDist = XMVectorNegate( XMVectorSplatW( Plane ) ); // the dots above are 3D only, so compare against -w instead of adding w
+
+    // Outside the plane?
+    Outside = XMVectorGreater( Min, PlaneDist );
+
+    // Fully inside the plane?
+    Inside = XMVectorLess( Max, PlaneDist );
+}
+
+} // namespace Internal
+
+
+/****************************************************************************
+ *
+ * BoundingSphere
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform a sphere by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, FXMMATRIX M ) const
+{
+    // Load the center of the sphere.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+
+    // Transform the center of the sphere.
+    XMVECTOR C = XMVector3Transform( vCenter, M );
+
+    XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] ); // squared length of the xyz part of each of the first three rows of M
+    XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] );
+    XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] );
+
+    XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) );
+
+    // Store the center of the sphere.
+    XMStoreFloat3( &Out.Center, C );
+
+    // Scale the radius of the sphere by the largest of the three row lengths.
+    float Scale = sqrtf( XMVectorGetX(d) );
+    Out.Radius = Radius * Scale;
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+    // Load the center of the sphere.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+
+    // Transform the center of the sphere: scale first, then rotate, then translate.
+    vCenter = XMVectorAdd( XMVector3Rotate( XMVectorScale( vCenter, Scale ), Rotation ), Translation );
+
+    // Store the center of the sphere.
+    XMStoreFloat3( &Out.Center, vCenter );
+
+    // Scale the radius of the sphere.
+    Out.Radius = Radius * Scale;
+}
+
+
+//-----------------------------------------------------------------------------
+// Point in sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR Point ) const
+{
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+
+    XMVECTOR DistanceSquared = XMVector3LengthSq( XMVectorSubtract( Point, vCenter ) );
+    XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); // compare squared values so no square root is needed
+
+    return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ) ? CONTAINS : DISJOINT; // a point exactly on the surface counts as CONTAINS; never returns INTERSECTS
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle in sphere test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+    if ( !Intersects(V0,V1,V2) ) // no overlap at all: skip the per-vertex test
+        return DISJOINT;
+
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+    XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius );
+
+    XMVECTOR DistanceSquared = XMVector3LengthSq( XMVectorSubtract( V0, vCenter ) );
+    XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); // Inside accumulates "vertex within radius" over all three vertices
+
+    DistanceSquared = XMVector3LengthSq( XMVectorSubtract( V1, vCenter ) );
+    Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) );
+
+    DistanceSquared = XMVector3LengthSq( XMVectorSubtract( V2, vCenter ) );
+    Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) );
+
+    return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; // CONTAINS only when all three vertices are within the radius
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere in sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingSphere::Contains( const BoundingSphere& sh ) const
+{
+    XMVECTOR Center1 = XMLoadFloat3( &Center );
+    float r1 = Radius;
+
+    XMVECTOR Center2 = XMLoadFloat3( &sh.Center );
+    float r2 = sh.Radius;
+
+    XMVECTOR V = XMVectorSubtract( Center2, Center1 );
+
+    XMVECTOR Dist = XMVector3Length( V );
+
+    float d = XMVectorGetX( Dist ); // d: distance between the two centers
+
+    return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; // d > r1 + r2: DISJOINT; d <= r1 - r2: CONTAINS; otherwise INTERSECTS
+}
+
+
+//-----------------------------------------------------------------------------
+// Axis-aligned box in sphere test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingSphere::Contains( const BoundingBox& box ) const
+{
+    if ( !box.Intersects(*this) )
+        return DISJOINT;
+
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+    XMVECTOR RadiusSq = XMVectorMultiply( vRadius, vRadius );
+
+    XMVECTOR boxCenter = XMLoadFloat3( &box.Center );
+    XMVECTOR boxExtents = XMLoadFloat3( &box.Extents );
+
+    XMVECTOR InsideAll = XMVectorTrueInt();
+
+    XMVECTOR offset = XMVectorSubtract( boxCenter, vCenter ); // box center relative to the sphere center
+
+    for( size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVectorMultiplyAdd( boxExtents, g_BoxOffset[i], offset ); // corner i relative to the sphere center
+        XMVECTOR d = XMVector3LengthSq( C );
+        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) );
+    }
+
+    return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; // CONTAINS only when every corner is within the radius
+}
+
+
+//-----------------------------------------------------------------------------
+// Oriented box in sphere test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingSphere::Contains( const BoundingOrientedBox& box ) const
+{
+    if ( !box.Intersects(*this) )
+        return DISJOINT;
+
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+    XMVECTOR RadiusSq = XMVectorMultiply( vRadius, vRadius );
+
+    XMVECTOR boxCenter = XMLoadFloat3( &box.Center );
+    XMVECTOR boxExtents = XMLoadFloat3( &box.Extents );
+    XMVECTOR boxOrientation = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( boxOrientation ) );
+
+    XMVECTOR InsideAll = XMVectorTrueInt();
+
+    for( size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVectorAdd( XMVector3Rotate( XMVectorMultiply( boxExtents, g_BoxOffset[i] ), boxOrientation ), boxCenter ); // corner i: signed extents, rotated, then moved to the box center
+        XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) );
+        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) );
+    }
+
+    return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; // CONTAINS only when every corner is within the radius
+
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum in sphere test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingSphere::Contains( const BoundingFrustum& fr ) const
+{
+    if ( !fr.Intersects(*this) )
+        return DISJOINT;
+
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+    XMVECTOR RadiusSq = XMVectorMultiply( vRadius, vRadius );
+
+    XMVECTOR vOrigin = XMLoadFloat3( &fr.Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &fr.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Build the corners of the frustum (slope pairs at z = 1, scaled by Near / Far; still unrotated and relative to the origin).
+    XMVECTOR vRightTop = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f );
+    XMVECTOR vRightBottom = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftTop = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftBottom = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &fr.Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &fr.Far );
+
+    XMVECTOR Corners[BoundingFrustum::CORNER_COUNT];
+    Corners[0] = XMVectorMultiply( vRightTop, vNear );
+    Corners[1] = XMVectorMultiply( vRightBottom, vNear );
+    Corners[2] = XMVectorMultiply( vLeftTop, vNear );
+    Corners[3] = XMVectorMultiply( vLeftBottom, vNear );
+    Corners[4] = XMVectorMultiply( vRightTop, vFar );
+    Corners[5] = XMVectorMultiply( vRightBottom, vFar );
+    Corners[6] = XMVectorMultiply( vLeftTop, vFar );
+    Corners[7] = XMVectorMultiply( vLeftBottom, vFar );
+
+    XMVECTOR InsideAll = XMVectorTrueInt();
+    for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVectorAdd( XMVector3Rotate( Corners[i], vOrientation ), vOrigin ); // rotate the corner by the frustum orientation, then add the frustum origin
+        XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) );
+        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) );
+    }
+
+    return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; // CONTAINS only when every corner is within the radius
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingSphere& sh ) const
+{
+    // Load A.
+    XMVECTOR vCenterA = XMLoadFloat3( &Center );
+    XMVECTOR vRadiusA = XMVectorReplicatePtr( &Radius );
+
+    // Load B.
+    XMVECTOR vCenterB = XMLoadFloat3( &sh.Center );
+    XMVECTOR vRadiusB = XMVectorReplicatePtr( &sh.Radius );
+
+    // Distance squared between centers.
+    XMVECTOR Delta = XMVectorSubtract( vCenterB, vCenterA );
+    XMVECTOR DistanceSquared = XMVector3LengthSq( Delta );
+
+    // Square of the sum of the radii: (rA + rB)^2.
+    XMVECTOR RadiusSquared = XMVectorAdd( vRadiusA, vRadiusB );
+    RadiusSquared = XMVectorMultiply( RadiusSquared, RadiusSquared );
+
+    return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ); // spheres that exactly touch count as intersecting
+}
+
+
+//-----------------------------------------------------------------------------
+// Box vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingBox& box ) const
+{
+    return box.Intersects( *this ); // forwards to BoundingBox::Intersects
+}
+
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingOrientedBox& box ) const
+{
+    return box.Intersects( *this ); // forwards to BoundingOrientedBox::Intersects
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. sphere test.
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( const BoundingFrustum& fr ) const +{ + return fr.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Triangle vs sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Compute the plane of the triangle (has to be normalized). + XMVECTOR N = XMVector3Normalize( XMVector3Cross( XMVectorSubtract( V1, V0 ), XMVectorSubtract( V2, V0 ) ) ); + + // Assert that the triangle is not degenerate. + assert( !XMVector3Equal( N, XMVectorZero() ) ); + + // Find the nearest feature on the triangle to the sphere. + XMVECTOR Dist = XMVector3Dot( XMVectorSubtract( vCenter, V0 ), N ); + + // If the center of the sphere is farther from the plane of the triangle than + // the radius of the sphere, then there cannot be an intersection. + XMVECTOR NoIntersection = XMVectorLess( Dist, XMVectorNegate( vRadius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Dist, vRadius ) ); + + // Project the center of the sphere onto the plane of the triangle. + XMVECTOR Point = XMVectorNegativeMultiplySubtract( N, Dist, vCenter ); + + // Is it inside all the edges? If so we intersect because the distance + // to the plane is less than the radius. + XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle( Point, V0, V1, V2 ); + + // Find the nearest point on each edge. 
+ XMVECTOR RadiusSq = XMVectorMultiply( vRadius, vRadius ); + + // Edge 0,1 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V0, V1, vCenter ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( XMVectorSubtract( vCenter, Point ) ), RadiusSq ) ); + + // Edge 1,2 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V1, V2, vCenter ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( XMVectorSubtract( vCenter, Point ) ), RadiusSq ) ); + + // Edge 2,0 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V2, V0, vCenter ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( XMVectorSubtract( vCenter, Point ) ), RadiusSq ) ); + + return XMVector4EqualInt( XMVectorAndCInt( Intersection, NoIntersection ), XMVectorTrueInt() ); +} + + +//----------------------------------------------------------------------------- +// Sphere-plane intersection +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Set w of the center to one so we can dot4 with a plane. 
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane, Outside, Inside ); + + // If the sphere is outside any plane it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return FRONT; + + // If the sphere is inside all planes it is inside. + if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) ) + return BACK; + + // The sphere is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with a sphere. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const +{ + assert( DirectX::Internal::XMVector3IsUnit( Direction ) ); + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // l is the vector from the ray origin to the center of the sphere. + XMVECTOR l = XMVectorSubtract( vCenter, Origin ); + + // s is the projection of the l onto the ray direction. + XMVECTOR s = XMVector3Dot( l, Direction ); + + XMVECTOR l2 = XMVector3Dot( l, l ); + + XMVECTOR r2 = XMVectorMultiply( vRadius, vRadius ); + + // m2 is squared distance from the center of the sphere to the projection. + XMVECTOR m2 = XMVectorNegativeMultiplySubtract( s, s, l2 ); + + XMVECTOR NoIntersection; + + // If the ray origin is outside the sphere and the center of the sphere is + // behind the ray origin there is no intersection. + NoIntersection = XMVectorAndInt( XMVectorLess( s, XMVectorZero() ), XMVectorGreater( l2, r2 ) ); + + // If the squared distance from the center of the sphere to the projection + // is greater than the radius squared the ray will miss the sphere. 
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( m2, r2 ) ); + + // The ray hits the sphere, compute the nearest intersection point. + XMVECTOR q = XMVectorSqrt( XMVectorSubtract( r2, m2 ) ); + XMVECTOR t1 = XMVectorSubtract( s, q ); + XMVECTOR t2 = XMVectorAdd( s, q ); + + XMVECTOR OriginInside = XMVectorLessOrEqual( l2, r2 ); + XMVECTOR t = XMVectorSelect( t1, t2, OriginInside ); + + if( XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ) + { + // Store the x-component to Dist. + XMStoreFloat( &Dist, t ); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a sphere vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
+ DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the sphere is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the sphere is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The sphere is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Creates a bounding sphere that contains two other bounding spheres +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateMerged( BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2 ) +{ + XMVECTOR Center1 = XMLoadFloat3( &S1.Center ); + float r1 = S1.Radius; + + XMVECTOR Center2 = XMLoadFloat3( &S2.Center ); + float r2 = S2.Radius; + + XMVECTOR V = XMVectorSubtract( Center2, Center1 ); + + XMVECTOR Dist = XMVector3Length( V ); + + float d = XMVectorGetX(Dist); + + if ( r1 + r2 >= d ) + { + if ( r1 - r2 >= d ) + { + Out = S1; + return; + } + else if ( r2 - r1 >= d ) + { + Out = S2; + return; + } + } + + XMVECTOR N = XMVectorDivide( V, Dist ); + + float t1 = XMMin( -r1, d-r2 ); + float t2 = XMMax( r1, d+r2 ); + float t_5 = (t2 - t1) * 0.5f; + + XMVECTOR NCenter = XMVectorAdd( Center1, XMVectorMultiply( N, XMVectorReplicate( t_5 + t1 ) ) ); + + XMStoreFloat3( &Out.Center, NCenter ); + Out.Radius = t_5; +} + + +//----------------------------------------------------------------------------- +// Create sphere circumscribing bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingBox& box ) +{ + Out.Center = box.Center; + XMVECTOR vExtents = XMLoadFloat3( &box.Extents ); + Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) ); +} + +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingOrientedBox& box ) +{ + // Bounding box orientation is irrelevant because a sphere is rotationally invariant + Out.Center = box.Center; + XMVECTOR vExtents = XMLoadFloat3( &box.Extents ); + Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) ); +} + + 
+//----------------------------------------------------------------------------- +// Find the approximate smallest enclosing bounding sphere for a set of +// points. Exact computation of the smallest enclosing bounding sphere is +// possible but is slower and requires a more complex algorithm. +// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere", +// Graphics Gems. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromPoints( BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride ) +{ + assert( Count > 0 ); + assert( pPoints ); + + // Find the points with minimum and maximum x, y, and z + XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ; + + MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3( pPoints ); + + for( size_t i = 1; i < Count; ++i ) + { + XMVECTOR Point = XMLoadFloat3( reinterpret_cast( reinterpret_cast(pPoints) + i * Stride ) ); + + float px = XMVectorGetX( Point ); + float py = XMVectorGetY( Point ); + float pz = XMVectorGetZ( Point ); + + if( px < XMVectorGetX( MinX ) ) + MinX = Point; + + if( px > XMVectorGetX( MaxX ) ) + MaxX = Point; + + if( py < XMVectorGetY( MinY ) ) + MinY = Point; + + if( py > XMVectorGetY( MaxY ) ) + MaxY = Point; + + if( pz < XMVectorGetZ( MinZ ) ) + MinZ = Point; + + if( pz > XMVectorGetZ( MaxZ ) ) + MaxZ = Point; + } + + // Use the min/max pair that are farthest apart to form the initial sphere. + XMVECTOR DeltaX = XMVectorSubtract( MaxX, MinX ); + XMVECTOR DistX = XMVector3Length( DeltaX ); + + XMVECTOR DeltaY = XMVectorSubtract( MaxY, MinY ); + XMVECTOR DistY = XMVector3Length( DeltaY ); + + XMVECTOR DeltaZ = XMVectorSubtract( MaxZ, MinZ ); + XMVECTOR DistZ = XMVector3Length( DeltaZ ); + + XMVECTOR vCenter; + XMVECTOR vRadius; + + if( XMVector3Greater( DistX, DistY ) ) + { + if( XMVector3Greater( DistX, DistZ ) ) + { + // Use min/max x. 
+ vCenter = XMVectorLerp(MaxX,MinX,0.5f); + vRadius = XMVectorScale( DistX, 0.5f ); + } + else + { + // Use min/max z. + vCenter = XMVectorLerp(MaxZ,MinZ,0.5f); + vRadius = XMVectorScale( DistZ, 0.5f ); + } + } + else // Y >= X + { + if( XMVector3Greater( DistY, DistZ ) ) + { + // Use min/max y. + vCenter = XMVectorLerp(MaxY,MinY,0.5f); + vRadius = XMVectorScale( DistY, 0.5f ); + } + else + { + // Use min/max z. + vCenter = XMVectorLerp(MaxZ,MinZ,0.5f); + vRadius = XMVectorScale( DistZ, 0.5f ); + } + } + + // Add any points not inside the sphere. + for( size_t i = 0; i < Count; ++i ) + { + XMVECTOR Point = XMLoadFloat3( reinterpret_cast( reinterpret_cast(pPoints) + i * Stride ) ); + + XMVECTOR Delta = XMVectorSubtract( Point, vCenter ); + + XMVECTOR Dist = XMVector3Length( Delta ); + + if( XMVector3Greater( Dist, vRadius ) ) + { + // Adjust sphere to include the new point. + vRadius = XMVectorScale( XMVectorAdd( vRadius, Dist ), 0.5f ); + vCenter = XMVectorAdd( vCenter, XMVectorMultiply( XMVectorSubtract( XMVectorReplicate(1.0f), XMVectorDivide(vRadius, Dist) ), Delta ) ); + } + } + + XMStoreFloat3( &Out.Center, vCenter ); + XMStoreFloat( &Out.Radius, vRadius ); +} + + +//----------------------------------------------------------------------------- +// Create sphere containing frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromFrustum( BoundingSphere& Out, const BoundingFrustum& fr ) +{ + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + CreateFromPoints( Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3) ); +} + + +/**************************************************************************** + * + * BoundingBox + * + ****************************************************************************/ + +//----------------------------------------------------------------------------- +// Transform an axis aligned box by an angle 
preserving transform. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, FXMMATRIX M ) const +{ + // Load center and extents. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); + Corner = XMVector3Transform( Corner, M ); + + XMVECTOR Min, Max; + Min = Max = Corner; + + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + Corner = XMVector3Transform( Corner, M ); + + Min = XMVectorMin( Min, Corner ); + Max = XMVectorMax( Max, Corner ); + } + + // Store center and extents. + XMStoreFloat3( &Out.Center, XMVectorScale( XMVectorAdd( Min, Max ), 0.5f ) ); + XMStoreFloat3( &Out.Extents, XMVectorScale( XMVectorSubtract( Max, Min ), 0.5f ) ); +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const +{ + assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); + + // Load center and extents. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR VectorScale = XMVectorReplicate( Scale ); + + // Compute and transform the corners and find new min/max bounds. 
+ XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); + Corner = XMVectorAdd( XMVector3Rotate( XMVectorMultiply( Corner, VectorScale ), Rotation ), Translation ); + + XMVECTOR Min, Max; + Min = Max = Corner; + + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + Corner = XMVectorAdd( XMVector3Rotate( XMVectorMultiply( Corner, VectorScale ), Rotation ), Translation ); + + Min = XMVectorMin( Min, Corner ); + Max = XMVectorMax( Max, Corner ); + } + + // Store center and extents. + XMStoreFloat3( &Out.Center, XMVectorScale( XMVectorAdd( Min, Max ), 0.5f ) ); + XMStoreFloat3( &Out.Extents, XMVectorScale( XMVectorSubtract( Max, Min ), 0.5f ) ); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != nullptr ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + return XMVector3InBounds( XMVectorSubtract( Point, vCenter ), vExtents ) ? 
CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + if ( !Intersects(V0,V1,V2) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR d = XMVectorAbs( XMVectorSubtract( V0, vCenter ) ); + XMVECTOR Inside = XMVectorLessOrEqual( d, vExtents ); + + d = XMVectorAbs( XMVectorSubtract( V1, vCenter ) ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + d = XMVectorAbs( XMVectorSubtract( V2, vCenter ) ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = XMVectorSubtract( BoxCenter, BoxExtents ); + XMVECTOR BoxMax = XMVectorAdd( BoxCenter, BoxExtents ); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. 
+ XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); + + XMVECTOR MinDelta = XMVectorSubtract( SphereCenter, BoxMin ); + XMVECTOR MaxDelta = XMVectorSubtract( SphereCenter, BoxMax ); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + + if ( XMVector3Greater( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ) + return DISJOINT; + + XMVECTOR InsideAll = XMVectorLessOrEqual( XMVectorAdd( BoxMin, SphereRadius ), SphereCenter ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( SphereCenter, XMVectorSubtract( BoxMax, SphereRadius ) ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorGreater( XMVectorSubtract( BoxMax, BoxMin ), SphereRadius ) ); + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingBox& box ) const +{ + XMVECTOR CenterA = XMLoadFloat3( &Center ); + XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); + + XMVECTOR CenterB = XMLoadFloat3( &box.Center ); + XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); + + XMVECTOR MinA = XMVectorSubtract( CenterA, ExtentsA ); + XMVECTOR MaxA = XMVectorAdd( CenterA, ExtentsA ); + + XMVECTOR MinB = XMVectorSubtract( CenterB, ExtentsB ); + XMVECTOR MaxB = XMVectorAdd( CenterB, ExtentsB ); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); + + if ( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) + return DISJOINT; + + // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B + XMVECTOR Inside = XMVectorAndInt( XMVectorLessOrEqual( MinA, MinB ), XMVectorLessOrEqual( MaxB, MaxA ) ); + + return DirectX::Internal::XMVector3AllTrue( Inside ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingOrientedBox& box ) const +{ + if ( !box.Intersects( *this ) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Subtract off the AABB center to remove a subtract below + XMVECTOR oCenter = XMVectorSubtract( XMLoadFloat3( &box.Center ), vCenter ); + + XMVECTOR oExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR oOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( oOrientation ) ); + + XMVECTOR Inside = XMVectorTrueInt(); + + for( size_t i=0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorAdd( XMVector3Rotate( XMVectorMultiply( oExtents, g_BoxOffset[i] ), oOrientation ), oCenter ); + XMVECTOR d = XMVectorAbs(C); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + } + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Frustum in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects( *this ) ) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR Inside = XMVectorTrueInt(); + + for( size_t i=0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR Point = XMLoadFloat3( &Corners[i] ); + XMVECTOR d = XMVectorAbs( XMVectorSubtract( Point, vCenter ) ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + } + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = XMVectorSubtract( BoxCenter, BoxExtents ); + XMVECTOR BoxMax = XMVectorAdd( BoxCenter, BoxExtents ); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. 
+ XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); + + XMVECTOR MinDelta = XMVectorSubtract( SphereCenter, BoxMin ); + XMVECTOR MaxDelta = XMVectorSubtract( SphereCenter, BoxMax ); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + + return XMVector3LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ); +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingBox& box ) const +{ + XMVECTOR CenterA = XMLoadFloat3( &Center ); + XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); + + XMVECTOR CenterB = XMLoadFloat3( &box.Center ); + XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); + + XMVECTOR MinA = XMVectorSubtract( CenterA, ExtentsA ); + XMVECTOR MaxA = XMVectorAdd( CenterA, ExtentsA ); + + XMVECTOR MinB = XMVectorSubtract( CenterB, ExtentsB ); + XMVECTOR MaxB = XMVectorAdd( CenterB, ExtentsB ); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); + + return !DirectX::Internal::XMVector3AnyTrue( Disjoint ); +} + + +//----------------------------------------------------------------------------- +// Oriented box vs. 
axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingOrientedBox& box ) const +{ + return box.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingFrustum& fr ) const +{ + return fr.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. axis aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + XMVECTOR Zero = XMVectorZero(); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = XMVectorSubtract( vCenter, vExtents ); + XMVECTOR BoxMax = XMVectorAdd( vCenter, vExtents ); + + // Test the axes of the box (in effect test the AAB against the minimal AAB + // around the triangle). + XMVECTOR TriMin = XMVectorMin( XMVectorMin( V0, V1 ), V2 ); + XMVECTOR TriMax = XMVectorMax( XMVectorMax( V0, V1 ), V2 ); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( TriMin, BoxMax ), XMVectorGreater( BoxMin, TriMax ) ); + if( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) + return false; + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross( XMVectorSubtract( V1, V0 ), XMVectorSubtract( V2, V0 ) ); + XMVECTOR Dist = XMVector3Dot( Normal, V0 ); + + // Assert that the triangle is not degenerate. 
+ assert( !XMVector3Equal( Normal, Zero ) ); + + // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i) + // else v_min(i)=b_max(i), v_max(i)=b_min(i) + XMVECTOR NormalSelect = XMVectorGreater( Normal, Zero ); + XMVECTOR V_Min = XMVectorSelect( BoxMax, BoxMin, NormalSelect ); + XMVECTOR V_Max = XMVectorSelect( BoxMin, BoxMax, NormalSelect ); + + // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint + XMVECTOR MinDist = XMVector3Dot( V_Min, Normal ); + XMVECTOR MaxDist = XMVector3Dot( V_Max, Normal ); + + XMVECTOR NoIntersection = XMVectorGreater( MinDist, Dist ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( MaxDist, Dist ) ); + + // Move the box center to zero to simplify the following tests. + XMVECTOR TV0 = XMVectorSubtract( V0, vCenter ); + XMVECTOR TV1 = XMVectorSubtract( V1, vCenter ); + XMVECTOR TV2 = XMVectorSubtract( V2, vCenter ); + + // Test the edge/edge axes (3*3). + XMVECTOR e0 = XMVectorSubtract( TV1, TV0 ); + XMVECTOR e1 = XMVectorSubtract( TV2, TV1 ); + XMVECTOR e2 = XMVectorSubtract( TV0, TV2 ); + + // Make w zero. 
+ e0 = XMVectorInsert<0, 0, 0, 0, 1>( e0, Zero ); + e1 = XMVectorInsert<0, 0, 0, 0, 1>( e1, Zero ); + e2 = XMVectorInsert<0, 0, 0, 0, 1>( e2, Zero ); + + XMVECTOR Axis; + XMVECTOR p0, p1, p2; + XMVECTOR Min, Max; + XMVECTOR Radius; + + // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y) + Axis = XMVectorPermute( e0, XMVectorNegate( e0 ) ); + p0 = XMVector3Dot( TV0, Axis ); + // p1 = XMVector3Dot( V1, Axis ); // p1 = p0; + p2 = XMVector3Dot( TV2, Axis ); + Min = XMVectorMin( p0, p2 ); + Max = XMVectorMax( p0, p2 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y) + Axis = XMVectorPermute( e1, XMVectorNegate( e1 ) ); + p0 = XMVector3Dot( TV0, Axis ); + p1 = XMVector3Dot( TV1, Axis ); + // p2 = XMVector3Dot( V2, Axis ); // p2 = p1; + Min = XMVectorMin( p0, p1 ); + Max = XMVectorMax( p0, p1 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y) + Axis = XMVectorPermute( e2, XMVectorNegate( e2 ) ); + p0 = XMVector3Dot( TV0, Axis ); + p1 = XMVector3Dot( TV1, Axis ); + // p2 = XMVector3Dot( V2, Axis ); // p2 = p0; + Min = XMVectorMin( p0, p1 ); + Max = XMVectorMax( p0, p1 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x) + Axis = XMVectorPermute( e0, XMVectorNegate( e0 ) ); + p0 = XMVector3Dot( TV0, Axis ); + // p1 = XMVector3Dot( V1, Axis ); // p1 = p0; + p2 = XMVector3Dot( 
TV2, Axis ); + Min = XMVectorMin( p0, p2 ); + Max = XMVectorMax( p0, p2 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x) + Axis = XMVectorPermute( e1, XMVectorNegate( e1 ) ); + p0 = XMVector3Dot( TV0, Axis ); + p1 = XMVector3Dot( TV1, Axis ); + // p2 = XMVector3Dot( V2, Axis ); // p2 = p1; + Min = XMVectorMin( p0, p1 ); + Max = XMVectorMax( p0, p1 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x) + Axis = XMVectorPermute( e2, XMVectorNegate( e2 ) ); + p0 = XMVector3Dot( TV0, Axis ); + p1 = XMVector3Dot( TV1, Axis ); + // p2 = XMVector3Dot( V2, Axis ); // p2 = p0; + Min = XMVectorMin( p0, p1 ); + Max = XMVectorMax( p0, p1 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0) + Axis = XMVectorPermute( e0, XMVectorNegate( e0 ) ); + p0 = XMVector3Dot( TV0, Axis ); + // p1 = XMVector3Dot( V1, Axis ); // p1 = p0; + p2 = XMVector3Dot( TV2, Axis ); + Min = XMVectorMin( p0, p2 ); + Max = XMVectorMax( p0, p2 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0) + Axis = XMVectorPermute( e1, XMVectorNegate( e1 ) ); + p0 = XMVector3Dot( 
TV0, Axis ); + p1 = XMVector3Dot( TV1, Axis ); + // p2 = XMVector3Dot( V2, Axis ); // p2 = p1; + Min = XMVectorMin( p0, p1 ); + Max = XMVectorMax( p0, p1 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0) + Axis = XMVectorPermute( e2, XMVectorNegate( e2 ) ); + p0 = XMVector3Dot( TV0, Axis ); + p1 = XMVector3Dot( TV1, Axis ); + // p2 = XMVector3Dot( V2, Axis ); // p2 = p0; + Min = XMVectorMin( p0, p1 ); + Max = XMVectorMax( p0, p1 ); + Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, XMVectorNegate( Radius ) ) ); + + return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane, Outside, Inside ); + + // If the box is outside any plane it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return FRONT; + + // If the box is inside all planes it is inside. + if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) ) + return BACK; + + // The box is not inside all planes or outside a plane it intersects. 
+ return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with an axis aligned +// box using the slabs method. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const +{ + assert( DirectX::Internal::XMVector3IsUnit( Direction ) ); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Adjust ray origin to be relative to center of the box. + XMVECTOR TOrigin = XMVectorSubtract( vCenter, Origin ); + + // Compute the dot product against each axis of the box. + // Since the axes are (1,0,0), (0,1,0), (0,0,1) no computation is necessary. + XMVECTOR AxisDotOrigin = TOrigin; + XMVECTOR AxisDotDirection = Direction; + + // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab. + XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ); + + // Test against all three axes simultaneously. + XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection ); + XMVECTOR t1 = XMVectorMultiply( XMVectorSubtract( AxisDotOrigin, vExtents ), InverseAxisDotDirection ); + XMVECTOR t2 = XMVectorMultiply( XMVectorAdd( AxisDotOrigin, vExtents ), InverseAxisDotDirection ); + + // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't + // use the results from any directions parallel to the slab. 
+ XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel ); + XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel ); + + // t_min.x = maximum( t_min.x, t_min.y, t_min.z ); + // t_max.x = minimum( t_max.x, t_max.y, t_max.z ); + t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) ); // x = max(x,y) + t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) ); // x = max(max(x,y),z) + t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) ); // x = min(x,y) + t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) ); // x = min(min(x,y),z) + + // if ( t_min > t_max ) return false; + XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) ); + + // if ( t_max < 0.0f ) return false; + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) ); + + // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false; + XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) ); + + if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) ) + { + // Store the x-component to Dist + XMStoreFloat( &Dist, t_min ); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test an axis aligned box vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const +{ + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Set w of the center to one so we can dot4 with a plane. 
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the box is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the box is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The box is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains two other bounding boxes +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateMerged( BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2 ) +{ + XMVECTOR b1Center = XMLoadFloat3( &b1.Center ); + XMVECTOR b1Extents = XMLoadFloat3( &b1.Extents ); + + XMVECTOR b2Center = XMLoadFloat3( &b2.Center ); + XMVECTOR b2Extents = XMLoadFloat3( &b2.Extents ); + + XMVECTOR Min = XMVectorSubtract( b1Center, b1Extents ); + Min = XMVectorMin( Min, XMVectorSubtract( b2Center, b2Extents ) ); + + XMVECTOR Max = XMVectorAdd( b1Center, b1Extents ); + Max = XMVectorMax( Max, XMVectorAdd( b2Center, b2Extents ) ); + + assert( XMVector3LessOrEqual( Min, Max ) ); + + XMStoreFloat3( &Out.Center, XMVectorScale( XMVectorAdd( Min, Max ), 0.5f ) ); + XMStoreFloat3( &Out.Extents, XMVectorScale( XMVectorSubtract( Max, Min ), 0.5f ) ); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains a bounding sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateFromSphere( BoundingBox& Out, const BoundingSphere& sh ) +{ + XMVECTOR spCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR shRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR Min = XMVectorSubtract( spCenter, shRadius ); + XMVECTOR Max = XMVectorAdd( spCenter, shRadius ); + + assert( XMVector3LessOrEqual( Min, Max ) ); + + XMStoreFloat3( &Out.Center, XMVectorScale( XMVectorAdd( Min, Max ), 0.5f ) ); + XMStoreFloat3( &Out.Extents, XMVectorScale( XMVectorSubtract( Max, Min ), 0.5f ) ); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box from min/max points 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingBox::CreateFromPoints( BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2 ) +{ + XMVECTOR Min = XMVectorMin( pt1, pt2 ); + XMVECTOR Max = XMVectorMax( pt1, pt2 ); + + // Store center and extents. + XMStoreFloat3( &Out.Center, XMVectorScale( XMVectorAdd( Min, Max ), 0.5f ) ); + XMStoreFloat3( &Out.Extents, XMVectorScale( XMVectorSubtract( Max, Min ), 0.5f ) ); +} + + +//----------------------------------------------------------------------------- +// Find the minimum axis aligned bounding box containing a set of points. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateFromPoints( BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride ) +{ + assert( Count > 0 ); + assert( pPoints ); + + // Find the minimum and maximum x, y, and z + XMVECTOR vMin, vMax; + + vMin = vMax = XMLoadFloat3( pPoints ); + + for( size_t i = 1; i < Count; ++i ) + { + XMVECTOR Point = XMLoadFloat3( reinterpret_cast( reinterpret_cast(pPoints) + i * Stride ) ); + + vMin = XMVectorMin( vMin, Point ); + vMax = XMVectorMax( vMax, Point ); + } + + // Store center and extents. + XMStoreFloat3( &Out.Center, XMVectorScale( XMVectorAdd( vMin, vMax ), 0.5f ) ); + XMStoreFloat3( &Out.Extents, XMVectorScale( XMVectorSubtract( vMax, vMin ), 0.5f ) ); +} + + +/**************************************************************************** + * + * BoundingOrientedBox + * + ****************************************************************************/ + +//----------------------------------------------------------------------------- +// Transform an oriented box by an angle preserving transform. 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, FXMMATRIX M ) const +{ + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the box rotation and the transform rotation. + XMMATRIX nM; + nM.r[0] = XMVector3Normalize( M.r[0] ); + nM.r[1] = XMVector3Normalize( M.r[1] ); + nM.r[2] = XMVector3Normalize( M.r[2] ); + nM.r[3] = g_XMIdentityR3; + XMVECTOR Rotation = XMQuaternionRotationMatrix( nM ); + vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the center. + vCenter = XMVector3Transform( vCenter, M ); + + // Scale the box extents. + XMVECTOR dX = XMVector3Length( M.r[0] ); + XMVECTOR dY = XMVector3Length( M.r[1] ); + XMVECTOR dZ = XMVector3Length( M.r[2] ); + + XMVECTOR VectorScale = XMVectorSelect( dY, dX, g_XMSelect1000 ); + VectorScale = XMVectorSelect( dZ, VectorScale, g_XMSelect1100 ); + vExtents = XMVectorMultiply( vExtents, VectorScale ); + + // Store the box. + XMStoreFloat3( &Out.Center, vCenter ); + XMStoreFloat3( &Out.Extents, vExtents ); + XMStoreFloat4( &Out.Orientation, vOrientation ); +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const +{ + assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the box rotation and the transform rotation. 
+ vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the center. + XMVECTOR VectorScale = XMVectorReplicate( Scale ); + vCenter = XMVectorAdd( XMVector3Rotate( XMVectorMultiply( vCenter, VectorScale ), Rotation ), Translation ); + + // Scale the box extents. + vExtents = XMVectorMultiply( vExtents, VectorScale ); + + // Store the box. + XMStoreFloat3( &Out.Center, vCenter ); + XMStoreFloat3( &Out.Extents, vExtents ); + XMStoreFloat4( &Out.Orientation, vOrientation ); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != nullptr ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorAdd( XMVector3Rotate( XMVectorMultiply( vExtents, g_BoxOffset[i] ), vOrientation ), vCenter ); + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the point to be local to the box. + XMVECTOR TPoint = XMVector3InverseRotate( XMVectorSubtract( Point, vCenter ), vOrientation ); + + return XMVector3InBounds( TPoint, vExtents ) ? 
CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the triangle vertices into the space of the box. + XMVECTOR TV0 = XMVector3InverseRotate( XMVectorSubtract( V0, vCenter ), vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( XMVectorSubtract( V1, vCenter ), vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( XMVectorSubtract( V2, vCenter ), vOrientation ); + + BoundingBox box; + box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f ); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. + return box.Contains( TV0, TV1, TV2 ); +} + + +//----------------------------------------------------------------------------- +// Sphere in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate( XMVectorSubtract( SphereCenter, BoxCenter ), BoxOrientation ); + + // Find the distance to the nearest point on the box. 
+ // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, XMVectorNegate( BoxExtents ) ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents ); + + XMVECTOR MinDelta = XMVectorAdd( SphereCenter, BoxExtents ); + XMVECTOR MaxDelta = XMVectorSubtract( SphereCenter, BoxExtents ); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + XMVECTOR SphereRadiusSq = XMVectorMultiply( SphereRadius, SphereRadius ); + + if ( XMVector4Greater( d2, SphereRadiusSq ) ) + return DISJOINT; + + // See if we are completely inside the box + XMVECTOR SMin = XMVectorSubtract( SphereCenter, SphereRadius ); + XMVECTOR SMax = XMVectorAdd( SphereCenter, SphereRadius ); + + return ( XMVector3InBounds( SMin, BoxExtents ) && XMVector3InBounds( SMax, BoxExtents ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingBox& box ) const +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. 
+ BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); + return Contains( obox ); +} + + +//----------------------------------------------------------------------------- +// Oriented bounding box in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingOrientedBox& box ) const +{ + if ( !Intersects(box) ) + return DISJOINT; + + // Load the boxes + XMVECTOR aCenter = XMLoadFloat3( &Center ); + XMVECTOR aExtents = XMLoadFloat3( &Extents ); + XMVECTOR aOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( aOrientation ) ); + + XMVECTOR bCenter = XMLoadFloat3( &box.Center ); + XMVECTOR bExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR bOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( bOrientation ) ); + + XMVECTOR offset = XMVectorSubtract( bCenter, aCenter ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter + // Ca = invrotate( Cb - aCenter, aOrientation ) + + XMVECTOR C = XMVectorAdd( XMVector3Rotate( XMVectorMultiply( bExtents, g_BoxOffset[i] ), bOrientation ), offset ); + C = XMVector3InverseRotate( C , aOrientation ); + + if ( !XMVector3InBounds( C, aExtents ) ) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Frustum in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects(*this) ) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR 
vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3InverseRotate( XMVectorSubtract( XMLoadFloat3( &Corners[i] ), vCenter ), vOrientation ); + + if ( !XMVector3InBounds( C, vExtents ) ) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate( XMVectorSubtract( SphereCenter, BoxCenter ), BoxOrientation ); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, XMVectorNegate( BoxExtents ) ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents ); + + XMVECTOR MinDelta = XMVectorAdd( SphereCenter, BoxExtents ); + XMVECTOR MaxDelta = XMVectorSubtract( SphereCenter, BoxExtents ); + + // Choose value for each dimension based on the comparison. 
+ d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + + return XMVector4LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ? true : false; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( const BoundingBox& box ) const +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. + BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); + return Intersects( obox ); +} + + +//----------------------------------------------------------------------------- +// Fast oriented box / oriented box intersection test using the separating axis +// theorem. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( const BoundingOrientedBox& box ) const +{ + // Build the 3x3 rotation matrix that defines the orientation of B relative to A. + XMVECTOR A_quat = XMLoadFloat4( &Orientation ); + XMVECTOR B_quat = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( A_quat ) ); + assert( DirectX::Internal::XMQuaternionIsUnit( B_quat ) ); + + XMVECTOR Q = XMQuaternionMultiply( A_quat, XMQuaternionConjugate( B_quat ) ); + XMMATRIX R = XMMatrixRotationQuaternion( Q ); + + // Compute the translation of B relative to A. + XMVECTOR A_cent = XMLoadFloat3( &Center ); + XMVECTOR B_cent = XMLoadFloat3( &box.Center ); + XMVECTOR t = XMVector3InverseRotate( XMVectorSubtract( B_cent, A_cent ), A_quat ); + + // + // h(A) = extents of A. + // h(B) = extents of B. 
+ // + // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1) + // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22) + // + // For each possible separating axis l: + // d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l ) + // d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l ) + // if abs( t dot l ) > d(A) + d(B) then disjoint + // + + // Load extents of A and B. + XMVECTOR h_A = XMLoadFloat3( &Extents ); + XMVECTOR h_B = XMLoadFloat3( &box.Extents ); + + // Rows. Note R[0,1,2]X.w = 0. + XMVECTOR R0X = R.r[0]; + XMVECTOR R1X = R.r[1]; + XMVECTOR R2X = R.r[2]; + + R = XMMatrixTranspose( R ); + + // Columns. Note RX[0,1,2].w = 0. + XMVECTOR RX0 = R.r[0]; + XMVECTOR RX1 = R.r[1]; + XMVECTOR RX2 = R.r[2]; + + // Absolute value of rows. + XMVECTOR AR0X = XMVectorAbs( R0X ); + XMVECTOR AR1X = XMVectorAbs( R1X ); + XMVECTOR AR2X = XMVectorAbs( R2X ); + + // Absolute value of columns. + XMVECTOR ARX0 = XMVectorAbs( RX0 ); + XMVECTOR ARX1 = XMVectorAbs( RX1 ); + XMVECTOR ARX2 = XMVectorAbs( RX2 ); + + // Test each of the 15 possible separating axes. 
+ XMVECTOR d, d_A, d_B; + + // l = a(u) = (1, 0, 0) + // t dot l = t.x + // d(A) = h(A).x + // d(B) = h(B) dot abs(r00, r01, r02) + d = XMVectorSplatX( t ); + d_A = XMVectorSplatX( h_A ); + d_B = XMVector3Dot( h_B, AR0X ); + XMVECTOR NoIntersection = XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ); + + // l = a(v) = (0, 1, 0) + // t dot l = t.y + // d(A) = h(A).y + // d(B) = h(B) dot abs(r10, r11, r12) + d = XMVectorSplatY( t ); + d_A = XMVectorSplatY( h_A ); + d_B = XMVector3Dot( h_B, AR1X ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) = (0, 0, 1) + // t dot l = t.z + // d(A) = h(A).z + // d(B) = h(B) dot abs(r20, r21, r22) + d = XMVectorSplatZ( t ); + d_A = XMVectorSplatZ( h_A ); + d_B = XMVector3Dot( h_B, AR2X ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = b(u) = (r00, r10, r20) + // d(A) = h(A) dot abs(r00, r10, r20) + // d(B) = h(B).x + d = XMVector3Dot( t, RX0 ); + d_A = XMVector3Dot( h_A, ARX0 ); + d_B = XMVectorSplatX( h_B ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = b(v) = (r01, r11, r21) + // d(A) = h(A) dot abs(r01, r11, r21) + // d(B) = h(B).y + d = XMVector3Dot( t, RX1 ); + d_A = XMVector3Dot( h_A, ARX1 ); + d_B = XMVectorSplatY( h_B ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = b(w) = (r02, r12, r22) + // d(A) = h(A) dot abs(r02, r12, r22) + // d(B) = h(B).z + d = XMVector3Dot( t, RX2 ); + d_A = XMVector3Dot( h_A, ARX2 ); + d_B = XMVectorSplatZ( h_B ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(u) x b(u) = (0, -r20, r10) + // d(A) = h(A) dot abs(0, r20, r10) + // d(B) = h(B) dot abs(0, r02, r01) + d = XMVector3Dot( t, 
XMVectorPermute( RX0, XMVectorNegate( RX0 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX0 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR0X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(u) x b(v) = (0, -r21, r11) + // d(A) = h(A) dot abs(0, r21, r11) + // d(B) = h(B) dot abs(r02, 0, r00) + d = XMVector3Dot( t, XMVectorPermute( RX1, XMVectorNegate( RX1 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX1 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR0X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(u) x b(w) = (0, -r22, r12) + // d(A) = h(A) dot abs(0, r22, r12) + // d(B) = h(B) dot abs(r01, r00, 0) + d = XMVector3Dot( t, XMVectorPermute( RX2, XMVectorNegate( RX2 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX2 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR0X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(v) x b(u) = (r20, 0, -r00) + // d(A) = h(A) dot abs(r20, 0, r00) + // d(B) = h(B) dot abs(0, r12, r11) + d = XMVector3Dot( t, XMVectorPermute( RX0, XMVectorNegate( RX0 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX0 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR1X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(v) x b(v) = (r21, 0, -r01) + // d(A) = h(A) dot abs(r21, 0, r01) + // d(B) = h(B) dot abs(r12, 0, r10) + d = XMVector3Dot( t, XMVectorPermute( RX1, XMVectorNegate( RX1 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX1 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR1X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(v) x b(w) = (r22, 0, -r02) + // d(A) = h(A) dot abs(r22, 0, r02) + // d(B) 
= h(B) dot abs(r11, r10, 0) + d = XMVector3Dot( t, XMVectorPermute( RX2, XMVectorNegate( RX2 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX2 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR1X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) x b(u) = (-r10, r00, 0) + // d(A) = h(A) dot abs(r10, r00, 0) + // d(B) = h(B) dot abs(0, r22, r21) + d = XMVector3Dot( t, XMVectorPermute( RX0, XMVectorNegate( RX0 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX0 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR2X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) x b(v) = (-r11, r01, 0) + // d(A) = h(A) dot abs(r11, r01, 0) + // d(B) = h(B) dot abs(r22, 0, r20) + d = XMVector3Dot( t, XMVectorPermute( RX1, XMVectorNegate( RX1 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX1 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR2X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) x b(w) = (-r12, r02, 0) + // d(A) = h(A) dot abs(r12, r02, 0) + // d(B) = h(B) dot abs(r21, r20, 0) + d = XMVector3Dot( t, XMVectorPermute( RX2, XMVectorNegate( RX2 ) ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle( ARX2 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle( AR2X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // No separating axis found, boxes must intersect. + return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ? true : false; +} + + +//----------------------------------------------------------------------------- +// Frustum vs. 
oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( const BoundingFrustum& fr ) const +{ + return fr.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the triangle vertices into the space of the box. + XMVECTOR TV0 = XMVector3InverseRotate( XMVectorSubtract( V0, vCenter ), vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( XMVectorSubtract( V1, vCenter ), vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( XMVectorSubtract( V2, vCenter ), vOrientation ); + + BoundingBox box; + box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f ); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. + return box.Intersects( TV0, TV1, TV2 ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + // Build the 3x3 rotation matrix that defines the box axes. 
+ XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation ); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside ); + + // If the box is outside any plane it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return FRONT; + + // If the box is inside all planes it is inside. + if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) ) + return BACK; + + // The box is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with an oriented box +// using the slabs method. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const +{ + assert( DirectX::Internal::XMVector3IsUnit( Direction ) ); + + static const XMVECTORU32 SelectY = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 }; + static const XMVECTORU32 SelectZ = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Get the box's normalized side directions. + XMMATRIX R = XMMatrixRotationQuaternion( vOrientation ); + + // Adjust ray origin to be relative to center of the box. + XMVECTOR TOrigin = XMVectorSubtract( vCenter, Origin ); + + // Compute the dot product against each axis of the box. 
+ XMVECTOR AxisDotOrigin = XMVector3Dot( R.r[0], TOrigin ); + AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[1], TOrigin ), SelectY ); + AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[2], TOrigin ), SelectZ ); + + XMVECTOR AxisDotDirection = XMVector3Dot( R.r[0], Direction ); + AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[1], Direction ), SelectY ); + AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[2], Direction ), SelectZ ); + + // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab. + XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ); + + // Test against all three axes simultaneously. + XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection ); + XMVECTOR t1 = XMVectorMultiply( XMVectorSubtract( AxisDotOrigin, vExtents ), InverseAxisDotDirection ); + XMVECTOR t2 = XMVectorMultiply( XMVectorAdd( AxisDotOrigin, vExtents ), InverseAxisDotDirection ); + + // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't + // use the results from any directions parallel to the slab. 
+ XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel ); + XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel ); + + // t_min.x = maximum( t_min.x, t_min.y, t_min.z ); + // t_max.x = minimum( t_max.x, t_max.y, t_max.z ); + t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) ); // x = max(x,y) + t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) ); // x = max(max(x,y),z) + t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) ); // x = min(x,y) + t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) ); // x = min(min(x,y),z) + + // if ( t_min > t_max ) return false; + XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) ); + + // if ( t_max < 0.0f ) return false; + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) ); + + // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false; + XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) ); + + if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) ) + { + // Store the x-component to *pDist + XMStoreFloat( &Dist, t_min ); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test an oriented box vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const +{ + // Load the box. 
+ XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation ); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the box is outside any plane it is outside. 
+ if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the box is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The box is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Create oriented bounding box from axis-aligned bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::CreateFromBoundingBox( BoundingOrientedBox& Out, const BoundingBox& box ) +{ + Out.Center = box.Center; + Out.Extents = box.Extents; + Out.Orientation = XMFLOAT4( 0.f, 0.f, 0.f, 1.f ); +} + + +//----------------------------------------------------------------------------- +// Find the approximate minimum oriented bounding box containing a set of +// points. Exact computation of minimum oriented bounding box is possible but +// is slower and requires a more complex algorithm. +// The algorithm works by computing the inertia tensor of the points and then +// using the eigenvectors of the intertia tensor as the axes of the box. +// Computing the intertia tensor of the convex hull of the points will usually +// result in better bounding box but the computation is more complex. +// Exact computation of the minimum oriented bounding box is possible but the +// best know algorithm is O(N^3) and is significanly more complex to implement. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::CreateFromPoints( BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride ) +{ + assert( Count > 0 ); + assert( pPoints != nullptr ); + + XMVECTOR CenterOfMass = XMVectorZero(); + + // Compute the center of mass and inertia tensor of the points. 
+ for( size_t i = 0; i < Count; ++i ) + { + XMVECTOR Point = XMLoadFloat3( reinterpret_cast( reinterpret_cast(pPoints) + i * Stride ) ); + + CenterOfMass = XMVectorAdd( CenterOfMass, Point ); + } + + CenterOfMass = XMVectorMultiply( CenterOfMass, XMVectorReciprocal( XMVectorReplicate( float( Count ) ) ) ); + + // Compute the inertia tensor of the points around the center of mass. + // Using the center of mass is not strictly necessary, but will hopefully + // improve the stability of finding the eigenvectors. + XMVECTOR XX_YY_ZZ = XMVectorZero(); + XMVECTOR XY_XZ_YZ = XMVectorZero(); + + for( size_t i = 0; i < Count; ++i ) + { + XMVECTOR Point = XMVectorSubtract( XMLoadFloat3( reinterpret_cast( reinterpret_cast(pPoints) + i * Stride ) ), CenterOfMass ); + + XX_YY_ZZ = XMVectorAdd( XX_YY_ZZ, XMVectorMultiply( Point, Point ) ); + + XMVECTOR XXY = XMVectorSwizzle( Point ); + XMVECTOR YZZ = XMVectorSwizzle( Point ); + + XY_XZ_YZ = XMVectorAdd( XY_XZ_YZ, XMVectorMultiply( XXY, YZZ ) ); + } + + XMVECTOR v1, v2, v3; + + // Compute the eigenvectors of the inertia tensor. + DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix( XMVectorGetX( XX_YY_ZZ ), XMVectorGetY( XX_YY_ZZ ), + XMVectorGetZ( XX_YY_ZZ ), + XMVectorGetX( XY_XZ_YZ ), XMVectorGetY( XY_XZ_YZ ), + XMVectorGetZ( XY_XZ_YZ ), + &v1, &v2, &v3 ); + + // Put them in a matrix. + XMMATRIX R; + + R.r[0] = XMVectorSetW( v1, 0.f ); + R.r[1] = XMVectorSetW( v2, 0.f ); + R.r[2] = XMVectorSetW( v3, 0.f ); + R.r[3] = g_XMIdentityR3.v; + + // Multiply by -1 to convert the matrix into a right handed coordinate + // system (Det ~= 1) in case the eigenvectors form a left handed + // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only + // works on right handed matrices. 
+ XMVECTOR Det = XMMatrixDeterminant( R ); + + if( XMVector4Less( Det, XMVectorZero() ) ) + { + R.r[0] = XMVectorMultiply( R.r[0], g_XMNegativeOne.v ); + R.r[1] = XMVectorMultiply( R.r[1], g_XMNegativeOne.v ); + R.r[2] = XMVectorMultiply( R.r[2], g_XMNegativeOne.v ); + } + + // Get the rotation quaternion from the matrix. + XMVECTOR vOrientation = XMQuaternionRotationMatrix( R ); + + // Make sure it is normal (in case the vectors are slightly non-orthogonal). + vOrientation = XMQuaternionNormalize( vOrientation ); + + // Rebuild the rotation matrix from the quaternion. + R = XMMatrixRotationQuaternion( vOrientation ); + + // Build the rotation into the rotated space. + XMMATRIX InverseR = XMMatrixTranspose( R ); + + // Find the minimum OBB using the eigenvectors as the axes. + XMVECTOR vMin, vMax; + + vMin = vMax = XMVector3TransformNormal( XMLoadFloat3( pPoints ), InverseR ); + + for( size_t i = 1; i < Count; ++i ) + { + XMVECTOR Point = XMVector3TransformNormal( XMLoadFloat3( reinterpret_cast( reinterpret_cast(pPoints) + i * Stride ) ), + InverseR ); + + vMin = XMVectorMin( vMin, Point ); + vMax = XMVectorMax( vMax, Point ); + } + + // Rotate the center into world space. + XMVECTOR vCenter = XMVectorScale( XMVectorAdd( vMin, vMax ), 0.5f ); + vCenter = XMVector3TransformNormal( vCenter, R ); + + // Store center, extents, and orientation. 
+ XMStoreFloat3( &Out.Center, vCenter ); + XMStoreFloat3( &Out.Extents, XMVectorScale( XMVectorSubtract( vMax, vMin ), 0.5f ) ); + XMStoreFloat4( &Out.Orientation, vOrientation ); +} + + +/**************************************************************************** + * + * BoundingFrustum + * + ****************************************************************************/ + +_Use_decl_annotations_ +inline BoundingFrustum::BoundingFrustum( CXMMATRIX Projection ) +{ + CreateFromMatrix(*this, Projection); +} + + +//----------------------------------------------------------------------------- +// Transform a frustum by an angle preserving transform. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, FXMMATRIX M ) const +{ + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the frustum rotation and the transform rotation + XMMATRIX nM; + nM.r[0] = XMVector3Normalize( M.r[0] ); + nM.r[1] = XMVector3Normalize( M.r[1] ); + nM.r[2] = XMVector3Normalize( M.r[2] ); + nM.r[3] = g_XMIdentityR3; + XMVECTOR Rotation = XMQuaternionRotationMatrix( nM ); + vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the center. + vOrigin = XMVector3Transform( vOrigin, M ); + + // Store the frustum. + XMStoreFloat3( &Out.Origin, vOrigin ); + XMStoreFloat4( &Out.Orientation, vOrientation ); + + // Scale the near and far distances (the slopes remain the same). + XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] ); + XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] ); + XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] ); + + XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) ); + float Scale = sqrtf( XMVectorGetX(d) ); + + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. 
+ Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const +{ + assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); + + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the frustum rotation and the transform rotation. + vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the origin. + vOrigin = XMVectorAdd( XMVector3Rotate( XMVectorScale( vOrigin, Scale ), Rotation ), Translation ); + + // Store the frustum. + XMStoreFloat3( &Out.Origin, vOrigin ); + XMStoreFloat4( &Out.Orientation, vOrientation ); + + // Scale the near and far distances (the slopes remain the same). + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != nullptr ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Build the corners of the frustum. 
+ XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + // Returns 8 corners position of bounding frustum. + // Near Far + // 0----1 4----5 + // | | | | + // | | | | + // 3----2 7----6 + + XMVECTOR vCorners[CORNER_COUNT]; + vCorners[0] = XMVectorMultiply( vLeftTop, vNear ); + vCorners[1] = XMVectorMultiply( vRightTop, vNear ); + vCorners[2] = XMVectorMultiply( vRightBottom, vNear ); + vCorners[3] = XMVectorMultiply( vLeftBottom, vNear ); + vCorners[4] = XMVectorMultiply( vLeftTop, vFar ); + vCorners[5] = XMVectorMultiply( vRightTop, vFar ); + vCorners[6] = XMVectorMultiply( vRightBottom, vFar ); + vCorners[7] = XMVectorMultiply( vLeftBottom, vFar ); + + for( size_t i=0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorAdd( XMVector3Rotate( vCorners[i], vOrientation ), vOrigin ); + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR Point ) const +{ + // Build frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Transform point into local space of frustum. + XMVECTOR TPoint = XMVector3InverseRotate( XMVectorSubtract( Point, vOrigin ), vOrientation ); + + // Set w to one. + TPoint = XMVectorInsert<0, 0, 0, 0, 1>( TPoint, XMVectorSplatOne() ); + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Outside = Zero; + + // Test point against each plane of the frustum. + for( size_t i = 0; i < 6; ++i ) + { + XMVECTOR Dot = XMVector4Dot( TPoint, Planes[i] ); + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dot, Zero ) ); + } + + return XMVector4NotEqualInt( Outside, XMVectorTrueInt() ) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return TriangleTests::ContainedBy( V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingSphere& sh ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return sh.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingBox& box ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingOrientedBox& box ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingFrustum& fr ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return fr.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +// Exact sphere vs frustum test. 
The algorithm first checks the sphere against +// the planes of the frustum, then if the plane checks were indeterminate finds +// the nearest feature (plane, line, point) on the frustum to the center of the +// sphere and compares the distance to the nearest feature to the radius of the +// sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Normalize the planes so we can compare to the sphere radius. + Planes[2] = XMVector3Normalize( Planes[2] ); + Planes[3] = XMVector3Normalize( Planes[3] ); + Planes[4] = XMVector3Normalize( Planes[4] ); + Planes[5] = XMVector3Normalize( Planes[5] ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &sh.Radius ); + + // Transform the center of the sphere into the local space of frustum. + vCenter = XMVector3InverseRotate( XMVectorSubtract( vCenter, vOrigin ), vOrientation ); + + // Set w of the center to one so we can dot4 with the plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + // Check against each plane of the frustum. 
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + XMVECTOR Dist[6]; + + for( size_t i = 0; i < 6; ++i ) + { + Dist[i] = XMVector4Dot( vCenter, Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist[i], vRadius ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist[i], XMVectorNegate( vRadius ) ) ); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist[i], Zero ) ); + } + + // If the sphere is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the sphere is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // If the center of the sphere is inside all planes and the sphere intersects + // one or more planes then it must intersect. + if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) + return true; + + // The sphere may be outside the frustum or intersecting the frustum. + // Find the nearest feature (face, edge, or corner) on the frustum + // to the sphere. + + // The faces adjacent to each face are: + static const size_t adjacent_faces[6][4] = + { + { 2, 3, 4, 5 }, // 0 + { 2, 3, 4, 5 }, // 1 + { 0, 1, 4, 5 }, // 2 + { 0, 1, 4, 5 }, // 3 + { 0, 1, 2, 3 }, // 4 + { 0, 1, 2, 3 } + }; // 5 + + XMVECTOR Intersects = XMVectorFalseInt(); + + // Check to see if the nearest feature is one of the planes. + for( size_t i = 0; i < 6; ++i ) + { + // Find the nearest point on the plane to the center of the sphere. + XMVECTOR Point = XMVectorNegativeMultiplySubtract( Planes[i], Dist[i], vCenter ); + + // Set w of the point to one. 
+ Point = XMVectorInsert<0, 0, 0, 0, 1>( Point, XMVectorSplatOne() ); + + // If the point is inside the face (inside the adjacent planes) then + // this plane is the nearest feature. + XMVECTOR InsideFace = XMVectorTrueInt(); + + for ( size_t j = 0; j < 4; j++ ) + { + size_t plane_index = adjacent_faces[i][j]; + + InsideFace = XMVectorAndInt( InsideFace, + XMVectorLessOrEqual( XMVector4Dot( Point, Planes[plane_index] ), Zero ) ); + } + + // Since we have already checked distance from the plane we know that the + // sphere must intersect if this plane is the nearest feature. + Intersects = XMVectorOrInt( Intersects, + XMVectorAndInt( XMVectorGreater( Dist[i], Zero ), InsideFace ) ); + } + + if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply( vRightTop, vNear ); + Corners[1] = XMVectorMultiply( vRightBottom, vNear ); + Corners[2] = XMVectorMultiply( vLeftTop, vNear ); + Corners[3] = XMVectorMultiply( vLeftBottom, vNear ); + Corners[4] = XMVectorMultiply( vRightTop, vFar ); + Corners[5] = XMVectorMultiply( vRightBottom, vFar ); + Corners[6] = XMVectorMultiply( vLeftTop, vFar ); + Corners[7] = XMVectorMultiply( vLeftBottom, vFar ); + + // The Edges are: + static const size_t edges[12][2] = + { + { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 }, // Near plane + { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 }, // Far plane + { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 }, + }; // Near to far + + XMVECTOR RadiusSq = XMVectorMultiply( vRadius, vRadius ); + + // Check to see if the 
nearest feature is one of the edges (or corners). + for( size_t i = 0; i < 12; ++i ) + { + size_t ei0 = edges[i][0]; + size_t ei1 = edges[i][1]; + + // Find the nearest point on the edge to the center of the sphere. + // The corners of the frustum are included as the endpoints of the edges. + XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint( Corners[ei0], Corners[ei1], vCenter ); + + XMVECTOR Delta = XMVectorSubtract( vCenter, Point ); + + XMVECTOR DistSq = XMVector3Dot( Delta, Delta ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersects = XMVectorOrInt( Intersects, XMVectorLessOrEqual( DistSq, RadiusSq ) ); + } + + if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) + return true; + + // The sphere must be outside the frustum. + return false; +} + + +//----------------------------------------------------------------------------- +// Exact axis aligned box vs frustum test. Constructs an oriented box and uses +// the oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingBox& box ) const +{ + // Make the axis aligned box oriented and do an OBB vs frustum test. + BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); + return Intersects( obox ); +} + + +//----------------------------------------------------------------------------- +// Exact oriented box vs frustum test. 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingOrientedBox& box ) const +{ + static const XMVECTORU32 SelectY = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 }; + static const XMVECTORU32 SelectZ = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; + + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR FrustumOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( FrustumOrientation ) ); + + // Load the box. + XMVECTOR Center = XMLoadFloat3( &box.Center ); + XMVECTOR Extents = XMLoadFloat3( &box.Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the oriented box into the space of the frustum in order to + // minimize the number of transforms we have to do. + Center = XMVector3InverseRotate( XMVectorSubtract( Center, vOrigin ), FrustumOrientation ); + BoxOrientation = XMQuaternionMultiply( BoxOrientation, XMQuaternionConjugate( FrustumOrientation ) ); + + // Set w of the center to one so we can dot4 with the plane. + Center = XMVectorInsert<0, 0, 0, 0, 1>( Center, XMVectorSplatOne() ); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation ); + + // Check against each plane of the frustum. 
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot( Center, Planes[i] ); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. + XMVECTOR Radius = XMVector3Dot( Planes[i], R.r[0] ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[1] ), SelectY ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[2] ), SelectZ ); + Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, Radius ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist, XMVectorNegate( Radius ) ) ); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist, Zero ) ); + } + + // If the box is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the box is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // If the center of the box is inside all planes and the box intersects + // one or more planes then it must intersect. + if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. 
+ XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply( vRightTop, vNear ); + Corners[1] = XMVectorMultiply( vRightBottom, vNear ); + Corners[2] = XMVectorMultiply( vLeftTop, vNear ); + Corners[3] = XMVectorMultiply( vLeftBottom, vNear ); + Corners[4] = XMVectorMultiply( vRightTop, vFar ); + Corners[5] = XMVectorMultiply( vRightBottom, vFar ); + Corners[6] = XMVectorMultiply( vLeftTop, vFar ); + Corners[7] = XMVectorMultiply( vLeftBottom, vFar ); + + // Test against box axes (3) + { + // Find the min/max values of the projection of the frustum onto each axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = XMVector3Dot( Corners[0], R.r[0] ); + FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[1] ), SelectY ); + FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[2] ), SelectZ ); + FrustumMax = FrustumMin; + + for( size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR Temp = XMVector3Dot( Corners[i], R.r[0] ); + Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[1] ), SelectY ); + Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[2] ), SelectZ ); + + FrustumMin = XMVectorMin( FrustumMin, Temp ); + FrustumMax = XMVectorMax( FrustumMax, Temp ); + } + + // Project the center of the box onto the axes. + XMVECTOR BoxDist = XMVector3Dot( Center, R.r[0] ); + BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[1] ), SelectY ); + BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[2] ), SelectZ ); + + // The projection of the box onto the axis is just its Center and Extents. 
+ // if (min > box_max || max < box_min) reject; + XMVECTOR Result = XMVectorOrInt( XMVectorGreater( FrustumMin, XMVectorAdd( BoxDist, Extents ) ), + XMVectorLess( FrustumMax, XMVectorSubtract( BoxDist, Extents ) ) ); + + if( DirectX::Internal::XMVector3AnyTrue( Result ) ) + return false; + } + + // Test against edge/edge axes (3*6). + XMVECTOR FrustumEdgeAxis[6]; + + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract( vRightTop, vLeftTop ); + FrustumEdgeAxis[5] = XMVectorSubtract( vLeftBottom, vLeftTop ); + + for( size_t i = 0; i < 3; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross( R.r[i], FrustumEdgeAxis[j] ); + + // Find the min/max values of the projection of the frustum onto the axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = FrustumMax = XMVector3Dot( Axis, Corners[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); + FrustumMin = XMVectorMin( FrustumMin, Temp ); + FrustumMax = XMVectorMax( FrustumMax, Temp ); + } + + // Project the center of the box onto the axis. + XMVECTOR Dist = XMVector3Dot( Center, Axis ); + + // Project the axes of the box onto the axis to find the "radius" of the box. 
+ XMVECTOR Radius = XMVector3Dot( Axis, R.r[0] ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[1] ), SelectY ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[2] ), SelectZ ); + Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) ); + + // if (center > max + radius || center < min - radius) reject; + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, XMVectorAdd( FrustumMax, Radius ) ) ); + Outside = XMVectorOrInt( Outside, XMVectorLess( Dist, XMVectorSubtract( FrustumMin, Radius ) ) ); + } + } + + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the box must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +// Exact frustum vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingFrustum& fr ) const +{ + // Load origin and orientation of frustum B. + XMVECTOR OriginB = XMLoadFloat3( &Origin ); + XMVECTOR OrientationB = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( OrientationB ) ); + + // Build the planes of frustum B. + XMVECTOR AxisB[6]; + AxisB[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f ); + AxisB[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f ); + AxisB[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + AxisB[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + AxisB[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + AxisB[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + XMVECTOR PlaneDistB[6]; + PlaneDistB[0] = XMVectorNegate( XMVectorReplicatePtr( &Near ) ); + PlaneDistB[1] = XMVectorReplicatePtr( &Far ); + PlaneDistB[2] = XMVectorZero(); + PlaneDistB[3] = XMVectorZero(); + PlaneDistB[4] = XMVectorZero(); + PlaneDistB[5] = XMVectorZero(); + + // Load origin and orientation of frustum A. 
+ XMVECTOR OriginA = XMLoadFloat3( &fr.Origin ); + XMVECTOR OrientationA = XMLoadFloat4( &fr.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( OrientationA ) ); + + // Transform frustum A into the space of the frustum B in order to + // minimize the number of transforms we have to do. + OriginA = XMVector3InverseRotate( XMVectorSubtract( OriginA, OriginB ), OrientationB ); + OrientationA = XMQuaternionMultiply( OrientationA, XMQuaternionConjugate( OrientationB ) ); + + // Build the corners of frustum A (in the local space of B). + XMVECTOR RightTopA = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottomA = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTopA = XMVectorSet(fr.LeftSlope,fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottomA = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR NearA = XMVectorReplicatePtr( &fr.Near ); + XMVECTOR FarA = XMVectorReplicatePtr( &fr.Far ); + + RightTopA = XMVector3Rotate( RightTopA, OrientationA ); + RightBottomA = XMVector3Rotate( RightBottomA, OrientationA ); + LeftTopA = XMVector3Rotate( LeftTopA, OrientationA ); + LeftBottomA = XMVector3Rotate( LeftBottomA, OrientationA ); + + XMVECTOR CornersA[CORNER_COUNT]; + CornersA[0] = XMVectorMultiplyAdd( RightTopA, NearA, OriginA ); + CornersA[1] = XMVectorMultiplyAdd( RightBottomA, NearA, OriginA ); + CornersA[2] = XMVectorMultiplyAdd( LeftTopA, NearA, OriginA ); + CornersA[3] = XMVectorMultiplyAdd( LeftBottomA, NearA, OriginA ); + CornersA[4] = XMVectorMultiplyAdd( RightTopA, FarA, OriginA ); + CornersA[5] = XMVectorMultiplyAdd( RightBottomA, FarA, OriginA ); + CornersA[6] = XMVectorMultiplyAdd( LeftTopA, FarA, OriginA ); + CornersA[7] = XMVectorMultiplyAdd( LeftBottomA, FarA, OriginA ); + + // Check frustum A against each plane of frustum B. 
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + // Find the min/max projection of the frustum onto the plane normal. + XMVECTOR Min, Max; + + Min = Max = XMVector3Dot( AxisB[i], CornersA[0] ); + + for( size_t j = 1; j < CORNER_COUNT; j++ ) + { + XMVECTOR Temp = XMVector3Dot( AxisB[i], CornersA[j] ); + Min = XMVectorMin( Min, Temp ); + Max = XMVectorMax( Max, Temp ); + } + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistB[i] ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Max, PlaneDistB[i] ) ); + } + + // If the frustum A is outside any of the planes of frustum B it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If frustum A is inside all planes of frustum B it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of frustum B. + XMVECTOR RightTopB = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottomB = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTopB = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottomB = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR NearB = XMVectorReplicatePtr( &Near ); + XMVECTOR FarB = XMVectorReplicatePtr( &Far ); + + XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT]; + CornersB[0] = XMVectorMultiply( RightTopB, NearB ); + CornersB[1] = XMVectorMultiply( RightBottomB, NearB ); + CornersB[2] = XMVectorMultiply( LeftTopB, NearB ); + CornersB[3] = XMVectorMultiply( LeftBottomB, NearB ); + CornersB[4] = XMVectorMultiply( RightTopB, FarB ); + CornersB[5] = XMVectorMultiply( RightBottomB, FarB ); + CornersB[6] = XMVectorMultiply( LeftTopB, FarB ); + CornersB[7] = XMVectorMultiply( LeftBottomB, FarB ); + + // Build the planes of frustum A (in the local space of B). 
+ XMVECTOR AxisA[6]; + XMVECTOR PlaneDistA[6]; + + AxisA[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f ); + AxisA[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f ); + AxisA[2] = XMVectorSet( 1.0f, 0.0f, -fr.RightSlope, 0.0f ); + AxisA[3] = XMVectorSet( -1.0f, 0.0f, fr.LeftSlope, 0.0f ); + AxisA[4] = XMVectorSet( 0.0f, 1.0f, -fr.TopSlope, 0.0f ); + AxisA[5] = XMVectorSet( 0.0f, -1.0f, fr.BottomSlope, 0.0f ); + + AxisA[0] = XMVector3Rotate( AxisA[0], OrientationA ); + AxisA[1] = XMVectorNegate( AxisA[0] ); + AxisA[2] = XMVector3Rotate( AxisA[2], OrientationA ); + AxisA[3] = XMVector3Rotate( AxisA[3], OrientationA ); + AxisA[4] = XMVector3Rotate( AxisA[4], OrientationA ); + AxisA[5] = XMVector3Rotate( AxisA[5], OrientationA ); + + PlaneDistA[0] = XMVector3Dot( AxisA[0], CornersA[0] ); // Re-use corner on near plane. + PlaneDistA[1] = XMVector3Dot( AxisA[1], CornersA[4] ); // Re-use corner on far plane. + PlaneDistA[2] = XMVector3Dot( AxisA[2], OriginA ); + PlaneDistA[3] = XMVector3Dot( AxisA[3], OriginA ); + PlaneDistA[4] = XMVector3Dot( AxisA[4], OriginA ); + PlaneDistA[5] = XMVector3Dot( AxisA[5], OriginA ); + + // Check each axis of frustum A for a seperating plane (5). + for( size_t i = 0; i < 6; ++i ) + { + // Find the minimum projection of the frustum onto the plane normal. + XMVECTOR Min; + + Min = XMVector3Dot( AxisA[i], CornersB[0] ); + + for( size_t j = 1; j < CORNER_COUNT; j++ ) + { + XMVECTOR Temp = XMVector3Dot( AxisA[i], CornersB[j] ); + Min = XMVectorMin( Min, Temp ); + } + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistA[i] ) ); + } + + // If the frustum B is outside any of the planes of frustum A it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // Check edge/edge axes (6 * 6). 
+ XMVECTOR FrustumEdgeAxisA[6]; + FrustumEdgeAxisA[0] = RightTopA; + FrustumEdgeAxisA[1] = RightBottomA; + FrustumEdgeAxisA[2] = LeftTopA; + FrustumEdgeAxisA[3] = LeftBottomA; + FrustumEdgeAxisA[4] = XMVectorSubtract( RightTopA, LeftTopA ); + FrustumEdgeAxisA[5] = XMVectorSubtract( LeftBottomA, LeftTopA ); + + XMVECTOR FrustumEdgeAxisB[6]; + FrustumEdgeAxisB[0] = RightTopB; + FrustumEdgeAxisB[1] = RightBottomB; + FrustumEdgeAxisB[2] = LeftTopB; + FrustumEdgeAxisB[3] = LeftBottomB; + FrustumEdgeAxisB[4] = XMVectorSubtract( RightTopB, LeftTopB ); + FrustumEdgeAxisB[5] = XMVectorSubtract( LeftBottomB, LeftTopB ); + + for( size_t i = 0; i < 6; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross( FrustumEdgeAxisA[i], FrustumEdgeAxisB[j] ); + + // Find the min/max values of the projection of both frustums onto the axis. + XMVECTOR MinA, MaxA; + XMVECTOR MinB, MaxB; + + MinA = MaxA = XMVector3Dot( Axis, CornersA[0] ); + MinB = MaxB = XMVector3Dot( Axis, CornersB[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR TempA = XMVector3Dot( Axis, CornersA[k] ); + MinA = XMVectorMin( MinA, TempA ); + MaxA = XMVectorMax( MaxA, TempA ); + + XMVECTOR TempB = XMVector3Dot( Axis, CornersB[k] ); + MinB = XMVectorMin( MinB, TempB ); + MaxB = XMVectorMax( MaxB, TempB ); + } + + // if (MinA > MaxB || MinB > MaxA) reject + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); + } + } + + // If there is a seperating plane, then the frustums do not intersect. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the frustums intersect. + return true; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Build the frustum planes (NOTE: D is negated from the usual). + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, -Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Transform triangle into the local space of frustum. + XMVECTOR TV0 = XMVector3InverseRotate( XMVectorSubtract( V0, vOrigin ), vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( XMVectorSubtract( V1, vOrigin ), vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( XMVectorSubtract( V2, vOrigin ), vOrientation ); + + // Test each vertex of the triangle against the frustum planes. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + XMVECTOR Dist0 = XMVector3Dot( TV0, Planes[i] ); + XMVECTOR Dist1 = XMVector3Dot( TV1, Planes[i] ); + XMVECTOR Dist2 = XMVector3Dot( TV2, Planes[i] ); + + XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 ); + MinDist = XMVectorMin( MinDist, Dist2 ); + XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 ); + MaxDist = XMVectorMax( MaxDist, Dist2 ); + + XMVECTOR PlaneDist = XMVectorSplatW( Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinDist, PlaneDist ) ); + + // Fully inside the plane? 
+ InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( MaxDist, PlaneDist ) ); + } + + // If the triangle is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the triangle is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply( vRightTop, vNear ); + Corners[1] = XMVectorMultiply( vRightBottom, vNear ); + Corners[2] = XMVectorMultiply( vLeftTop, vNear ); + Corners[3] = XMVectorMultiply( vLeftBottom, vNear ); + Corners[4] = XMVectorMultiply( vRightTop, vFar ); + Corners[5] = XMVectorMultiply( vRightBottom, vFar ); + Corners[6] = XMVectorMultiply( vLeftTop, vFar ); + Corners[7] = XMVectorMultiply( vLeftBottom, vFar ); + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross( XMVectorSubtract( V1, V0 ), XMVectorSubtract( V2, V0 ) ); + XMVECTOR Dist = XMVector3Dot( Normal, V0 ); + + XMVECTOR MinDist, MaxDist; + MinDist = MaxDist = XMVector3Dot( Corners[0], Normal ); + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + XMVECTOR Temp = XMVector3Dot( Corners[i], Normal ); + MinDist = XMVectorMin( MinDist, Temp ); + MaxDist = XMVectorMax( MaxDist, Temp ); + } + + Outside = XMVectorOrInt( XMVectorGreater( MinDist, Dist ), XMVectorLess( MaxDist, Dist ) ); + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // Check the edge/edge axes (3*6). 
+ XMVECTOR TriangleEdgeAxis[3]; + TriangleEdgeAxis[0] = XMVectorSubtract( V1, V0 ); + TriangleEdgeAxis[1] = XMVectorSubtract( V2, V1 ); + TriangleEdgeAxis[2] = XMVectorSubtract( V0, V2 ); + + XMVECTOR FrustumEdgeAxis[6]; + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract( vRightTop, vLeftTop ); + FrustumEdgeAxis[5] = XMVectorSubtract( vLeftBottom, vLeftTop ); + + for( size_t i = 0; i < 3; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross( TriangleEdgeAxis[i], FrustumEdgeAxis[j] ); + + // Find the min/max of the projection of the triangle onto the axis. + XMVECTOR MinA, MaxA; + + XMVECTOR Dist0 = XMVector3Dot( V0, Axis ); + XMVECTOR Dist1 = XMVector3Dot( V1, Axis ); + XMVECTOR Dist2 = XMVector3Dot( V2, Axis ); + + MinA = XMVectorMin( Dist0, Dist1 ); + MinA = XMVectorMin( MinA, Dist2 ); + MaxA = XMVectorMax( Dist0, Dist1 ); + MaxA = XMVectorMax( MaxA, Dist2 ); + + // Find the min/max of the projection of the frustum onto the axis. + XMVECTOR MinB, MaxB; + + MinB = MaxB = XMVector3Dot( Axis, Corners[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); + MinB = XMVectorMin( MinB, Temp ); + MaxB = XMVectorMax( MaxB, Temp ); + } + + // if (MinA > MaxB || MinB > MaxA) reject; + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); + } + } + + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the triangle must intersect the frustum. 
+ return true; +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); + + // Build the corners of the frustum (in world space). + XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + RightTop = XMVector3Rotate( RightTop, vOrientation ); + RightBottom = XMVector3Rotate( RightBottom, vOrientation ); + LeftTop = XMVector3Rotate( LeftTop, vOrientation ); + LeftBottom = XMVector3Rotate( LeftBottom, vOrientation ); + + XMVECTOR Corners0 = XMVectorMultiplyAdd( RightTop, vNear, vOrigin ); + XMVECTOR Corners1 = XMVectorMultiplyAdd( RightBottom, vNear, vOrigin ); + XMVECTOR Corners2 = XMVectorMultiplyAdd( LeftTop, vNear, vOrigin ); + XMVECTOR Corners3 = XMVectorMultiplyAdd( LeftBottom, vNear, vOrigin ); + XMVECTOR Corners4 = XMVectorMultiplyAdd( RightTop, vFar, vOrigin ); + XMVECTOR Corners5 = XMVectorMultiplyAdd( RightBottom, vFar, vOrigin ); + XMVECTOR Corners6 = XMVectorMultiplyAdd( LeftTop, vFar, vOrigin ); + XMVECTOR Corners7 = XMVectorMultiplyAdd( LeftBottom, vFar, vOrigin ); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, 
Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane, Outside, Inside ); + + // If the frustum is outside any plane it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return FRONT; + + // If the frustum is inside all planes it is inside. + if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) ) + return BACK; + + // The frustum is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Ray vs. frustum test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist ) const +{ + // If ray starts inside the frustum, return a distance of 0 for the hit + if ( Contains(rayOrigin) == CONTAINS ) + { + Dist = 0.0f; + return true; + } + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation of the frustum. + XMVECTOR frOrigin = XMLoadFloat3( &Origin ); + XMVECTOR frOrientation = XMLoadFloat4( &Orientation ); + + // This algorithm based on "Fast Ray-Convex Polyhedron Intersectin," in James Arvo, ed., Graphics Gems II pp. 
247-250 + float tnear = -FLT_MAX; + float tfar = FLT_MAX; + + for( size_t i=0; i < 6; ++i ) + { + XMVECTOR Plane = DirectX::Internal::XMPlaneTransform( Planes[i], frOrientation, frOrigin ); + Plane = XMPlaneNormalize( Plane ); + + XMVECTOR AxisDotOrigin = XMPlaneDotCoord( Plane, rayOrigin ); + XMVECTOR AxisDotDirection = XMVector3Dot( Plane, Direction ); + + if ( XMVector3LessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ) ) + { + // Ray is parallel to plane - check if ray origin is inside plane's + if ( XMVector3Greater( AxisDotOrigin, g_XMZero ) ) + { + // Ray origin is outside half-space. + Dist = 0.f; + return false; + } + } + else + { + // Ray not parallel - get distance to plane. + float vd = XMVectorGetX( AxisDotDirection ); + float vn = XMVectorGetX( AxisDotOrigin ); + float t = -vn / vd; + if (vd < 0.0f) + { + // Front face - T is a near point. + if (t > tfar) + { + Dist = 0.f; + return false; + } + if (t > tnear) + { + // Hit near face. + tnear = t; + } + } + else + { + // back face - T is far point. + if (t < tnear) + { + Dist = 0.f; + return false; + } + if (t < tfar) + { + // Hit far face. + tfar = t; + } + } + } + } + + // Survived all tests. + // Note: if ray originates on polyhedron, may want to change 0.0f to some + // epsilon to avoid intersecting the originating face. + float distance = ( tnear >= 0.0f ) ? tnear : tfar; + if (distance >= 0.0f) + { + Dist = distance; + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a frustum vs 6 planes (typically forming another frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); + + // Build the corners of the frustum (in world space). + XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + RightTop = XMVector3Rotate( RightTop, vOrientation ); + RightBottom = XMVector3Rotate( RightBottom, vOrientation ); + LeftTop = XMVector3Rotate( LeftTop, vOrientation ); + LeftBottom = XMVector3Rotate( LeftBottom, vOrientation ); + + XMVECTOR Corners0 = XMVectorMultiplyAdd( RightTop, vNear, vOrigin ); + XMVECTOR Corners1 = XMVectorMultiplyAdd( RightBottom, vNear, vOrigin ); + XMVECTOR Corners2 = XMVectorMultiplyAdd( LeftTop, vNear, vOrigin ); + XMVECTOR Corners3 = XMVectorMultiplyAdd( LeftBottom, vNear, vOrigin ); + XMVECTOR Corners4 = XMVectorMultiplyAdd( RightTop, vFar, vOrigin ); + XMVECTOR Corners5 = XMVectorMultiplyAdd( RightBottom, vFar, vOrigin ); + XMVECTOR Corners6 = XMVectorMultiplyAdd( LeftTop, vFar, vOrigin ); + XMVECTOR Corners7 = XMVectorMultiplyAdd( LeftBottom, vFar, vOrigin ); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
+ DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane1, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane2, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane3, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane4, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane5, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the frustum is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the frustum is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The frustum is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Build the 6 frustum planes from a frustum. +// +// The intended use for these routines is for fast culling to a view frustum. +// When the volume being tested against a view frustum is small relative to the +// view frustum it is usually either inside all six planes of the frustum +// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither +// of these cases is true then it may or may not be intersecting the frustum +// (INTERSECTS) +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetPlanes( XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane, + XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane ) const +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + if (NearPlane) + { + XMVECTOR vNearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + vNearPlane = DirectX::Internal::XMPlaneTransform( vNearPlane, vOrientation, vOrigin ); + *NearPlane = XMPlaneNormalize( vNearPlane ); + } + + if (FarPlane) + { + XMVECTOR vFarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + vFarPlane = DirectX::Internal::XMPlaneTransform( vFarPlane, vOrientation, vOrigin ); + *FarPlane = XMPlaneNormalize( vFarPlane ); + } + + if (RightPlane) + { + XMVECTOR vRightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + vRightPlane = DirectX::Internal::XMPlaneTransform( vRightPlane, vOrientation, vOrigin ); + *RightPlane = XMPlaneNormalize( vRightPlane ); + } + + if (LeftPlane) + { + XMVECTOR vLeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + vLeftPlane = DirectX::Internal::XMPlaneTransform( vLeftPlane, vOrientation, vOrigin ); + *LeftPlane = XMPlaneNormalize( vLeftPlane ); + } + + if (TopPlane) + { + XMVECTOR vTopPlane = XMVectorSet( 0.0f, 1.0f, 
-TopSlope, 0.0f ); + vTopPlane = DirectX::Internal::XMPlaneTransform( vTopPlane, vOrientation, vOrigin ); + *TopPlane = XMPlaneNormalize( vTopPlane ); + } + + if (BottomPlane) + { + XMVECTOR vBottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + vBottomPlane = DirectX::Internal::XMPlaneTransform( vBottomPlane, vOrientation, vOrigin ); + *BottomPlane = XMPlaneNormalize( vBottomPlane ); + } +} + + +//----------------------------------------------------------------------------- +// Build a frustum from a persepective projection matrix. The matrix may only +// contain a projection; any rotation, translation or scale will cause the +// constructed frustum to be incorrect. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix( BoundingFrustum& Out, FXMMATRIX Projection ) +{ + // Corners of the projection frustum in homogenous space. + static XMVECTORF32 HomogenousPoints[6] = + { + { 1.0f, 0.0f, 1.0f, 1.0f }, // right (at far plane) + { -1.0f, 0.0f, 1.0f, 1.0f }, // left + { 0.0f, 1.0f, 1.0f, 1.0f }, // top + { 0.0f, -1.0f, 1.0f, 1.0f }, // bottom + + { 0.0f, 0.0f, 0.0f, 1.0f }, // near + { 0.0f, 0.0f, 1.0f, 1.0f } // far + }; + + XMVECTOR Determinant; + XMMATRIX matInverse = XMMatrixInverse( &Determinant, Projection ); + + // Compute the frustum corners in world space. + XMVECTOR Points[6]; + + for( size_t i = 0; i < 6; ++i ) + { + // Transform point. + Points[i] = XMVector4Transform( HomogenousPoints[i], matInverse ); + } + + Out.Origin = XMFLOAT3( 0.0f, 0.0f, 0.0f ); + Out.Orientation = XMFLOAT4( 0.0f, 0.0f, 0.0f, 1.0f ); + + // Compute the slopes. 
+ Points[0] = XMVectorMultiply( Points[0], XMVectorReciprocal( XMVectorSplatZ( Points[0] ) ) ); + Points[1] = XMVectorMultiply( Points[1], XMVectorReciprocal( XMVectorSplatZ( Points[1] ) ) ); + Points[2] = XMVectorMultiply( Points[2], XMVectorReciprocal( XMVectorSplatZ( Points[2] ) ) ); + Points[3] = XMVectorMultiply( Points[3], XMVectorReciprocal( XMVectorSplatZ( Points[3] ) ) ); + + Out.RightSlope = XMVectorGetX( Points[0] ); + Out.LeftSlope = XMVectorGetX( Points[1] ); + Out.TopSlope = XMVectorGetY( Points[2] ); + Out.BottomSlope = XMVectorGetY( Points[3] ); + + // Compute near and far. + Points[4] = XMVectorMultiply( Points[4], XMVectorReciprocal( XMVectorSplatW( Points[4] ) ) ); + Points[5] = XMVectorMultiply( Points[5], XMVectorReciprocal( XMVectorSplatW( Points[5] ) ) ); + + Out.Near = XMVectorGetZ( Points[4] ); + Out.Far = XMVectorGetZ( Points[5] ); +} + + +/**************************************************************************** + * + * TriangleTests + * + ****************************************************************************/ + +namespace TriangleTests +{ + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with a triangle +// (V0, V1, V2). Return true if there is an intersection and also set *pDist +// to the distance along the ray to the intersection. +// +// The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage +// Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1, +// pp 21-28, 1997. 
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV Intersects( FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, GXMVECTOR V1, HXMVECTOR V2, float& Dist )
+{
+    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+    XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR e1 = XMVectorSubtract( V1, V0 );
+    XMVECTOR e2 = XMVectorSubtract( V2, V0 );
+
+    // p = Direction ^ e2;
+    XMVECTOR p = XMVector3Cross( Direction, e2 );
+
+    // det = e1 * p;
+    XMVECTOR det = XMVector3Dot( e1, p );
+
+    XMVECTOR u, v, t; // assigned in whichever branch below matches the sign of det
+
+    if( XMVector3GreaterOrEqual( det, g_RayEpsilon ) )
+    {
+        // Determinant is positive (front side of the triangle).
+        XMVECTOR s = XMVectorSubtract( Origin, V0 );
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorLess( u, Zero ); // reject when u < 0 ...
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u, det ) ); // ... or u > det
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( v, Zero ) ); // v < 0
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( XMVectorAdd( u, v ), det ) ); // u + v > det
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( t, Zero ) ); // t < 0
+
+        if( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f; // Dist is written (as 0) even when there is no hit
+            return false;
+        }
+    }
+    else if( XMVector3LessOrEqual( det, g_RayNegEpsilon ) )
+    {
+        // Determinant is negative (back side of the triangle).
+        XMVECTOR s = XMVectorSubtract( Origin, V0 );
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorGreater( u, Zero ); // same rejections as above with the comparisons mirrored
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u, det ) );
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( v, Zero ) );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorAdd( u, v ), det ) );
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( t, Zero ) );
+
+        if ( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f;
+            return false;
+        }
+    }
+    else
+    {
+        // Parallel ray.
+        Dist = 0.f;
+        return false;
+    }
+
+    t = XMVectorDivide ( t, det );
+
+    // (u / det) and (v / det) are the barycentric coordinates of the intersection.
+
+    // Store the x-component to Dist.
+    XMStoreFloat( &Dist, t );
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test if two triangles intersect.
+//
+// The final test of the algorithm is based on Shen, Heng, and Tang, "A Fast
+// Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
+// Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
+// Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
+// of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
+//
+// The final test could be considered an edge-edge separating plane test with
+// the 9 possible cases narrowed down to the only two pairs of edges that can
+// actually result in a separation.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV Intersects( FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2 )
+{
+    static const XMVECTORU32 SelectY = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 };
+    static const XMVECTORU32 SelectZ = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 };
+    static const XMVECTORU32 Select0111 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1 };
+    static const XMVECTORU32 Select1011 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 };
+    static const XMVECTORU32 Select1101 = { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 };
+
+    XMVECTOR Zero = XMVectorZero();
+
+    // Compute the normal of triangle A.
+    XMVECTOR N1 = XMVector3Cross( XMVectorSubtract( A1, A0 ), XMVectorSubtract( A2, A0 ) );
+
+    // Assert that the triangle is not degenerate.
+    assert( !XMVector3Equal( N1, Zero ) );
+
+    // Test points of B against the plane of A (x, y, z lanes hold B0, B1, B2).
+    XMVECTOR BDist = XMVector3Dot( N1, XMVectorSubtract( B0, A0 ) );
+    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, XMVectorSubtract( B1, A0 ) ), SelectY );
+    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, XMVectorSubtract( B2, A0 ) ), SelectZ );
+
+    // Ensure robustness with co-planar triangles by zeroing small distances.
+    uint32_t BDistIsZeroCR;
+    XMVECTOR BDistIsZero = XMVectorGreaterR( &BDistIsZeroCR, g_RayEpsilon, XMVectorAbs( BDist ) );
+    BDist = XMVectorSelect( BDist, Zero, BDistIsZero );
+
+    uint32_t BDistIsLessCR;
+    XMVECTOR BDistIsLess = XMVectorGreaterR( &BDistIsLessCR, Zero, BDist );
+
+    uint32_t BDistIsGreaterCR;
+    XMVECTOR BDistIsGreater = XMVectorGreaterR( &BDistIsGreaterCR, BDist, Zero );
+
+    // If all the points are on the same side we don't intersect.
+    if( XMComparisonAllTrue( BDistIsLessCR ) || XMComparisonAllTrue( BDistIsGreaterCR ) )
+        return false;
+
+    // Compute the normal of triangle B.
+    XMVECTOR N2 = XMVector3Cross( XMVectorSubtract( B1, B0 ), XMVectorSubtract( B2, B0 ) );
+
+    // Assert that the triangle is not degenerate.
+    assert( !XMVector3Equal( N2, Zero ) );
+
+    // Test points of A against the plane of B (x, y, z lanes hold A0, A1, A2).
+    XMVECTOR ADist = XMVector3Dot( N2, XMVectorSubtract( A0, B0 ) );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, XMVectorSubtract( A1, B0 ) ), SelectY );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, XMVectorSubtract( A2, B0 ) ), SelectZ );
+
+    // Ensure robustness with co-planar triangles by zeroing small distances of A's vertices.
+    uint32_t ADistIsZeroCR;
+    XMVECTOR ADistIsZero = XMVectorGreaterR( &ADistIsZeroCR, g_RayEpsilon, XMVectorAbs( ADist ) ); // compare ADist itself, mirroring the BDist test above
+    ADist = XMVectorSelect( ADist, Zero, ADistIsZero );
+
+    uint32_t ADistIsLessCR;
+    XMVECTOR ADistIsLess = XMVectorGreaterR( &ADistIsLessCR, Zero, ADist );
+
+    uint32_t ADistIsGreaterCR;
+    XMVECTOR ADistIsGreater = XMVectorGreaterR( &ADistIsGreaterCR, ADist, Zero );
+
+    // If all the points are on the same side we don't intersect.
+    if( XMComparisonAllTrue( ADistIsLessCR ) || XMComparisonAllTrue( ADistIsGreaterCR ) )
+        return false;
+
+    // Special case for co-planar triangles.
+    if( XMComparisonAllTrue( ADistIsZeroCR ) || XMComparisonAllTrue( BDistIsZeroCR ) )
+    {
+        XMVECTOR Axis, Dist, MinDist;
+
+        // Edge (A0, A1): compute an axis perpendicular to the edge (points out).
+        Axis = XMVector3Cross( N1, XMVectorSubtract( A1, A0 ) );
+        Dist = XMVector3Dot( Axis, A0 );
+
+        // Test points of B against the axis.
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (A1, A2)
+        Axis = XMVector3Cross( N1, XMVectorSubtract( A2, A1 ) );
+        Dist = XMVector3Dot( Axis, A1 );
+
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (A2, A0)
+        Axis = XMVector3Cross( N1, XMVectorSubtract( A0, A2 ) );
+        Dist = XMVector3Dot( Axis, A2 );
+
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B0, B1)
+        Axis = XMVector3Cross( N2, XMVectorSubtract( B1, B0 ) );
+        Dist = XMVector3Dot( Axis, B0 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B1, B2)
+        Axis = XMVector3Cross( N2, XMVectorSubtract( B2, B1 ) );
+        Dist = XMVector3Dot( Axis, B1 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B2,B0)
+        Axis = XMVector3Cross( N2, XMVectorSubtract( B0, B2 ) );
+        Dist = XMVector3Dot( Axis, B2 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        return true;
+    }
+
+    //
+    // Find the single vertex of A and B (ie the vertex on the opposite side
+    // of the plane from the other two) and reorder the edges so we can compute
+    // the signed edge/edge distances.
+    //
+    // if ( (V0 >= 0 && V1 < 0 && V2 < 0) ||
+    //      (V0 > 0 && V1 <= 0 && V2 <= 0) ||
+    //      (V0 <= 0 && V1 > 0 && V2 > 0) ||
+    //      (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular;
+    //
+    // If our singular vertex is not on the positive side of the plane we reverse
+    // the triangle winding so that the overlap comparisons will compare the
+    // correct edges with the correct signs.
+    //
+    XMVECTOR ADistIsLessEqual = XMVectorOrInt( ADistIsLess, ADistIsZero );
+    XMVECTOR ADistIsGreaterEqual = XMVectorOrInt( ADistIsGreater, ADistIsZero );
+
+    XMVECTOR AA0, AA1, AA2;
+    bool bPositiveA;
+
+    if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select0111 ) ) ||
+        DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select0111 ) ) )
+    {
+        // A0 is singular, crossing from positive to negative.
+        AA0 = A0; AA1 = A1; AA2 = A2;
+        bPositiveA = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select0111 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select0111 ) ) )
+    {
+        // A0 is singular, crossing from negative to positive.
+        AA0 = A0; AA1 = A2; AA2 = A1;
+        bPositiveA = false;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1011 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1011 ) ) )
+    {
+        // A1 is singular, crossing from positive to negative.
+        AA0 = A1; AA1 = A2; AA2 = A0;
+        bPositiveA = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1011 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1011 ) ) )
+    {
+        // A1 is singular, crossing from negative to positive.
+        AA0 = A1; AA1 = A0; AA2 = A2;
+        bPositiveA = false;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1101 ) ) )
+    {
+        // A2 is singular, crossing from positive to negative.
+        AA0 = A2; AA1 = A0; AA2 = A1;
+        bPositiveA = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1101 ) ) )
+    {
+        // A2 is singular, crossing from negative to positive.
+        AA0 = A2; AA1 = A1; AA2 = A0;
+        bPositiveA = false;
+    }
+    else
+    {
+        assert( false );
+        return false;
+    }
+
+    XMVECTOR BDistIsLessEqual = XMVectorOrInt( BDistIsLess, BDistIsZero );
+    XMVECTOR BDistIsGreaterEqual = XMVectorOrInt( BDistIsGreater, BDistIsZero );
+
+    XMVECTOR BB0, BB1, BB2;
+    bool bPositiveB;
+
+    if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select0111 ) ) ||
+        DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select0111 ) ) )
+    {
+        // B0 is singular, crossing from positive to negative.
+        BB0 = B0; BB1 = B1; BB2 = B2;
+        bPositiveB = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select0111 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select0111 ) ) )
+    {
+        // B0 is singular, crossing from negative to positive.
+        BB0 = B0; BB1 = B2; BB2 = B1;
+        bPositiveB = false;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1011 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1011 ) ) )
+    {
+        // B1 is singular, crossing from positive to negative.
+        BB0 = B1; BB1 = B2; BB2 = B0;
+        bPositiveB = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1011 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1011 ) ) )
+    {
+        // B1 is singular, crossing from negative to positive.
+        BB0 = B1; BB1 = B0; BB2 = B2;
+        bPositiveB = false;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1101 ) ) )
+    {
+        // B2 is singular, crossing from positive to negative.
+        BB0 = B2; BB1 = B0; BB2 = B1;
+        bPositiveB = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1101 ) ) )
+    {
+        // B2 is singular, crossing from negative to positive.
+        BB0 = B2; BB1 = B1; BB2 = B0;
+        bPositiveB = false;
+    }
+    else
+    {
+        assert( false );
+        return false;
+    }
+
+    XMVECTOR Delta0, Delta1;
+
+    // Reverse the direction of the test depending on whether the singular vertices are
+    // the same sign or different signs.
+    if( bPositiveA ^ bPositiveB )
+    {
+        Delta0 = XMVectorSubtract( BB0, AA0 );
+        Delta1 = XMVectorSubtract( AA0, BB0 );
+    }
+    else
+    {
+        Delta0 = XMVectorSubtract( AA0, BB0 );
+        Delta1 = XMVectorSubtract( BB0, AA0 );
+    }
+
+    // Check if the triangles overlap on the line of intersection between the
+    // planes of the two triangles by finding the signed line distances.
+    XMVECTOR Dist0 = XMVector3Dot( Delta0, XMVector3Cross( XMVectorSubtract( BB2, BB0 ), XMVectorSubtract( AA2, AA0 ) ) );
+    if( XMVector4Greater( Dist0, Zero ) )
+        return false;
+
+    XMVECTOR Dist1 = XMVector3Dot( Delta1, XMVector3Cross( XMVectorSubtract( BB1, BB0 ), XMVectorSubtract( AA1, AA0 ) ) );
+    if( XMVector4Greater( Dist1, Zero ) )
+        return false;
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle-plane test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+    // Set w of the points to one so we can dot4 with a plane.
+    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane, Outside, Inside );
+
+    // Outside mask fully set: report the triangle as in front of the plane.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // Inside mask fully set: report the triangle as behind the plane.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // Neither mask is fully set: report an intersection with the plane.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a triangle vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV ContainedBy( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2,
+                                                GXMVECTOR Plane0, HXMVECTOR Plane1, HXMVECTOR Plane2,
+                                                CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    // Set w of the points to one so we can dot4 with a plane.
+    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside; // OR-accumulated over all six planes below
+    XMVECTOR AllInside = Inside; // AND-accumulated over all six planes below
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane1, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane2, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane3, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane4, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane5, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    // If the triangle is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the triangle is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The triangle is not inside all planes or outside a plane, it may intersect.
+    return INTERSECTS;
+}
+
+} // namespace TriangleTests
+
diff --git a/WickedEngine/Utility/DirectXColors.h b/WickedEngine/Utility/DirectXColors.h
new file mode 100644
index 000000000..cbb9f23e8
--- /dev/null
+++ b/WickedEngine/Utility/DirectXColors.h
@@ -0,0 +1,166 @@
+//-------------------------------------------------------------------------------------
+// DirectXColors.h -- C++ Color Math library
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#include "DirectXMathCommon.h"
+#include "DirectXMath.h"
+
+namespace DirectX
+{
+
+namespace Colors
+{
+    // Standard colors (Red/Green/Blue/Alpha)
+    XMGLOBALCONST XMVECTORF32 AliceBlue = { { { 0.941176534f, 0.972549081f, 1.000000000f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 AntiqueWhite = { { { 0.980392218f, 0.921568692f, 0.843137324f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 Aqua = { { { 0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 Aquamarine = { { { 0.498039246f, 1.000000000f, 0.831372619f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 Azure = { { { 0.941176534f, 1.000000000f, 1.000000000f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 Beige = { { { 0.960784376f, 0.960784376f, 0.862745166f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 Bisque = { { { 1.000000000f, 0.894117713f, 0.768627524f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 Black = { { { 0.000000000f, 0.000000000f, 0.000000000f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 BlanchedAlmond = { { { 1.000000000f, 0.921568692f, 0.803921640f, 1.000000000f } } };
+    XMGLOBALCONST XMVECTORF32 Blue = { { { 0.000000000f,
0.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 BlueViolet = { { { 0.541176498f, 0.168627456f, 0.886274576f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Brown = { { { 0.647058845f, 0.164705887f, 0.164705887f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 BurlyWood = { { { 0.870588303f, 0.721568644f, 0.529411793f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 CadetBlue = { { { 0.372549027f, 0.619607866f, 0.627451003f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Chartreuse = { { { 0.498039246f, 1.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Chocolate = { { { 0.823529482f, 0.411764741f, 0.117647067f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Coral = { { { 1.000000000f, 0.498039246f, 0.313725501f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = { { { 0.392156899f, 0.584313750f, 0.929411829f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Cornsilk = { { { 1.000000000f, 0.972549081f, 0.862745166f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Crimson = { { { 0.862745166f, 0.078431375f, 0.235294133f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Cyan = { { { 0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkBlue = { { { 0.000000000f, 0.000000000f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkCyan = { { { 0.000000000f, 0.545098066f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = { { { 0.721568644f, 0.525490224f, 0.043137256f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkGray = { { { 0.662745118f, 0.662745118f, 0.662745118f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkGreen = { { { 0.000000000f, 0.392156899f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkKhaki = { { { 0.741176486f, 0.717647076f, 0.419607878f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkMagenta = { { { 0.545098066f, 0.000000000f, 0.545098066f, 1.000000000f } } }; + 
XMGLOBALCONST XMVECTORF32 DarkOliveGreen = { { { 0.333333343f, 0.419607878f, 0.184313729f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrange = { { { 1.000000000f, 0.549019635f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrchid = { { { 0.600000024f, 0.196078449f, 0.800000072f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkRed = { { { 0.545098066f, 0.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSalmon = { { { 0.913725555f, 0.588235319f, 0.478431404f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = { { { 0.560784340f, 0.737254918f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = { { { 0.282352954f, 0.239215702f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = { { { 0.184313729f, 0.309803933f, 0.309803933f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = { { { 0.000000000f, 0.807843208f, 0.819607913f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkViolet = { { { 0.580392182f, 0.000000000f, 0.827451050f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DeepPink = { { { 1.000000000f, 0.078431375f, 0.576470613f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = { { { 0.000000000f, 0.749019623f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DimGray = { { { 0.411764741f, 0.411764741f, 0.411764741f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DodgerBlue = { { { 0.117647067f, 0.564705908f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Firebrick = { { { 0.698039234f, 0.133333340f, 0.133333340f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 FloralWhite = { { { 1.000000000f, 0.980392218f, 0.941176534f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 ForestGreen = { { { 0.133333340f, 0.545098066f, 0.133333340f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Fuchsia = { { { 1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 
Gainsboro = { { { 0.862745166f, 0.862745166f, 0.862745166f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 GhostWhite = { { { 0.972549081f, 0.972549081f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Gold = { { { 1.000000000f, 0.843137324f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Goldenrod = { { { 0.854902029f, 0.647058845f, 0.125490203f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Gray = { { { 0.501960814f, 0.501960814f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Green = { { { 0.000000000f, 0.501960814f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 GreenYellow = { { { 0.678431392f, 1.000000000f, 0.184313729f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Honeydew = { { { 0.941176534f, 1.000000000f, 0.941176534f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 HotPink = { { { 1.000000000f, 0.411764741f, 0.705882370f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 IndianRed = { { { 0.803921640f, 0.360784322f, 0.360784322f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Indigo = { { { 0.294117659f, 0.000000000f, 0.509803951f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Ivory = { { { 1.000000000f, 1.000000000f, 0.941176534f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Khaki = { { { 0.941176534f, 0.901960850f, 0.549019635f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Lavender = { { { 0.901960850f, 0.901960850f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LavenderBlush = { { { 1.000000000f, 0.941176534f, 0.960784376f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LawnGreen = { { { 0.486274540f, 0.988235354f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LemonChiffon = { { { 1.000000000f, 0.980392218f, 0.803921640f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightBlue = { { { 0.678431392f, 0.847058892f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightCoral = { { { 0.941176534f, 0.501960814f, 0.501960814f, 1.000000000f 
} } }; + XMGLOBALCONST XMVECTORF32 LightCyan = { { { 0.878431439f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = { { { 0.980392218f, 0.980392218f, 0.823529482f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightGreen = { { { 0.564705908f, 0.933333397f, 0.564705908f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightGray = { { { 0.827451050f, 0.827451050f, 0.827451050f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightPink = { { { 1.000000000f, 0.713725507f, 0.756862819f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSalmon = { { { 1.000000000f, 0.627451003f, 0.478431404f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = { { { 0.125490203f, 0.698039234f, 0.666666687f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = { { { 0.529411793f, 0.807843208f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSlateGray = { { { 0.466666698f, 0.533333361f, 0.600000024f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSteelBlue = { { { 0.690196097f, 0.768627524f, 0.870588303f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightYellow = { { { 1.000000000f, 1.000000000f, 0.878431439f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Lime = { { { 0.000000000f, 1.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LimeGreen = { { { 0.196078449f, 0.803921640f, 0.196078449f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Linen = { { { 0.980392218f, 0.941176534f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Magenta = { { { 1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Maroon = { { { 0.501960814f, 0.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = { { { 0.400000036f, 0.803921640f, 0.666666687f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumBlue = { { { 0.000000000f, 0.000000000f, 0.803921640f, 1.000000000f } } }; + XMGLOBALCONST 
XMVECTORF32 MediumOrchid = { { { 0.729411781f, 0.333333343f, 0.827451050f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumPurple = { { { 0.576470613f, 0.439215720f, 0.858823597f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = { { { 0.235294133f, 0.701960802f, 0.443137288f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = { { { 0.482352972f, 0.407843173f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = { { { 0.000000000f, 0.980392218f, 0.603921592f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = { { { 0.282352954f, 0.819607913f, 0.800000072f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = { { { 0.780392230f, 0.082352944f, 0.521568656f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MidnightBlue = { { { 0.098039225f, 0.098039225f, 0.439215720f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MintCream = { { { 0.960784376f, 1.000000000f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MistyRose = { { { 1.000000000f, 0.894117713f, 0.882353008f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Moccasin = { { { 1.000000000f, 0.894117713f, 0.709803939f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 NavajoWhite = { { { 1.000000000f, 0.870588303f, 0.678431392f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Navy = { { { 0.000000000f, 0.000000000f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 OldLace = { { { 0.992156923f, 0.960784376f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Olive = { { { 0.501960814f, 0.501960814f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 OliveDrab = { { { 0.419607878f, 0.556862772f, 0.137254909f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Orange = { { { 1.000000000f, 0.647058845f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 OrangeRed = { { { 1.000000000f, 0.270588249f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Orchid = { { { 
0.854902029f, 0.439215720f, 0.839215755f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = { { { 0.933333397f, 0.909803987f, 0.666666687f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleGreen = { { { 0.596078455f, 0.984313786f, 0.596078455f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = { { { 0.686274529f, 0.933333397f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = { { { 0.858823597f, 0.439215720f, 0.576470613f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PapayaWhip = { { { 1.000000000f, 0.937254965f, 0.835294187f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PeachPuff = { { { 1.000000000f, 0.854902029f, 0.725490212f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Peru = { { { 0.803921640f, 0.521568656f, 0.247058839f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Pink = { { { 1.000000000f, 0.752941251f, 0.796078503f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Plum = { { { 0.866666734f, 0.627451003f, 0.866666734f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PowderBlue = { { { 0.690196097f, 0.878431439f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Purple = { { { 0.501960814f, 0.000000000f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Red = { { { 1.000000000f, 0.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 RosyBrown = { { { 0.737254918f, 0.560784340f, 0.560784340f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 RoyalBlue = { { { 0.254901975f, 0.411764741f, 0.882353008f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SaddleBrown = { { { 0.545098066f, 0.270588249f, 0.074509807f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Salmon = { { { 0.980392218f, 0.501960814f, 0.447058856f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SandyBrown = { { { 0.956862807f, 0.643137276f, 0.376470625f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SeaGreen = { { { 0.180392161f, 0.545098066f, 0.341176480f, 1.000000000f } } }; + 
XMGLOBALCONST XMVECTORF32 SeaShell = { { { 1.000000000f, 0.960784376f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Sienna = { { { 0.627451003f, 0.321568638f, 0.176470593f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Silver = { { { 0.752941251f, 0.752941251f, 0.752941251f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SkyBlue = { { { 0.529411793f, 0.807843208f, 0.921568692f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SlateBlue = { { { 0.415686309f, 0.352941185f, 0.803921640f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SlateGray = { { { 0.439215720f, 0.501960814f, 0.564705908f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Snow = { { { 1.000000000f, 0.980392218f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SpringGreen = { { { 0.000000000f, 1.000000000f, 0.498039246f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SteelBlue = { { { 0.274509817f, 0.509803951f, 0.705882370f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Tan = { { { 0.823529482f, 0.705882370f, 0.549019635f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Teal = { { { 0.000000000f, 0.501960814f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Thistle = { { { 0.847058892f, 0.749019623f, 0.847058892f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Tomato = { { { 1.000000000f, 0.388235331f, 0.278431386f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Transparent = { { { 0.000000000f, 0.000000000f, 0.000000000f, 0.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Turquoise = { { { 0.250980407f, 0.878431439f, 0.815686345f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Violet = { { { 0.933333397f, 0.509803951f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Wheat = { { { 0.960784376f, 0.870588303f, 0.701960802f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 White = { { { 1.000000000f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 WhiteSmoke = { { { 0.960784376f, 0.960784376f, 0.960784376f, 
1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Yellow = { { { 1.000000000f, 1.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 YellowGreen = { { { 0.603921592f, 0.803921640f, 0.196078449f, 1.000000000f } } }; + +} // namespace Colors + +} // namespace DirectX + diff --git a/WickedEngine/Utility/DirectXMath.h b/WickedEngine/Utility/DirectXMath.h new file mode 100644 index 000000000..d7cdaa5bd --- /dev/null +++ b/WickedEngine/Utility/DirectXMath.h @@ -0,0 +1,2270 @@ +//------------------------------------------------------------------------------------- +// DirectXMath.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMathCommon.h" + +#ifndef __cplusplus +#error DirectX Math requires C++ +#endif + +#define DIRECTX_MATH_VERSION 313 + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#error DirectX Math requires Visual C++ 2015 or later. 
+#endif + +#if !defined(__ARM_ARCH) && !defined(_M_ARM) && !defined(_M_ARM64) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_) +#define _XM_VECTORCALL_ 1 +#endif + +#ifdef _XM_NO_CALL_CONVENTION_ +# define XM_CALLCONV +#else +# ifdef _MSC_VER +# if _XM_VECTORCALL_ +# define XM_CALLCONV __vectorcall +# else +# define XM_CALLCONV __fastcall +# endif +# else +# if _XM_VECTORCALL_ +# define XM_CALLCONV __attribute__((vectorcall)) +# else +# define XM_CALLCONV __attribute__((fastcall)) +# endif +# endif +#endif + +#if defined(_MSC_VER) && (_MSC_FULL_VER < 190023506) +#define XM_CONST const +#define XM_CONSTEXPR +#else +#define XM_CONST constexpr +#define XM_CONSTEXPR constexpr +#endif + +#ifndef XM_DEPRECATED +# ifdef _MSC_VER +# define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version.")) +# else +# define XM_DEPRECATED +# endif +#endif + +#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX2_INTRINSICS_ +#endif + +#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#define _XM_FMA3_INTRINSICS_ +#endif + +#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#define _XM_F16C_INTRINSICS_ +#endif + +#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_) +#define _XM_SSE4_INTRINSICS_ +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_) +#define _XM_SSE3_INTRINSICS_ +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && 
!defined(_XM_SSE_INTRINSICS_) +#define _XM_SSE_INTRINSICS_ +#endif + +#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#if (defined(_M_IX86) || defined(_M_X64)) && !defined(_M_HYBRID_X86_ARM64) +#define _XM_SSE_INTRINSICS_ +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) +#define _XM_ARM_NEON_INTRINSICS_ +#elif !defined(_XM_NO_INTRINSICS_) +#error DirectX Math does not support this target +#endif +#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#if !defined(_XM_NO_XMVECTOR_OVERLOADS_) && (defined(__clang__) || defined(__GNUC__)) +#define _XM_NO_XMVECTOR_OVERLOADS_ +#endif + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4514 4820) +#endif +// C4514/4820: Off by default noise +#include +#include +#ifndef __APPLE__ +# include +#endif +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +#ifndef _XM_NO_INTRINSICS_ +# ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4987) +// C4987: Off by default noise +# include +# pragma warning(pop) +# endif + +#ifdef _XM_SSE_INTRINSICS_ +#include +#include + +#ifdef _XM_SSE3_INTRINSICS_ +#include +#endif + +#ifdef _XM_SSE4_INTRINSICS_ +#include +#endif + +#ifdef _XM_AVX_INTRINSICS_ +#include +#endif + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) +#include +#else +#include +#ifdef _XM_ARM_NEON_NO_ALIGN_ +#define vld1_u32_ex(s,a) vld1_u32(s) +#define vld1_f32_ex(s,a) vld1_f32(s) +#define vld1q_u32_ex(s,a) vld1q_u32(s) +#define vld1q_f32_ex(s,a) vld1q_f32(s) +#define vst1_u32_ex(d,v,a) vst1_u32(d,v) +#define vst1_f32_ex(d,v,a) vst1_f32(d,v) +#define vst1q_u32_ex(d,v,a) vst1q_u32(d,v) +#define vst1q_f32_ex(d,v,a) vst1q_f32(d,v) +#endif +#endif +#endif +#endif // !_XM_NO_INTRINSICS_ + +#ifdef _MSC_VER +# include +#else +# define _In_ +# define _In_reads_(n) +# define _In_reads_bytes_(n) +# define _Out_ +# define _Out_writes_(n) +# 
define _Out_writes_bytes_(n) +# define _Out_opt_ +# define _Success_(expr) +# define _Use_decl_annotations_ +# define _Analysis_assume_(expr) +#endif + +#include + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4005 4668) +#endif +// C4005/4668: Old header issue +#include +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +/**************************************************************************** + * + * Conditional intrinsics + * + ****************************************************************************/ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#if defined(_XM_NO_MOVNT_) +#define XM_STREAM_PS( p, a ) _mm_store_ps( p, a ) +#define XM_SFENCE() +#else +#define XM_STREAM_PS( p, a ) _mm_stream_ps( p, a ) +#define XM_SFENCE() _mm_sfence() +#endif + +#if defined(_XM_AVX_INTRINSICS_) +#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c ) +#else +#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c ) +#endif + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +namespace DirectX +{ + +/**************************************************************************** + * + * Constant definitions + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XM_PI) +#undef XM_PI +#undef XM_2PI +#undef XM_1DIVPI +#undef XM_1DIV2PI +#undef XM_PIDIV2 +#undef XM_PIDIV4 +#undef XM_SELECT_0 +#undef XM_SELECT_1 +#undef XM_PERMUTE_0X +#undef XM_PERMUTE_0Y +#undef XM_PERMUTE_0Z +#undef XM_PERMUTE_0W +#undef XM_PERMUTE_1X +#undef XM_PERMUTE_1Y +#undef XM_PERMUTE_1Z +#undef XM_PERMUTE_1W +#undef XM_CRMASK_CR6 +#undef XM_CRMASK_CR6TRUE +#undef XM_CRMASK_CR6FALSE +#undef XM_CRMASK_CR6BOUNDS +#undef XM_CACHE_LINE_SIZE +#endif + +XM_CONST float XM_PI = 3.141592654f; +XM_CONST float XM_2PI = 6.283185307f; +XM_CONST float XM_1DIVPI = 0.318309886f; +XM_CONST float XM_1DIV2PI = 0.159154943f; +XM_CONST float XM_PIDIV2 = 1.570796327f; +XM_CONST float XM_PIDIV4 = 0.785398163f; + 
+XM_CONST uint32_t XM_SELECT_0 = 0x00000000; +XM_CONST uint32_t XM_SELECT_1 = 0xFFFFFFFF; + +XM_CONST uint32_t XM_PERMUTE_0X = 0; +XM_CONST uint32_t XM_PERMUTE_0Y = 1; +XM_CONST uint32_t XM_PERMUTE_0Z = 2; +XM_CONST uint32_t XM_PERMUTE_0W = 3; +XM_CONST uint32_t XM_PERMUTE_1X = 4; +XM_CONST uint32_t XM_PERMUTE_1Y = 5; +XM_CONST uint32_t XM_PERMUTE_1Z = 6; +XM_CONST uint32_t XM_PERMUTE_1W = 7; + +XM_CONST uint32_t XM_SWIZZLE_X = 0; +XM_CONST uint32_t XM_SWIZZLE_Y = 1; +XM_CONST uint32_t XM_SWIZZLE_Z = 2; +XM_CONST uint32_t XM_SWIZZLE_W = 3; + +XM_CONST uint32_t XM_CRMASK_CR6 = 0x000000F0; +XM_CONST uint32_t XM_CRMASK_CR6TRUE = 0x00000080; +XM_CONST uint32_t XM_CRMASK_CR6FALSE = 0x00000020; +XM_CONST uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE; + +XM_CONST size_t XM_CACHE_LINE_SIZE = 64; + + +/**************************************************************************** + * + * Macros + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue) +#undef XMComparisonAllTrue +#undef XMComparisonAnyTrue +#undef XMComparisonAllFalse +#undef XMComparisonAnyFalse +#undef XMComparisonMixed +#undef XMComparisonAllInBounds +#undef XMComparisonAnyOutOfBounds +#endif + +// Unit conversion + +inline XM_CONSTEXPR float XMConvertToRadians(float fDegrees) { return fDegrees * (XM_PI / 180.0f); } +inline XM_CONSTEXPR float XMConvertToDegrees(float fRadians) { return fRadians * (180.0f / XM_PI); } + +// Condition register evaluation proceeding a recording (R) comparison + +inline bool XMComparisonAllTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE); } +inline bool XMComparisonAnyTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAllFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAnyFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) != 
XM_CRMASK_CR6TRUE); } +inline bool XMComparisonMixed(uint32_t CR) { return (((CR) & XM_CRMASK_CR6) == 0); } +inline bool XMComparisonAllInBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS); } +inline bool XMComparisonAnyOutOfBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS); } + + +/**************************************************************************** + * + * Data types + * + ****************************************************************************/ + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4068 4201 4365 4324 4820) +// C4068: ignore unknown pragmas +// C4201: nonstandard extension used : nameless struct/union +// C4365: Off by default noise +// C4324/4820: padding warnings + +# ifdef _PREFAST_ +# pragma prefast(push) +# pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +# endif +#endif + +//------------------------------------------------------------------------------ +#if defined(_XM_NO_INTRINSICS_) +struct __vector4 +{ + union + { + float vector4_f32[4]; + uint32_t vector4_u32[4]; + }; +}; +#endif // _XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ +// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte +// boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef __m128 XMVECTOR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef float32x4_t XMVECTOR; +#else +typedef __vector4 XMVECTOR; +#endif + +// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR FXMVECTOR; +#else +typedef const XMVECTOR& FXMVECTOR; +#endif + +// Fix-up for (4th) XMVECTOR parameter to pass 
in-register for ARM, ARM64, and x64 vector call; by reference otherwise +#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR GXMVECTOR; +#else +typedef const XMVECTOR& GXMVECTOR; +#endif + +// Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR HXMVECTOR; +#else +typedef const XMVECTOR& HXMVECTOR; +#endif + +// Fix-up for (7th+) XMVECTOR parameters to pass by reference +typedef const XMVECTOR& CXMVECTOR; + +//------------------------------------------------------------------------------ +// Conversion types for constants +struct alignas(16) XMVECTORF32 +{ + union + { + float f[4]; + XMVECTOR v; + }; + +#if !defined(_MSC_VER) && defined(_XM_SSE_INTRINSICS_) + XMVECTORF32() = default; + XMVECTORF32(std::initializer_list l) + { + int co(0); + for (auto val : l) + { + f[co] = val; + if ((++co) > 3) break; + } + } +#endif + + inline operator XMVECTOR() const { return v; } + inline operator const float*() const { return f; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +struct alignas(16) XMVECTORI32 +{ + union + { + int32_t i[4]; + XMVECTOR v; + }; + +#if !defined(_MSC_VER) && defined(_XM_SSE_INTRINSICS_) + XMVECTORI32() = default; + XMVECTORI32(std::initializer_list l) + { + int co(0); + for (auto val : l) + { + i[co] = val; + if ((++co) > 3) break; + } + } +#endif + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return 
_mm_castps_pd(v); } +#endif +}; + +struct alignas(16) XMVECTORU8 +{ + union + { + uint8_t u[16]; + XMVECTOR v; + }; + +#if !defined(_MSC_VER) && defined(_XM_SSE_INTRINSICS_) + XMVECTORU8() = default; + XMVECTORU8(std::initializer_list l) + { + int co(0); + for (auto val : l) + { + u[co] = val; + if ((++co) > 3) break; + } + } +#endif + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +struct alignas(16) XMVECTORU32 +{ + union + { + uint32_t u[4]; + XMVECTOR v; + }; + +#if !defined(_MSC_VER) && defined(_XM_SSE_INTRINSICS_) + XMVECTORU32() = default; + XMVECTORU32(std::initializer_list l) + { + int co(0); + for (auto val : l) + { + u[co] = val; + if ((++co) > 3) break; + } + } +#endif + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +//------------------------------------------------------------------------------ +// Vector operators + +#ifndef _XM_NO_XMVECTOR_OVERLOADS_ +XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V); +XMVECTOR XM_CALLCONV operator- (FXMVECTOR V); + +XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2); + +XMVECTOR& operator*= (XMVECTOR& V, float S); +XMVECTOR& operator/= (XMVECTOR& V, float S); + +XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2); 
+XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S); +XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V); +XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S); +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + +//------------------------------------------------------------------------------ +// Matrix type: Sixteen 32 bit floating point components aligned on a +// 16 byte boundary and mapped to four hardware vector registers + +struct XMMATRIX; + +// Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMMATRIX FXMMATRIX; +#else +typedef const XMMATRIX& FXMMATRIX; +#endif + +// Fix-up for (2nd+) XMMATRIX parameters to pass by reference +typedef const XMMATRIX& CXMMATRIX; + +#ifdef _XM_NO_INTRINSICS_ +struct XMMATRIX +#else +struct alignas(16) XMMATRIX +#endif +{ +#ifdef _XM_NO_INTRINSICS_ + union + { + XMVECTOR r[4]; + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; +#else + XMVECTOR r[4]; +#endif + + XMMATRIX() = default; + + XMMATRIX(const XMMATRIX&) = default; + +#if defined(_MSC_VER) && (_MSC_FULL_VER < 191426431) + XMMATRIX& operator= (const XMMATRIX& M) noexcept { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } +#else + XMMATRIX& operator=(const XMMATRIX&) = default; + + XMMATRIX(XMMATRIX&&) = default; + XMMATRIX& operator=(XMMATRIX&&) = default; +#endif + + constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) : r{ R0,R1,R2,R3 } {} + XMMATRIX(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33); + explicit XMMATRIX(_In_reads_(16) const float *pArray); + +#ifdef _XM_NO_INTRINSICS_ + float operator() (size_t 
Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +#endif + + XMMATRIX operator+ () const { return *this; } + XMMATRIX operator- () const; + + XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M); + XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M); + XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M); + XMMATRIX& operator*= (float S); + XMMATRIX& operator/= (float S); + + XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const; + XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const; + XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const; + XMMATRIX operator* (float S) const; + XMMATRIX operator/ (float S) const; + + friend XMMATRIX XM_CALLCONV operator* (float S, FXMMATRIX M); +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit floating point components +struct XMFLOAT2 +{ + float x; + float y; + + XMFLOAT2() = default; + + XMFLOAT2(const XMFLOAT2&) = default; + XMFLOAT2& operator=(const XMFLOAT2&) = default; + + XMFLOAT2(XMFLOAT2&&) = default; + XMFLOAT2& operator=(XMFLOAT2&&) = default; + + XM_CONSTEXPR XMFLOAT2(float _x, float _y) : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {} +}; + +// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary +struct alignas(16) XMFLOAT2A : public XMFLOAT2 +{ + XMFLOAT2A() = default; + + XMFLOAT2A(const XMFLOAT2A&) = default; + XMFLOAT2A& operator=(const XMFLOAT2A&) = default; + + XMFLOAT2A(XMFLOAT2A&&) = default; + XMFLOAT2A& operator=(XMFLOAT2A&&) = default; + + XM_CONSTEXPR XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {} + explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {} +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit signed integer components +struct XMINT2 +{ + int32_t x; + int32_t y; + + XMINT2() = default; + + XMINT2(const XMINT2&) = default; + 
XMINT2& operator=(const XMINT2&) = default; + + XMINT2(XMINT2&&) = default; + XMINT2& operator=(XMINT2&&) = default; + + XM_CONSTEXPR XMINT2(int32_t _x, int32_t _y) : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t *pArray) : x(pArray[0]), y(pArray[1]) {} +}; + +// 2D Vector; 32 bit unsigned integer components +struct XMUINT2 +{ + uint32_t x; + uint32_t y; + + XMUINT2() = default; + + XMUINT2(const XMUINT2&) = default; + XMUINT2& operator=(const XMUINT2&) = default; + + XMUINT2(XMUINT2&&) = default; + XMUINT2& operator=(XMUINT2&&) = default; + + XM_CONSTEXPR XMUINT2(uint32_t _x, uint32_t _y) : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]) {} +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit floating point components +struct XMFLOAT3 +{ + float x; + float y; + float z; + + XMFLOAT3() = default; + + XMFLOAT3(const XMFLOAT3&) = default; + XMFLOAT3& operator=(const XMFLOAT3&) = default; + + XMFLOAT3(XMFLOAT3&&) = default; + XMFLOAT3& operator=(XMFLOAT3&&) = default; + + XM_CONSTEXPR XMFLOAT3(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} +}; + +// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary +struct alignas(16) XMFLOAT3A : public XMFLOAT3 +{ + XMFLOAT3A() = default; + + XMFLOAT3A(const XMFLOAT3A&) = default; + XMFLOAT3A& operator=(const XMFLOAT3A&) = default; + + XMFLOAT3A(XMFLOAT3A&&) = default; + XMFLOAT3A& operator=(XMFLOAT3A&&) = default; + + XM_CONSTEXPR XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {} + explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {} +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit signed integer components +struct XMINT3 +{ + int32_t x; + int32_t y; + int32_t z; + + XMINT3() = 
default; + + XMINT3(const XMINT3&) = default; + XMINT3& operator=(const XMINT3&) = default; + + XMINT3(XMINT3&&) = default; + XMINT3& operator=(XMINT3&&) = default; + + XM_CONSTEXPR XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} +}; + +// 3D Vector; 32 bit unsigned integer components +struct XMUINT3 +{ + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() = default; + + XMUINT3(const XMUINT3&) = default; + XMUINT3& operator=(const XMUINT3&) = default; + + XMUINT3(XMUINT3&&) = default; + XMUINT3& operator=(XMUINT3&&) = default; + + XM_CONSTEXPR XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit floating point components +struct XMFLOAT4 +{ + float x; + float y; + float z; + float w; + + XMFLOAT4() = default; + + XMFLOAT4(const XMFLOAT4&) = default; + XMFLOAT4& operator=(const XMFLOAT4&) = default; + + XMFLOAT4(XMFLOAT4&&) = default; + XMFLOAT4& operator=(XMFLOAT4&&) = default; + + XM_CONSTEXPR XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} +}; + +// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary +struct alignas(16) XMFLOAT4A : public XMFLOAT4 +{ + XMFLOAT4A() = default; + + XMFLOAT4A(const XMFLOAT4A&) = default; + XMFLOAT4A& operator=(const XMFLOAT4A&) = default; + + XMFLOAT4A(XMFLOAT4A&&) = default; + XMFLOAT4A& operator=(XMFLOAT4A&&) = default; + + XM_CONSTEXPR XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {} + explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {} +}; + 
+//------------------------------------------------------------------------------ +// 4D Vector; 32 bit signed integer components +struct XMINT4 +{ + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() = default; + + XMINT4(const XMINT4&) = default; + XMINT4& operator=(const XMINT4&) = default; + + XMINT4(XMINT4&&) = default; + XMINT4& operator=(XMINT4&&) = default; + + XM_CONSTEXPR XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} +}; + +// 4D Vector; 32 bit unsigned integer components +struct XMUINT4 +{ + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() = default; + + XMUINT4(const XMUINT4&) = default; + XMUINT4& operator=(const XMUINT4&) = default; + + XMUINT4(XMUINT4&&) = default; + XMUINT4& operator=(XMUINT4&&) = default; + + XM_CONSTEXPR XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} +}; + +//------------------------------------------------------------------------------ +// 3x3 Matrix: 32 bit floating point components +struct XMFLOAT3X3 +{ + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() = default; + + XMFLOAT3X3(const XMFLOAT3X3&) = default; + XMFLOAT3X3& operator=(const XMFLOAT3X3&) = default; + + XMFLOAT3X3(XMFLOAT3X3&&) = default; + XMFLOAT3X3& operator=(XMFLOAT3X3&&) = default; + + XM_CONSTEXPR XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22) {} + explicit XMFLOAT3X3(_In_reads_(9) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return 
m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +}; + +//------------------------------------------------------------------------------ +// 4x3 Row-major Matrix: 32 bit floating point components +struct XMFLOAT4X3 +{ + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + float _41, _42, _43; + }; + float m[4][3]; + float f[12]; + }; + + XMFLOAT4X3() = default; + + XMFLOAT4X3(const XMFLOAT4X3&) = default; + XMFLOAT4X3& operator=(const XMFLOAT4X3&) = default; + + XMFLOAT4X3(XMFLOAT4X3&&) = default; + XMFLOAT4X3& operator=(XMFLOAT4X3&&) = default; + + XM_CONSTEXPR XMFLOAT4X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22), + _41(m30), _42(m31), _43(m32) {} + explicit XMFLOAT4X3(_In_reads_(12) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +}; + +// 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte boundary +struct alignas(16) XMFLOAT4X3A : public XMFLOAT4X3 +{ + XMFLOAT4X3A() = default; + + XMFLOAT4X3A(const XMFLOAT4X3A&) = default; + XMFLOAT4X3A& operator=(const XMFLOAT4X3A&) = default; + + XMFLOAT4X3A(XMFLOAT4X3A&&) = default; + XMFLOAT4X3A& operator=(XMFLOAT4X3A&&) = default; + + XM_CONSTEXPR XMFLOAT4X3A(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) : + XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {} + explicit XMFLOAT4X3A(_In_reads_(12) const float *pArray) : XMFLOAT4X3(pArray) {} +}; + +//------------------------------------------------------------------------------ +// 3x4 Column-major Matrix: 32 bit floating point components +struct 
XMFLOAT3X4 +{ + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + }; + float m[3][4]; + float f[12]; + }; + + XMFLOAT3X4() = default; + + XMFLOAT3X4(const XMFLOAT3X4&) = default; + XMFLOAT3X4& operator=(const XMFLOAT3X4&) = default; + + XMFLOAT3X4(XMFLOAT3X4&&) = default; + XMFLOAT3X4& operator=(XMFLOAT3X4&&) = default; + + XM_CONSTEXPR XMFLOAT3X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23) {} + explicit XMFLOAT3X4(_In_reads_(12) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +}; + +// 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16 byte boundary +struct alignas(16) XMFLOAT3X4A : public XMFLOAT3X4 +{ + XMFLOAT3X4A() = default; + + XMFLOAT3X4A(const XMFLOAT3X4A&) = default; + XMFLOAT3X4A& operator=(const XMFLOAT3X4A&) = default; + + XMFLOAT3X4A(XMFLOAT3X4A&&) = default; + XMFLOAT3X4A& operator=(XMFLOAT3X4A&&) = default; + + XM_CONSTEXPR XMFLOAT3X4A(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) : + XMFLOAT3X4(m00, m01, m02, m03, m10, m11, m12, m13, m20, m21, m22, m23) {} + explicit XMFLOAT3X4A(_In_reads_(12) const float *pArray) : XMFLOAT3X4(pArray) {} +}; + +//------------------------------------------------------------------------------ +// 4x4 Matrix: 32 bit floating point components +struct XMFLOAT4X4 +{ + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + + XMFLOAT4X4() = default; + + XMFLOAT4X4(const XMFLOAT4X4&) = default; + 
XMFLOAT4X4& operator=(const XMFLOAT4X4&) = default; + + XMFLOAT4X4(XMFLOAT4X4&&) = default; + XMFLOAT4X4& operator=(XMFLOAT4X4&&) = default; + + XM_CONSTEXPR XMFLOAT4X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23), + _41(m30), _42(m31), _43(m32), _44(m33) {} + explicit XMFLOAT4X4(_In_reads_(16) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +}; + +// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary +struct alignas(16) XMFLOAT4X4A : public XMFLOAT4X4 +{ + XMFLOAT4X4A() = default; + + XMFLOAT4X4A(const XMFLOAT4X4A&) = default; + XMFLOAT4X4A& operator=(const XMFLOAT4X4A&) = default; + + XMFLOAT4X4A(XMFLOAT4X4A&&) = default; + XMFLOAT4X4A& operator=(XMFLOAT4X4A&&) = default; + + XM_CONSTEXPR XMFLOAT4X4A(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) + : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {} + explicit XMFLOAT4X4A(_In_reads_(16) const float *pArray) : XMFLOAT4X4(pArray) {} +}; + +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _MSC_VER +# ifdef _PREFAST_ +# pragma prefast(pop) +# endif + +# pragma warning(pop) +#endif + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent); +XMVECTOR XM_CALLCONV 
XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent); +XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent); + +#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + +XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3); +XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant); + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* PSource); +XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XM_CALLCONV 
XMLoadFloat4(_In_ const XMFLOAT4* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource); + +XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat3x4(_In_ const XMFLOAT3X4* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat3x4A(_In_ const XMFLOAT3X4A* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource); + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ 
FXMVECTOR V); +void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat3x4(_Out_ XMFLOAT3X4* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat3x4A(_Out_ XMFLOAT3X4A* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M); + +/**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMVectorZero(); +XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w); +XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w); +XMVECTOR XM_CALLCONV XMVectorReplicate(float Value); +XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float *pValue); +XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value); +XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t *pValue); +XMVECTOR XM_CALLCONV XMVectorTrueInt(); +XMVECTOR XM_CALLCONV XMVectorFalseInt(); +XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V); +XMVECTOR 
XM_CALLCONV XMVectorSplatY(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatOne(); +XMVECTOR XM_CALLCONV XMVectorSplatInfinity(); +XMVECTOR XM_CALLCONV XMVectorSplatQNaN(); +XMVECTOR XM_CALLCONV XMVectorSplatEpsilon(); +XMVECTOR XM_CALLCONV XMVectorSplatSignMask(); + +float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i); +float XM_CALLCONV XMVectorGetX(FXMVECTOR V); +float XM_CALLCONV XMVectorGetY(FXMVECTOR V); +float XM_CALLCONV XMVectorGetZ(FXMVECTOR V); +float XM_CALLCONV XMVectorGetW(FXMVECTOR V); + +void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float *f, _In_ FXMVECTOR V, _In_ size_t i); +void XM_CALLCONV XMVectorGetXPtr(_Out_ float *x, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V); + +uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i); +uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V); + +void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V, _In_ size_t i); +void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V,float f, size_t i); +XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x); +XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y); +XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z); +XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w); + +XMVECTOR XM_CALLCONV 
XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float *f, _In_ size_t i); +XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float *x); +XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float *y); +XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float *z); +XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float *w); + +XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i); +XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x); +XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y); +XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z); +XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w); + +XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x, _In_ size_t i); +XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x); +XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t *y); +XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t *z); +XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t *w); + +#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) +#undef XMVectorSwizzle +#endif + +XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3); +XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW); +XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3); +XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control); +XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2); + +#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) +#undef XMVectorShiftLeft +#undef XMVectorRotateLeft +#undef 
XMVectorRotateRight +#undef XMVectorInsert +#endif + +XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3); + +XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds); +XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds); + +XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1,FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V); +XMVECTOR XM_CALLCONV 
XMVectorTruncate(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max); +XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2); + +XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor); +XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V); +XMVECTOR 
XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles); +XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V); +void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t); +XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t); +XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t); +XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T); +XMVECTOR 
XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g); +XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G); + +/**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V); +XMVECTOR 
XM_CALLCONV XMVector2Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2); +XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M); +XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M); +XMFLOAT2* XM_CALLCONV 
XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); + +/**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V); 
+XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M); +XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + 
_In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M); +XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World); +XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float 
ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World); + +/**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float 
LengthMax); +XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); + +/**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M); +bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M); +bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M); + +XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M); +XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M); +_Success_(return) +bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, _In_ FXMMATRIX M); + +XMMATRIX XM_CALLCONV XMMatrixIdentity(); +XMMATRIX 
XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33); +XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ); +XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset); +XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ); +XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale); +XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll); +XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion); +XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane); +XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition); + +XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, 
FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); + + +/**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2); +bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2); + +bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q); +bool XM_CALLCONV 
XMQuaternionIsInfinite(FXMVECTOR Q); +bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q); + +XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t); +XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T); +XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t); +XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T); +void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3); +XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g); +XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G); + +XMVECTOR XM_CALLCONV XMQuaternionIdentity(); +XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll); +XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle); +XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle); +XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M); + +void XM_CALLCONV 
XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q); + +/**************************************************************************** + * + * Plane operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2); +bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2); + +bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P); +bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P); + +XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P); +XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P); +XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2); +void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2); +XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX M); + +XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3); + +/**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, 
FXMVECTOR C2); +bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2); + +bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C); +bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C); + +XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C); +XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2); +XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation); +XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast); + +XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ); + +XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv ); + +XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ); + +XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ); + +XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ); + +XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ); +XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ); + +XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ); + + +/**************************************************************************** + * + * Miscellaneous operations + * + ****************************************************************************/ + +bool XMVerifyCPUSupport(); + +XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex); + +bool XMScalarNearEqual(float S1, float S2, float Epsilon); +float XMScalarModAngle(float Value); + +float XMScalarSin(float Value); +float XMScalarSinEst(float Value); + +float XMScalarCos(float Value); +float 
XMScalarCosEst(float Value); + +void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value); +void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value); + +float XMScalarASin(float Value); +float XMScalarASinEst(float Value); + +float XMScalarACos(float Value); +float XMScalarACosEst(float Value); + +/**************************************************************************** + * + * Templates + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMMin) +#undef XMMin +#undef XMMax +#endif + +template inline T XMMin(T a, T b) { return (a < b) ? a : b; } +template inline T XMMax(T a, T b) { return (a > b) ? a : b; } + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// PermuteHelper internal template (SSE only) +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE shuffle opcode. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle); + XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) { return XM_PERMUTE_PS(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. 
+ template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2){ return XM_PERMUTE_PS(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +} + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +// General permute template +template + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return Internal::PermuteHelper::Permute(V1, V2); +#else + + return XMVectorPermute( V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW ); + +#endif +} + +// Special-case permute templates +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movelh_ps(V1,V2); } 
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6,7,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movehl_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpacklo_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpackhi_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); } +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR 
V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead +// The mirror cases are not spelled out here as the programmer can always swap the arguments +// (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> 
inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,2,6>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,5,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,2,4,6>(FXMVECTOR V1, FXMVECTOR 
V2) { return vuzpq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,3,5,7>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ + +// General swizzle template +template + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +#else + + return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW ); + +#endif +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { return _mm_movelh_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { return _mm_movehl_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return _mm_unpacklo_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return _mm_unpackhi_ps(V,V); } +#endif + +#if 
defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } +#endif + +#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps( V ); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,1,1>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,2,2>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 0); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,3,3,3>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 1); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,3,2>(FXMVECTOR V) { return vrev64q_f32(V); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,1,0>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,3,2>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,3,2>(FXMVECTOR V) { return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); } +template<> inline XMVECTOR XM_CALLCONV 
XMVectorSwizzle<1,0,2,3>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,1,0>(FXMVECTOR V) { return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,0,1>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,1,0>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return vtrnq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return vtrnq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return vzipq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return vzipq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,2,0,2>(FXMVECTOR V) { return vuzpq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,3,1,3>(FXMVECTOR V) { return vuzpq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_f32(V, V, 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ + +template + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorPermute(V1, V2); +} + +template + inline XMVECTOR XM_CALLCONV 
XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorSwizzle(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft(VS), Control ); +} + +/**************************************************************************** + * + * Globals + * + ****************************************************************************/ + +// The purpose of the following global constants is to prevent redundant +// reloading of the constants when they are referenced by more than one +// separate inline math routine called within the same function. Declaring +// a constant locally within a routine is sufficient to prevent redundant +// reloads of that constant when that single routine is called multiple +// times in a function, but if the constant is used (and declared) in a +// separate math routine it would be reloaded. 
+ +#ifndef XMGLOBALCONST +# ifdef _MSC_VER +# define XMGLOBALCONST extern const __declspec(selectany) +# else +# define XMGLOBALCONST const +# endif +#endif + +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = { -0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f }; +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = { -2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/ }; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = { -0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f }; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = { -2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/ }; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f }; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f }; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f }; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = { +1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f }; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = { +0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f }; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = { -0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f }; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = { -0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f }; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = { +0.999866f, +0.999866f, +0.999866f, +0.999866f }; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = { -0.3302995f, +0.180141f, -0.085133f, +0.0208351f }; +XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = { 2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI }; +XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = { +1.5707288f, -0.2121144f, +0.0742610f, -0.0187293f }; +XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = { XM_PI, XM_2PI, XM_1DIVPI, 
XM_1DIV2PI }; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = { 1.0f, 0.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = { 0.0f, 1.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = { 0.0f, 0.0f, 1.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = { 0.0f, 0.0f, 0.0f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = { -1.0f, 0.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = { 0.0f, -1.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = { 0.0f, 0.0f, -1.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = { 0.0f, 0.0f, 0.0f, -1.0f }; +XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +XMGLOBALCONST XMVECTORU32 g_XMNegate3 = { 0x80000000, 0x80000000, 0x80000000, 0x00000000 }; +XMGLOBALCONST XMVECTORU32 g_XMMaskXY = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +XMGLOBALCONST XMVECTORU32 g_XMMask3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; +XMGLOBALCONST XMVECTORU32 g_XMMaskX = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; +XMGLOBALCONST XMVECTORU32 g_XMMaskY = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +XMGLOBALCONST XMVECTORU32 g_XMMaskZ = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }; +XMGLOBALCONST XMVECTORU32 g_XMMaskW = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; +XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMTwo = { 2.f, 2.f, 2.f, 2.f }; +XMGLOBALCONST XMVECTORF32 g_XMFour = { 4.f, 4.f, 4.f, 4.f }; +XMGLOBALCONST XMVECTORF32 g_XMSix = { 6.f, 6.f, 6.f, 6.f }; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = { -1.0f, -1.0f, -1.0f, -1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f }; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = { -0.5f, -0.5f, -0.5f, -0.5f }; +XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = 
{ -XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI }; +XMGLOBALCONST XMVECTORF32 g_XMNegativePi = { -XM_PI, -XM_PI, -XM_PI, -XM_PI }; +XMGLOBALCONST XMVECTORF32 g_XMHalfPi = { XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2 }; +XMGLOBALCONST XMVECTORF32 g_XMPi = { XM_PI, XM_PI, XM_PI, XM_PI }; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = { XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI }; +XMGLOBALCONST XMVECTORF32 g_XMTwoPi = { XM_2PI, XM_2PI, XM_2PI, XM_2PI }; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = { XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI }; +XMGLOBALCONST XMVECTORF32 g_XMEpsilon = { 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f }; +XMGLOBALCONST XMVECTORI32 g_XMInfinity = { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 }; +XMGLOBALCONST XMVECTORI32 g_XMQNaN = { 0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000 }; +XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = { 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF }; +XMGLOBALCONST XMVECTORI32 g_XMAbsMask = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; +XMGLOBALCONST XMVECTORI32 g_XMFltMin = { 0x00800000, 0x00800000, 0x00800000, 0x00800000 }; +XMGLOBALCONST XMVECTORI32 g_XMFltMax = { 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF }; +XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = { 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 }; +XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = { 0x00000000, 0x00000000, 0x00000000, 0x80000000 }; +XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = { 0.0f, 0.0f, 0.0f, float(0x80000000U) }; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = { 1.0f / (255.0f*float(0x10000)), 1.0f / (255.0f*float(0x100)), 1.0f / 255.0f, 1.0f / (255.0f*float(0x1000000)) }; +XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = { 0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000 }; +XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = { 0x00000200, 0x00080000, 0x20000000, 0x80000000 }; +XMGLOBALCONST XMVECTORF32 
g_XMFixAA2B10G10R10 = { -512.0f, -512.0f*float(0x400), -512.0f*float(0x100000), float(0x80000000U) }; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = { 1.0f / 511.0f, 1.0f / (511.0f*float(0x400)), 1.0f / (511.0f*float(0x100000)), 1.0f / (3.0f*float(0x40000000)) }; +XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = { 0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000 }; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = { 0x00008000, 0x00000000, 0x00000000, 0x00000000 }; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = { -32768.0f, 0.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = { 1.0f / 32767.0f, 1.0f / (32767.0f*65536.0f), 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = { 0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000 }; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = { 0x00008000, 0x00008000, 0x00000000, 0x00000000 }; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = { -32768.0f, -32768.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / (32767.0f*65536.0f), 1.0f / (32767.0f*65536.0f) }; +XMGLOBALCONST XMVECTORF32 g_XMNoFraction = { 8388608.0f, 8388608.0f, 8388608.0f, 8388608.0f }; +XMGLOBALCONST XMVECTORI32 g_XMMaskByte = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF }; +XMGLOBALCONST XMVECTORF32 g_XMNegateX = { -1.0f, 1.0f, 1.0f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f, -1.0f, 1.0f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f, -1.0f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f, -1.0f }; +XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 }; +XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; +XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD }; +XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0 }; +XMGLOBALCONST XMVECTORU32 
g_XMSelect1100 = { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 }; +XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = { XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 }; +XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 }; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = { 1.0f, 1.0f / 65536.0f, 0.0f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = { 1.0f, 1.0f, 1.0f / 65536.0f, 1.0f / 65536.0f }; +XMGLOBALCONST XMVECTORU32 g_XMFlipY = { 0, 0x80000000, 0, 0 }; +XMGLOBALCONST XMVECTORU32 g_XMFlipZ = { 0, 0, 0x80000000, 0 }; +XMGLOBALCONST XMVECTORU32 g_XMFlipW = { 0, 0, 0, 0x80000000 }; +XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = { 0, 0x80000000, 0x80000000, 0 }; +XMGLOBALCONST XMVECTORU32 g_XMFlipZW = { 0, 0, 0x80000000, 0x80000000 }; +XMGLOBALCONST XMVECTORU32 g_XMFlipYW = { 0, 0x80000000, 0, 0x80000000 }; +XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = { 0x3FF, 0x3FF << 10, 0x3FF << 20, static_cast(0xC0000000) }; +XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = { 0x200, 0x200 << 10, 0x200 << 20, 0 }; +XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = { 0, 0, 0, 32768.0f*65536.0f }; +XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = { -512.0f, -512.0f*1024.0f, -512.0f*1024.0f*1024.0f, 0 }; +XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = { 1.0f, 1.0f / 1024.0f, 1.0f / (1024.0f*1024.0f), 1.0f / (1024.0f*1024.0f*1024.0f) }; +XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = { 0xFF, 0xFF00, 0xFF0000, 0xFF000000 }; +XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = { 0x80, 0x8000, 0x800000, 0x00000000 }; +XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = { -128.0f, -128.0f*256.0f, -128.0f*65536.0f, 0 }; +XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = { 32768.0f*65536.0f, 32768.0f*65536.0f, 32768.0f*65536.0f, 32768.0f*65536.0f }; +XMGLOBALCONST XMVECTORF32 g_XMMaxInt = { 65536.0f*32768.0f - 128.0f, 65536.0f*32768.0f - 128.0f, 65536.0f*32768.0f - 128.0f, 65536.0f*32768.0f - 128.0f }; +XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = { 65536.0f*65536.0f - 256.0f, 65536.0f*65536.0f - 256.0f, 
65536.0f*65536.0f - 256.0f, 65536.0f*65536.0f - 256.0f }; +XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = { 32768.0f*65536.0f, 32768.0f*65536.0f, 32768.0f*65536.0f, 32768.0f*65536.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { 12.92f, 12.92f, 12.92f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { 0.055f, 0.055f, 0.055f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { 1.055f, 1.055f, 1.055f, 1.0f }; +XMGLOBALCONST XMVECTORI32 g_XMExponentBias = { 127, 127, 127, 127 }; +XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = { -126, -126, -126, -126 }; +XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = { 23, 23, 23, 23 }; +XMGLOBALCONST XMVECTORI32 g_XMMinNormal = { 0x00800000, 0x00800000, 0x00800000, 0x00800000 }; +XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = { 0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000 }; +XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = { 0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000 }; +XMGLOBALCONST XMVECTORI32 g_XMBin128 = { 0x43000000, 0x43000000, 0x43000000, 0x43000000 }; +XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = { 0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000 }; +XMGLOBALCONST XMVECTORI32 g_XM253 = { 253, 253, 253, 253 }; +XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = { -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f }; +XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = { +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f }; +XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = { -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f }; +XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = { +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f }; +XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = { -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f }; +XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = { +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f }; +XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = { -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = { +1.442693f, 
+1.442693f, +1.442693f, +1.442693f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = { -0.721242f, -0.721242f, -0.721242f, -0.721242f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = { +0.479384f, +0.479384f, +0.479384f, +0.479384f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = { -0.350295f, -0.350295f, -0.350295f, -0.350295f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = { +0.248590f, +0.248590f, +0.248590f, +0.248590f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = { -0.145700f, -0.145700f, -0.145700f, -0.145700f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = { +0.057148f, +0.057148f, +0.057148f, +0.057148f }; +XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = { -0.010578f, -0.010578f, -0.010578f, -0.010578f }; +XMGLOBALCONST XMVECTORF32 g_XMLgE = { +1.442695f, +1.442695f, +1.442695f, +1.442695f }; +XMGLOBALCONST XMVECTORF32 g_XMInvLgE = { +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f }; +XMGLOBALCONST XMVECTORF32 g_UByteMax = { 255.0f, 255.0f, 255.0f, 255.0f }; +XMGLOBALCONST XMVECTORF32 g_ByteMin = { -127.0f, -127.0f, -127.0f, -127.0f }; +XMGLOBALCONST XMVECTORF32 g_ByteMax = { 127.0f, 127.0f, 127.0f, 127.0f }; +XMGLOBALCONST XMVECTORF32 g_ShortMin = { -32767.0f, -32767.0f, -32767.0f, -32767.0f }; +XMGLOBALCONST XMVECTORF32 g_ShortMax = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; +XMGLOBALCONST XMVECTORF32 g_UShortMax = { 65535.0f, 65535.0f, 65535.0f, 65535.0f }; + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101) +// C4068/4616: ignore unknown pragmas +// C4214/4204: nonstandard extension used +// C4365/4640: Off by default noise +// C6001/6101: False positives + +# ifdef _PREFAST_ +# pragma prefast(push) +# pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +# pragma prefast(disable : 26495, "Union initialization confuses 
/analyze") +# endif +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = (0-(C3&1)) & 0x3F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = (0-(C3&1)) & 0x3F800000; + return vResult.v; +#else // XM_SSE_INTRINSICS_ + static const XMVECTORU32 g_vMask1 = { 1, 1, 1, 1 }; + // Move the parms to a vector + __m128i vTemp = _mm_set_epi32(static_cast(C3), static_cast(C2), static_cast(C1), static_cast(C0)); + // Mask off the low bits + vTemp = _mm_and_si128(vTemp,g_vMask1); + // 0xFFFFFFFF on true bits + vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1); + // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f + vTemp = _mm_and_si128(vTemp,g_XMOne); + return _mm_castsi128_ps(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); + assert( DivExponent < 32 ); +#if defined(_XM_NO_INTRINSICS_) + + using DirectX::XMConvertVectorIntToFloat; + + XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant }; + return XMConvertVectorIntToFloat( V.v, DivExponent); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Splat the int + int32x4_t vScale = vdupq_n_s32(IntConstant); + // Convert to a float + XMVECTOR vResult = vcvtq_f32_s32(vScale); + // Convert DivExponent into 1.0f/(1<(&vScale)[0]); + return vResult; +#else // XM_SSE_INTRINSICS_ + // Splat the int + __m128i vScale = 
_mm_set1_epi32(IntConstant); + // Convert to a float + XMVECTOR vResult = _mm_cvtepi32_ps(vScale); + // Convert DivExponent into 1.0f/(1<(uScale)); + // Multiply by the reciprocal (Perform a right shift by DivExponent) + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } }; + return V.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t V = vdupq_n_s32( IntConstant ); + return reinterpret_cast(&V)[0]; +#else // XM_SSE_INTRINSICS_ + __m128i V = _mm_set1_epi32( IntConstant ); + return _mm_castsi128_ps(V); +#endif +} + +#include "DirectXMathConvert.inl" +#include "DirectXMathVector.inl" +#include "DirectXMathMatrix.inl" +#include "DirectXMathMisc.inl" + +#ifdef _MSC_VER +# ifdef _PREFAST_ +# pragma prefast(pop) +# endif + +# pragma warning(pop) +#endif + +} // namespace DirectX + diff --git a/WickedEngine/Utility/DirectXMathCommon.h b/WickedEngine/Utility/DirectXMathCommon.h new file mode 100644 index 000000000..cc119658e --- /dev/null +++ b/WickedEngine/Utility/DirectXMathCommon.h @@ -0,0 +1,75 @@ +#pragma once + +#include + +#if defined(__ARM_ARCH) || defined(_M_ARM) || defined(_M_ARM64) +# define BUILD_ARCH_ARM 1 +#elif defined(_MSC_VER) +# if defined(_M_IX86) || defined(_M_X64) /* MSVC spells its 32-bit x86 macro _M_IX86; _M_X86 is never predefined */ +# define BUILD_ARCH_X86 1 +# endif +#elif defined(__GNUC__) || defined(__clang__) +# if defined(__i386__) || defined(__x86_64__) +# define BUILD_ARCH_X86 1 +# endif +#endif + +#if defined(_WIN32) +# define BUILD_PLATFORM_WIN 1 +#elif defined(__APPLE__) +# include "TargetConditionals.h" +# if TARGET_OS_IPHONE /* always defined (to 0 or 1) by TargetConditionals.h, so test the value, not defined() */ +# define BUILD_PLATFORM_IOS 1 +# else +# define BUILD_PLATFORM_OSX 1 +# endif +#elif defined(__ANDROID__) /* must be tested before Linux: Android toolchains also define __linux__ */ +# define BUILD_PLATFORM_ANDROID 1 +#elif defined(__linux__) || 
defined(__gnu_linux__) || defined(linux) +# define BUILD_PLATFORM_LINUX 1 +#elif defined(__unix__) +# define BUILD_PLATFORM_UNIX 1 +#endif + +/* + * BUILD_INTRINSICS_LEVEL 0..3. Try setting different levels and see what compiles/runs/doesn't crash. + * 0 _XM_NO_INTRINSICS_ + * 1 Some + * 2 More + * 3 All + */ +#if defined(BUILD_PLATFORM_WIN) +# if !defined(BUILD_INTRINSICS_LEVEL) +# define BUILD_INTRINSICS_LEVEL 3 +# endif +#else +# if !defined(BUILD_INTRINSICS_LEVEL) +# define BUILD_INTRINSICS_LEVEL 1 +# endif +#endif + +#if defined(BUILD_ARCH_ARM) +# if defined(__ARM_NEON) && BUILD_INTRINSICS_LEVEL > 0 +# define _XM_ARM_NEON_INTRINSICS_ +# else +# define _XM_NO_INTRINSICS_ +# endif +#else +# if BUILD_INTRINSICS_LEVEL > 0 +# define _XM_SSE_INTRINSICS_ +# endif +# if BUILD_INTRINSICS_LEVEL > 1 +# define _XM_SSE3_INTRINSICS_ +# define _XM_SSE4_INTRINSICS_ +# define _XM_AVX_INTRINSICS_ +# endif +# if BUILD_INTRINSICS_LEVEL > 2 +# define _XM_F16C_INTRINSICS_ +# endif +#endif +#if defined(__GNUC__) || defined(BUILD_PLATFORM_IOS) +# define _XM_NO_CALL_CONVENTION_ +#endif +#if defined(BUILD_PLATFORM_IOS) || defined(BUILD_PLATFORM_ANDROID) +# define _XM_ARM_NEON_NO_ALIGN_ +#endif diff --git a/WickedEngine/Utility/DirectXMathConvert.inl b/WickedEngine/Utility/DirectXMathConvert.inl new file mode 100644 index 000000000..32de29d45 --- /dev/null +++ b/WickedEngine/Utility/DirectXMathConvert.inl @@ -0,0 +1,2181 @@ +//------------------------------------------------------------------------------------- +// DirectXMathConvert.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4701) +// C4701: false positives +#endif + +inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat +( + FXMVECTOR VInt, + uint32_t DivExponent +) +{ + assert(DivExponent<32); +#if defined(_XM_NO_INTRINSICS_) + float fScale = 1.0f / static_cast(1U << DivExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + auto iTemp = static_cast(VInt.vector4_u32[ElementIndex]); + Result.vector4_f32[ElementIndex] = static_cast(iTemp) * fScale; + } while (++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fScale = 1.0f / (float)(1U << DivExponent); + float32x4_t vResult = vcvtq_f32_s32( VInt ); + return vmulq_n_f32( vResult, fScale ); +#else // _XM_SSE_INTRINSICS_ + // Convert to floats + XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt)); + // Convert DivExponent into 1.0f/(1<(uScale)); + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt +( + FXMVECTOR VFloat, + uint32_t MulExponent +) +{ + assert(MulExponent<32); +#if defined(_XM_NO_INTRINSICS_) + // Get the scalar factor. 
+ auto fScale = static_cast(1U << MulExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + int32_t iResult; + float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; + if (fTemp <= -(65536.0f*32768.0f)) { + iResult = (-0x7FFFFFFF)-1; + } else if (fTemp > (65536.0f*32768.0f)-128.0f) { + iResult = 0x7FFFFFFF; + } else { + iResult = static_cast(fTemp); + } + Result.vector4_u32[ElementIndex] = static_cast(iResult); + } while (++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent)); + // In case of positive overflow, detect it + uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxInt); + // Float to int conversion + int32x4_t vResulti = vcvtq_s32_f32(vResult); + // If there was positive overflow, set to 0x7FFFFFFF + vResult = vandq_u32(vOverflow,g_XMAbsMask); + vOverflow = vbicq_u32(vResulti,vOverflow); + vOverflow = vorrq_u32(vOverflow,vResult); + return vOverflow; +#else // _XM_SSE_INTRINSICS_ + XMVECTOR vResult = _mm_set_ps1(static_cast(1U << MulExponent)); + vResult = _mm_mul_ps(vResult,VFloat); + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(vResult); + // If there was positive overflow, set to 0x7FFFFFFF + vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + return vOverflow; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat +( + FXMVECTOR VUInt, + uint32_t DivExponent +) +{ + assert(DivExponent<32); +#if defined(_XM_NO_INTRINSICS_) + float fScale = 1.0f / static_cast(1U << DivExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + Result.vector4_f32[ElementIndex] = static_cast(VUInt.vector4_u32[ElementIndex]) * fScale; + } while 
(++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fScale = 1.0f / (float)(1U << DivExponent); + float32x4_t vResult = vcvtq_f32_u32( VUInt ); + return vmulq_n_f32( vResult, fScale ); +#else // _XM_SSE_INTRINSICS_ + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(VUInt,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + // Convert DivExponent into 1.0f/(1<(uScale)); + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt +( + FXMVECTOR VFloat, + uint32_t MulExponent +) +{ + assert(MulExponent<32); +#if defined(_XM_NO_INTRINSICS_) + // Get the scalar factor. 
+ auto fScale = static_cast(1U << MulExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + uint32_t uResult; + float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; + if (fTemp <= 0.0f) { + uResult = 0; + } else if (fTemp >= (65536.0f*65536.0f)) { + uResult = 0xFFFFFFFFU; + } else { + uResult = static_cast(fTemp); + } + Result.vector4_u32[ElementIndex] = uResult; + } while (++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmulq_n_f32(VFloat,(float)(1U << MulExponent)); + // In case of overflow, detect it + uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxUInt); + // Float to int conversion + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + // If there was overflow, set to 0xFFFFFFFFU + vResult = vbicq_u32(vResulti,vOverflow); + vOverflow = vorrq_u32(vOverflow,vResult); + return vOverflow; +#else // _XM_SSE_INTRINSICS_ + XMVECTOR vResult = _mm_set_ps1(static_cast(1U << MulExponent)); + vResult = _mm_mul_ps(vResult,VFloat); + // Clamp to >=0 + vResult = _mm_max_ps(vResult,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + return vResult; +#endif +} + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = *pSource; + V.vector4_u32[1] = 0; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t zero = vdupq_n_u32(0); + return vld1q_lane_u32( pSource, zero, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss( reinterpret_cast(pSource) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = *pSource; + V.vector4_f32[1] = 0.f; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t zero = vdupq_n_f32(0); + return vld1q_lane_f32( pSource, zero, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss( pSource 
); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt2 +( + const uint32_t* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32( pSource ); + uint32x2_t zero = vdup_n_u32(0); + return vcombine_u32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast(pSource) ); + __m128 y = _mm_load_ss( reinterpret_cast(pSource+1) ); + return _mm_unpacklo_ps( x, y ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt2A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32_ex( pSource, 64 ); + uint32x2_t zero = vdup_n_u32(0); + return vcombine_u32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2 +( + const XMFLOAT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32( reinterpret_cast(pSource) ); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32( x, zero ); 
+#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + return _mm_unpacklo_ps( x, y ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2A +( + const XMFLOAT2A* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32_ex( reinterpret_cast(pSource), 64 ); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt2 +( + const XMINT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32( reinterpret_cast(pSource) ); + float32x2_t v = vcvt_f32_s32( x ); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32( v, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast(&pSource->y) ); + __m128 V = _mm_unpacklo_ps( x, y ); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt2 +( + const XMUINT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) 
+ XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32( reinterpret_cast(pSource) ); + float32x2_t v = vcvt_f32_u32( x ); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32( v, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast(&pSource->y) ); + __m128 V = _mm_unpacklo_ps( x, y ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3 +( + const uint32_t* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32( pSource ); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 ); + return vcombine_u32( x, y ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast(pSource) ); + __m128 y = _mm_load_ss( reinterpret_cast(pSource+1) ); + __m128 z = _mm_load_ss( reinterpret_cast(pSource+2) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + 
return _mm_movelh_ps( xy, z ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra integer which is zero'd + uint32x4_t V = vld1q_u32_ex( pSource, 128 ); + return vsetq_lane_u32( 0, V, 3 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra integer which is zero'd + __m128i V = _mm_load_si128( reinterpret_cast(pSource) ); + V = _mm_and_si128( V, g_XMMask3 ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3 +( + const XMFLOAT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32( reinterpret_cast(pSource) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t y = vld1_lane_f32( reinterpret_cast(pSource)+2, zero, 0 ); + return vcombine_f32( x, y ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + __m128 z = _mm_load_ss( &pSource->z ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3A +( + const XMFLOAT3A* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if 
defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra float which is zero'd + float32x4_t V = vld1q_f32_ex( reinterpret_cast(pSource), 128 ); + return vsetq_lane_f32( 0, V, 3 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra float which is zero'd + __m128 V = _mm_load_ps( &pSource->x ); + return _mm_and_ps( V, g_XMMask3 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt3 +( + const XMINT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32( reinterpret_cast(pSource) ); + int32x2_t zero = vdup_n_s32(0); + int32x2_t y = vld1_lane_s32( reinterpret_cast(pSource)+2, zero, 0 ); + int32x4_t v = vcombine_s32( x, y ); + return vcvtq_f32_s32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast(&pSource->y) ); + __m128 z = _mm_load_ss( reinterpret_cast(&pSource->z) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + __m128 V = _mm_movelh_ps( xy, z ); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt3 +( + const XMUINT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; 
+#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32( reinterpret_cast(pSource) ); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32( reinterpret_cast(pSource)+2, zero, 0 ); + uint32x4_t v = vcombine_u32( x, y ); + return vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast(&pSource->y) ); + __m128 z = _mm_load_ss( reinterpret_cast(&pSource->z) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + __m128 V = _mm_movelh_ps( xy, z ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4 +( + const uint32_t* pSource +) +{ + assert(pSource); + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_u32( pSource ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast(pSource) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4A +( + const uint32_t* pSource +) +{ + assert(pSource); + 
assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_u32_ex( pSource, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_load_si128( reinterpret_cast(pSource) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4 +( + const XMFLOAT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32( reinterpret_cast(pSource) ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_loadu_ps( &pSource->x ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4A +( + const XMFLOAT4A* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32_ex( reinterpret_cast(pSource), 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps( &pSource->x ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt4 +( + const XMINT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + 
V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vld1q_s32( reinterpret_cast(pSource) ); + return vcvtq_f32_s32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast(pSource) ); + return _mm_cvtepi32_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt4 +( + const XMUINT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vld1q_u32( reinterpret_cast(pSource) ); + return vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast(pSource) ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x3 +( + const XMFLOAT3X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32( &pSource->m[0][0] ); + float32x4_t v1 = vld1q_f32( &pSource->m[1][1] ); + float32x2_t v2 = vcreate_f32(static_cast(*reinterpret_cast(&pSource->m[2][2]))); + float32x4_t T = vextq_f32( v0, v1, 3 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T, g_XMMask3 ); + M.r[2] = vcombine_f32( vget_high_f32(v1), v2 ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] ); + __m128 V2 
= _mm_loadu_ps( &pSource->m[1][1] ); + __m128 V3 = _mm_load_ss( &pSource->m[2][2] ); + + __m128 T1 = _mm_unpackhi_ps( V1, Z ); + __m128 T2 = _mm_unpacklo_ps( V2, Z ); + __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) ); + __m128 T4 = _mm_movehl_ps( T2, T3 ); + __m128 T5 = _mm_movehl_ps( Z, T1 ); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps( V1, T1 ); + M.r[1] = _mm_add_ps( T4, T5 ); + M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) ); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3 +( + const XMFLOAT4X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32( &pSource->m[0][0] ); + float32x4_t v1 = vld1q_f32( &pSource->m[1][1] ); + float32x4_t v2 = vld1q_f32( &pSource->m[2][2] ); + + float32x4_t T1 = vextq_f32( v0, v1, 3 ); + float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + float32x4_t T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif 
defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A +( + const XMFLOAT4X3A* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = 
pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32_ex( &pSource->m[0][0], 128 ); + float32x4_t v1 = vld1q_f32_ex( &pSource->m[1][1], 128 ); + float32x4_t v2 = vld1q_f32_ex( &pSource->m[2][2], 128 ); + + float32x4_t T1 = vextq_f32( v0, v1, 3 ); + float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + float32x4_t T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4 +( + const XMFLOAT3X4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + 
M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2x4_t vTemp0 = vld4_f32(&pSource->_11); + float32x4_t vTemp1 = vld1q_f32(&pSource->_31); + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vandq_u32(T0, g_XMMask3); + M.r[1] = vandq_u32(T1, g_XMMask3); + M.r[2] = vandq_u32(T2, g_XMMask3); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps(&pSource->_11); + M.r[1] = _mm_loadu_ps(&pSource->_21); + M.r[2] = _mm_loadu_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, 
_MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A +( + const XMFLOAT3X4A* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128); + float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128); + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vandq_u32(T0, g_XMMask3); + M.r[1] = vandq_u32(T1, g_XMMask3); + M.r[2] = vandq_u32(T2, g_XMMask3); + M.r[3] = 
vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps(&pSource->_11); + M.r[1] = _mm_load_ps(&pSource->_21); + M.r[2] = _mm_load_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4 +( + const XMFLOAT4X4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + 
M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32( reinterpret_cast(&pSource->_11) ); + M.r[1] = vld1q_f32( reinterpret_cast(&pSource->_21) ); + M.r[2] = vld1q_f32( reinterpret_cast(&pSource->_31) ); + M.r[3] = vld1q_f32( reinterpret_cast(&pSource->_41) ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps( &pSource->_11 ); + M.r[1] = _mm_loadu_ps( &pSource->_21 ); + M.r[2] = _mm_loadu_ps( &pSource->_31 ); + M.r[3] = _mm_loadu_ps( &pSource->_41 ); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A +( + const XMFLOAT4X4A* pSource +) +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32_ex( reinterpret_cast(&pSource->_11), 128 ); + M.r[1] = vld1q_f32_ex( reinterpret_cast(&pSource->_21), 128 ); + M.r[2] = vld1q_f32_ex( reinterpret_cast(&pSource->_31), 128 ); + M.r[3] = vld1q_f32_ex( reinterpret_cast(&pSource->_41), 128 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX 
M; + M.r[0] = _mm_load_ps( &pSource->_11 ); + M.r[1] = _mm_load_ps( &pSource->_21 ); + M.r[2] = _mm_load_ps( &pSource->_31 ); + M.r[3] = _mm_load_ps( &pSource->_41 ); + return M; +#endif +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32( pDestination, *reinterpret_cast(&V), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( reinterpret_cast(pDestination), V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat +( + float* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32( pDestination, V, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( pDestination, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast(&pDestination[0]), V ); + _mm_store_ss( reinterpret_cast(&pDestination[1]), T ); +#endif +} + +//------------------------------------------------------------------------------ 
+_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32( reinterpret_cast(pDestination), VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast(pDestination), VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt2 +( + XMINT2* 
pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t v = vget_low_s32(V); + v = vcvt_s32_f32( v ); + vst1_s32( reinterpret_cast(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write two ints + XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast(&pDestination->y), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + uint32x2_t iv = vcvt_u32_f32( v ); + vst1_u32( reinterpret_cast(pDestination), iv ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write two uints + XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast(&pDestination->y), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); + vst1q_lane_u32( pDestination+2, *reinterpret_cast(&V), 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast(pDestination), V ); + _mm_store_ss( reinterpret_cast(&pDestination[1]), T1 ); + _mm_store_ss( reinterpret_cast(&pDestination[2]), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = 
V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); + vst1q_lane_u32( pDestination+2, *reinterpret_cast(&V), 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( reinterpret_cast(&pDestination[2]), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32( reinterpret_cast(pDestination), VL ); + vst1q_lane_f32( reinterpret_cast(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T1 ); + _mm_store_ss( &pDestination->z, T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast(pDestination), VL, 64 ); + vst1q_lane_f32( reinterpret_cast(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + 
XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( &pDestination->z, T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + int32x2_t vL = vget_low_s32(v); + vst1_s32( reinterpret_cast(pDestination), vL ); + vst1q_lane_s32( reinterpret_cast(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast(&pDestination->z), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + uint32x2_t vL = vget_low_u32(v); + vst1_u32( reinterpret_cast(pDestination), vL ); + vst1q_lane_u32( reinterpret_cast(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? + XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast(&pDestination->z), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32( pDestination, V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), 
_mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32_ex( pDestination, V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast(pDestination), V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->x, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast(pDestination), V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->x, V ); +#endif +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt4 +( + XMINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); + pDestination->w = static_cast(V.vector4_f32[3]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + vst1q_s32( reinterpret_cast(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); + pDestination->w = static_cast(V.vector4_f32[3]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + vst1q_u32( reinterpret_cast(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x3 +( + XMFLOAT3X3* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2)); + vTemp1 = 
_mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3 +( + XMFLOAT4X3* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32( &pDestination->m[2][2], T2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0)); + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0)); + 
vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32_ex( &pDestination->m[0][0], T2, 128 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32_ex( &pDestination->m[1][1], T2, 128 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32_ex( &pDestination->m[2][2], T2, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2)); + // 
y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0],vTemp1); + _mm_store_ps(&pDestination->m[1][1],vTemp2); + _mm_store_ps(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4 +( + XMFLOAT3X4* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + vst1q_f32(&pDestination->m[0][0], T0.val[0]); + vst1q_f32(&pDestination->m[1][0], T0.val[1]); + vst1q_f32(&pDestination->m[2][0], T1.val[0]); +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // 
z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_storeu_ps(&pDestination->m[0][0], r0); + _mm_storeu_ps(&pDestination->m[1][0], r1); + _mm_storeu_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4A +( + XMFLOAT3X4A* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + vst1q_f32_ex(&pDestination->m[0][0], T0.val[0], 128); + vst1q_f32_ex(&pDestination->m[1][0], T0.val[1], 128); + vst1q_f32_ex(&pDestination->m[2][0], T1.val[0], 128); +#elif defined(_XM_SSE_INTRINSICS_) + // 
x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_store_ps(&pDestination->m[0][0], r0); + _mm_store_ps(&pDestination->m[1][0], r1); + _mm_store_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast(&pDestination->_11), M.r[0] ); + vst1q_f32( 
reinterpret_cast(&pDestination->_21), M.r[1] ); + vst1q_f32( reinterpret_cast(&pDestination->_31), M.r[2] ); + vst1q_f32( reinterpret_cast(&pDestination->_41), M.r[3] ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->_11, M.r[0] ); + _mm_storeu_ps( &pDestination->_21, M.r[1] ); + _mm_storeu_ps( &pDestination->_31, M.r[2] ); + _mm_storeu_ps( &pDestination->_41, M.r[3] ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast(&pDestination->_11), M.r[0], 128 ); + vst1q_f32_ex( reinterpret_cast(&pDestination->_21), M.r[1], 128 ); + vst1q_f32_ex( reinterpret_cast(&pDestination->_31), M.r[2], 128 ); + vst1q_f32_ex( reinterpret_cast(&pDestination->_41), M.r[3], 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->_11, M.r[0] ); + _mm_store_ps( &pDestination->_21, M.r[1] ); + _mm_store_ps( &pDestination->_31, 
M.r[2] ); + _mm_store_ps( &pDestination->_41, M.r[3] ); +#endif +} + diff --git a/WickedEngine/Utility/DirectXMathMatrix.inl b/WickedEngine/Utility/DirectXMathMatrix.inl new file mode 100644 index 000000000..457689492 --- /dev/null +++ b/WickedEngine/Utility/DirectXMathMatrix.inl @@ -0,0 +1,3317 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMatrix.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is NaN +inline bool XM_CALLCONV XMMatrixIsNaN +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest<0x007FFFFFU) { + break; // NaN found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = vmvnq_u32(vceqq_f32(vX, vX)); + vY = vmvnq_u32(vceqq_f32(vY, vY)); + vZ = vmvnq_u32(vceqq_f32(vZ, 
vZ)); + vW = vmvnq_u32(vceqq_f32(vW, vW)); + // Or all the results + vX = vorrq_u32(vX,vZ); + vY = vorrq_u32(vY,vW); + vX = vorrq_u32(vX,vY); + // If any tested true, return true + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = _mm_cmpneq_ps(vX,vX); + vY = _mm_cmpneq_ps(vY,vY); + vZ = _mm_cmpneq_ps(vZ,vZ); + vW = _mm_cmpneq_ps(vW,vW); + // Or all the results + vX = _mm_or_ps(vX,vZ); + vY = _mm_or_ps(vY,vW); + vX = _mm_or_ps(vX,vY); + // If any tested true, return true + return (_mm_movemask_ps(vX)!=0); +#else +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is +/-INF +inline bool XM_CALLCONV XMMatrixIsInfinite +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if (uTest==0x7F800000U) { + break; // INF found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = vandq_u32(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = vandq_u32(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = vandq_u32(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = vandq_u32(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = vceqq_f32(vTemp1,g_XMInfinity); + vTemp2 = vceqq_f32(vTemp2,g_XMInfinity); + vTemp3 = vceqq_f32(vTemp3,g_XMInfinity); + vTemp4 = vceqq_f32(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = vorrq_u32(vTemp1,vTemp2); + vTemp3 = 
vorrq_u32(vTemp3,vTemp4); + vTemp1 = vorrq_u32(vTemp1,vTemp3); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1,vTemp2); + vTemp3 = _mm_or_ps(vTemp3,vTemp4); + vTemp1 = _mm_or_ps(vTemp1,vTemp3); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp1)!=0); +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if the XMMatrix is equal to identity +inline bool XM_CALLCONV XMMatrixIsIdentity +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + // Use the integer pipeline to reduce branching to a minimum + auto pWork = reinterpret_cast(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uint32_t uOne = pWork[0]^0x3F800000U; + // Or all the 0.0f entries together + uint32_t uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5]^0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10]^0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15]^0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries 
are 1.0f, then uOne==0 + uOne |= uZero; + return (uOne==0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vTemp1 = vceqq_f32(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = vceqq_f32(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = vceqq_f32(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = vceqq_f32(M.r[3],g_XMIdentityR3); + vTemp1 = vandq_u32(vTemp1,vTemp2); + vTemp3 = vandq_u32(vTemp3,vTemp4); + vTemp1 = vandq_u32(vTemp1,vTemp3); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return ( r == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + vTemp3 = _mm_and_ps(vTemp3,vTemp4); + vTemp1 = _mm_and_ps(vTemp1,vTemp3); + return (_mm_movemask_ps(vTemp1)==0x0f); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; + float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[0][3] = 
(M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + float32x2_t VL = vget_low_f32( M1.r[0] ); + float32x2_t VH = vget_high_f32( M1.r[0] ); + // Perform the operation on the first row + XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); + XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); + XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[0] = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + 
mResult.r[1] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[2] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[3] = vaddq_f32( vZ, vW ); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX mResult; + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 3); +#else + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[0] = vX; + // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 3); +#else + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = 
XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[1] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 3); +#else + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[2] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 3); +#else + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[3] = vX; + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if 
defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[1][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( M1.r[0] ); + float32x2_t VH = vget_high_f32( M1.r[0] ); + // Perform the operation on the first row + XMVECTOR vX = 
vmulq_lane_f32(M2.r[0], VL, 0); + XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); + XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r0 = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r1 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r2 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r3 = vaddq_f32( vZ, vW ); + + // Transpose result + float32x4x2_t P0 = vzipq_f32( r0, r2 ); + float32x4x2_t P1 = vzipq_f32( r1, r3 ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 3); +#else + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + 
XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + XMVECTOR r0 = vX; + // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 3); +#else + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + XMVECTOR r1 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 3); +#else + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + XMVECTOR r2 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 0); + vY = 
_mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 3); +#else + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + XMVECTOR r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranspose +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Original matrix: + // + // m00m01m02m03 + // m10m11m12m13 + // m20m21m22m23 + // m30m31m32m33 + + XMMATRIX P; + P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 + P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + XMMATRIX MT; + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = 
XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + return MT; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); + float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +// Return the inverse and the determinant of a 4x4 matrix +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMMatrixInverse +( + XMVECTOR* pDeterminant, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMMATRIX MT = XMMatrixTranspose(M); + + XMVECTOR V0[4], V1[4]; + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], 
MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); + XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = 
XMVectorMultiplyAdd(V0[0], V1[0], C0); + XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + XMMATRIX R; + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant != nullptr) + *pDeterminant = Determinant; + + XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); + + XMMATRIX Result; + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX MT = XMMatrixTranspose(M); + XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1)); + + XMVECTOR D0 = _mm_mul_ps(V00,V10); + XMVECTOR D1 = _mm_mul_ps(V01,V11); + XMVECTOR D2 = _mm_mul_ps(V02,V12); + + V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2)); + V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2)); + V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0)); + V02 = _mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1)); + V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0)); + + V00 = _mm_mul_ps(V00,V10); + V01 
= _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + D0 = _mm_sub_ps(D0,V00); + D1 = _mm_sub_ps(D1,V01); + D2 = _mm_sub_ps(D2,V02); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1)); + V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2)); + V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1)); + V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2)); + V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1)); + + XMVECTOR C0 = _mm_mul_ps(V00,V10); + XMVECTOR C2 = _mm_mul_ps(V01,V11); + XMVECTOR C4 = _mm_mul_ps(V02,V12); + XMVECTOR C6 = _mm_mul_ps(V03,V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2)); + V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3)); + V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2)); + V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3)); + V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + C0 = _mm_sub_ps(C0,V00); + C2 = _mm_sub_ps(C2,V01); + C4 = _mm_sub_ps(C4,V02); + C6 = _mm_sub_ps(C6,V03); + + V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2)); + V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = 
_mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0)); + V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3)); + V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2)); + V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0)); + V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + XMVECTOR C1 = _mm_sub_ps(C0,V00); + C0 = _mm_add_ps(C0,V00); + XMVECTOR C3 = _mm_add_ps(C2,V01); + C2 = _mm_sub_ps(C2,V01); + XMVECTOR C5 = _mm_sub_ps(C4,V02); + C4 = _mm_add_ps(C4,V02); + XMVECTOR C7 = _mm_add_ps(C6,V03); + C6 = _mm_sub_ps(C6,V03); + + C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0)); + C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0)); + C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0)); + C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0)); + C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0)); + C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0)); + C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0)); + C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0)); + // Get the determinate + XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]); + if (pDeterminant != nullptr) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne,vTemp); + XMMATRIX mResult; + mResult.r[0] = _mm_mul_ps(C0,vTemp); + mResult.r[1] = _mm_mul_ps(C2,vTemp); + mResult.r[2] = _mm_mul_ps(C4,vTemp); + mResult.r[3] = _mm_mul_ps(C6,vTemp); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMMatrixDeterminant +( + FXMMATRIX M +) +{ + static const XMVECTORF32 Sign = { 1.0f, -1.0f, 1.0f, -1.0f }; + + XMVECTOR V0 = XMVectorSwizzle(M.r[2]); + XMVECTOR V1 = XMVectorSwizzle(M.r[3]); + XMVECTOR V2 = XMVectorSwizzle(M.r[2]); + XMVECTOR V3 = XMVectorSwizzle(M.r[3]); + XMVECTOR V4 = 
XMVectorSwizzle(M.r[2]); + XMVECTOR V5 = XMVectorSwizzle(M.r[3]); + + XMVECTOR P0 = XMVectorMultiply(V0, V1); + XMVECTOR P1 = XMVectorMultiply(V2, V3); + XMVECTOR P2 = XMVectorMultiply(V4, V5); + + V0 = XMVectorSwizzle(M.r[2]); + V1 = XMVectorSwizzle(M.r[3]); + V2 = XMVectorSwizzle(M.r[2]); + V3 = XMVectorSwizzle(M.r[3]); + V4 = XMVectorSwizzle(M.r[2]); + V5 = XMVectorSwizzle(M.r[3]); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorSwizzle(M.r[1]); + V1 = XMVectorSwizzle(M.r[1]); + V2 = XMVectorSwizzle(M.r[1]); + + XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); + XMVECTOR R = XMVectorMultiply(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + return XMVector4Dot(S, R); +} + +#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ + if((x) < (y)) \ + { \ + if((y) < (z)) \ + { \ + (a) = 2; \ + (b) = 1; \ + (c) = 0; \ + } \ + else \ + { \ + (a) = 1; \ + \ + if((x) < (z)) \ + { \ + (b) = 2; \ + (c) = 0; \ + } \ + else \ + { \ + (b) = 0; \ + (c) = 2; \ + } \ + } \ + } \ + else \ + { \ + if((x) < (z)) \ + { \ + (a) = 2; \ + (b) = 0; \ + (c) = 1; \ + } \ + else \ + { \ + (a) = 0; \ + \ + if((y) < (z)) \ + { \ + (b) = 2; \ + (c) = 1; \ + } \ + else \ + { \ + (b) = 1; \ + (c) = 2; \ + } \ + } \ + } + +#define XM3_DECOMP_EPSILON 0.0001f + +_Use_decl_annotations_ +inline bool XM_CALLCONV XMMatrixDecompose +( + XMVECTOR *outScale, + XMVECTOR *outRotQuat, + XMVECTOR *outTrans, + FXMMATRIX M +) +{ + static const XMVECTOR *pvCanonicalBasis[3] = { + &g_XMIdentityR0.v, + &g_XMIdentityR1.v, + &g_XMIdentityR2.v + }; + + assert( outScale != nullptr ); + assert( outRotQuat != nullptr ); + assert( outTrans != nullptr ); + + // Get the translation + outTrans[0] = M.r[3]; + + XMVECTOR *ppvBasis[3]; + XMMATRIX matTemp; + ppvBasis[0] = &matTemp.r[0]; + ppvBasis[1] = &matTemp.r[1]; + ppvBasis[2] = &matTemp.r[2]; + + 
matTemp.r[0] = M.r[0]; + matTemp.r[1] = M.r[1]; + matTemp.r[2] = M.r[2]; + matTemp.r[3] = g_XMIdentityR3.v; + + auto pfScales = reinterpret_cast(outScale); + + size_t a, b, c; + XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0])); + XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0])); + XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0])); + pfScales[3] = 0.f; + + XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) + + if(pfScales[a] < XM3_DECOMP_EPSILON) + { + ppvBasis[a][0] = pvCanonicalBasis[a][0]; + } + ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); + + if(pfScales[b] < XM3_DECOMP_EPSILON) + { + size_t aa, bb, cc; + float fAbsX, fAbsY, fAbsZ; + + fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); + fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); + fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); + + XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) + + ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]); + } + + ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]); + + if(pfScales[c] < XM3_DECOMP_EPSILON) + { + ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]); + } + + ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); + + float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); + + // use Kramer's rule to check for handedness of coordinate system + if(fDet < 0.0f) + { + // switch coordinate system by negating the scale and inverting the basis vector on the x-axis + pfScales[a] = -pfScales[a]; + ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); + + fDet = -fDet; + } + + fDet -= 1.0f; + fDet *= fDet; + + if(XM3_DECOMP_EPSILON < fDet) + { + // Non-SRT matrix encountered + return false; + } + + // generate the quaternion from the matrix + outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); + return true; +} + +#undef XM3_DECOMP_EPSILON +#undef XM3RANKDECOMPOSE + +//------------------------------------------------------------------------------ +// Transformation operations 
+//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixIdentity() +{ + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixSet +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) +{ + XMMATRIX M; +#if defined(_XM_NO_INTRINSICS_) + M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03; + M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13; + M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23; + M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33; +#else + M.r[0] = XMVectorSet(m00, m01, m02, m03); + M.r[1] = XMVectorSet(m10, m11, m12, m13); + M.r[2] = XMVectorSet(m20, m21, m22, m23); + M.r[3] = XMVectorSet(m30, m31, m32, m33); +#endif + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslation +( + float OffsetX, + float OffsetY, + float OffsetZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = 
XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f ); + return M; +#endif +} + + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector +( + FXMVECTOR Offset +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSelect( g_XMIdentityR3.v, Offset, g_XMSelect1110.v ); + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScaling +( + float ScaleX, + float ScaleY, + float ScaleZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = ScaleX; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ScaleY; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = ScaleZ; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ScaleX, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ScaleY, Zero, 1 ); + M.r[2] = vsetq_lane_f32( ScaleZ, Zero, 2 ); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX ); + M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 ); + M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 ); + M.r[3] = 
g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector +( + FXMVECTOR Scale +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vandq_u32(Scale,g_XMMaskX); + M.r[1] = vandq_u32(Scale,g_XMMaskY); + M.r[2] = vandq_u32(Scale,g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale,g_XMMaskX); + M.r[1] = _mm_and_ps(Scale,g_XMMaskY); + M.r[2] = _mm_and_ps(Scale,g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationX +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T1 = vsetq_lane_f32( fCosAngle, Zero, 1 ); + T1 = vsetq_lane_f32( fSinAngle, T1, 
2 ); + + XMVECTOR T2 = vsetq_lane_f32( -fSinAngle, Zero, 1 ); + T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); + + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = T1; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos,g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationY +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); + T0 = vsetq_lane_f32( -fSinAngle, T0, 2 ); + + XMVECTOR T2 = vsetq_lane_f32( fSinAngle, Zero, 0 ); + T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float 
CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2)); + // x = cos,y = 0,z = -sin, w = 0 + vSin = _mm_mul_ps(vSin,g_XMNegateZ); + M.r[0] = vSin; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationZ +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = fSinAngle; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = -fSinAngle; + M.m[1][1] = fCosAngle; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); + T0 = vsetq_lane_f32( fSinAngle, T0, 1 ); + + XMVECTOR T1 = vsetq_lane_f32( -fSinAngle, Zero, 0 ); + T1 = vsetq_lane_f32( fCosAngle, T1, 1 ); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = T1; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = cos,y = sin,z = 0, w = 0 + vCos = _mm_unpacklo_ps(vCos,vSin); + XMMATRIX M; + M.r[0] = vCos; + // x = sin,y = cos,z = 0, w = 0 + vCos = 
XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1)); + // x = cos,y = -sin,z = 0, w = 0 + vCos = _mm_mul_ps(vCos,g_XMNegateX); + M.r[1] = vCos; + M.r[2] = g_XMIdentityR2; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) +{ + XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); + return XMMatrixRotationRollPitchYawFromVector(Angles); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector +( + FXMVECTOR Angles // +) +{ + XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles); + return XMMatrixRotationQuaternion(Q); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal +( + FXMVECTOR NormalAxis, + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f); + + XMVECTOR C2 = XMVectorSplatZ(A); + XMVECTOR C1 = XMVectorSplatY(A); + XMVECTOR C0 = XMVectorSplatX(A); + + XMVECTOR N0 = XMVectorSwizzle(NormalAxis); + XMVECTOR N1 = XMVectorSwizzle(NormalAxis); + + XMVECTOR V0 = XMVectorMultiply(C2, N0); + V0 = XMVectorMultiply(V0, N1); + + XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis); + R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1); + + XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0); + XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0); + + V0 = XMVectorSelect(A, R0, g_XMSelect1110.v); + XMVECTOR V1 = XMVectorPermute(R1, R2); + XMVECTOR V2 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(V0, V1); + M.r[1] = XMVectorPermute(V0, V1); + M.r[2] = XMVectorPermute(V0, V2); + 
M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle); + XMVECTOR C1 = _mm_set_ps1(fCosAngle); + XMVECTOR C0 = _mm_set_ps1(fSinAngle); + + XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1)); + XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2)); + + XMVECTOR V0 = _mm_mul_ps(C2, N0); + V0 = _mm_mul_ps(V0, N1); + + XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis); + R0 = _mm_mul_ps(R0, NormalAxis); + R0 = _mm_add_ps(R0, C1); + + XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis); + R1 = _mm_add_ps(R1, V0); + XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis); + R2 = _mm_sub_ps(V0,R2); + + V0 = _mm_and_ps(R0,g_XMMask3); + XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0)); + V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1)); + XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1)); + V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0)); + + R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0)); + R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0)); + + XMMATRIX M; + M.r[0] = R2; + + R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1)); + R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2)); + M.r[1] = R2; + + V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0)); + M.r[2] = V2; + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis +( + FXMVECTOR Axis, + float Angle +) +{ + assert(!XMVector3Equal(Axis, XMVectorZero())); + assert(!XMVector3IsInfinite(Axis)); + + XMVECTOR Normal = XMVector3Normalize(Axis); + return XMMatrixRotationNormal(Normal, Angle); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion +( + FXMVECTOR Quaternion +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 
Constant1110 = { 1.0f, 1.0f, 1.0f, 0.0f }; + + XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion); + XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0); + + XMVECTOR V0 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR V1 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR R0 = XMVectorSubtract(Constant1110, V0); + R0 = XMVectorSubtract(R0, V1); + + V0 = XMVectorSwizzle(Quaternion); + V1 = XMVectorSwizzle(Q0); + V0 = XMVectorMultiply(V0, V1); + + V1 = XMVectorSplatW(Quaternion); + XMVECTOR V2 = XMVectorSwizzle(Q0); + V1 = XMVectorMultiply(V1, V2); + + XMVECTOR R1 = XMVectorAdd(V0, V1); + XMVECTOR R2 = XMVectorSubtract(V0, V1); + + V0 = XMVectorPermute(R1, R2); + V1 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(R0, V0); + M.r[1] = XMVectorPermute(R0, V0); + M.r[2] = XMVectorPermute(R0, V1); + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Constant1110 = { 1.0f, 1.0f, 1.0f, 0.0f }; + + XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion); + XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0); + + XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1)); + V0 = _mm_and_ps(V0,g_XMMask3); + XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2)); + V1 = _mm_and_ps(V1,g_XMMask3); + XMVECTOR R0 = _mm_sub_ps(Constant1110,V0); + R0 = _mm_sub_ps(R0, V1); + + V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0)); + V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2)); + V0 = _mm_mul_ps(V0, V1); + + V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3)); + XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1)); + V1 = _mm_mul_ps(V1, V2); + + XMVECTOR R1 = _mm_add_ps(V0, V1); + XMVECTOR R2 = _mm_sub_ps(V0, V1); + + V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1)); + V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0)); + V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0)); + V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0)); + + Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0)); + Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0)); + + XMMATRIX M; + M.r[0] = Q1; + + Q1 = 
_mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1)); + Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2)); + M.r[1] = Q1; + + Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0)); + M.r[2] = Q1; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D +( + FXMVECTOR ScalingOrigin, + float ScalingOrientation, + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + GXMVECTOR Translation +) +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV 
XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, + HXMVECTOR RotationQuaternion, + HXMVECTOR Translation +) +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + FXMVECTOR Translation +) +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = 
XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + GXMVECTOR Translation +) +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixReflect +( + FXMVECTOR ReflectionPlane +) +{ + assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ReflectionPlane)); + + static const XMVECTORF32 NegativeTwo = { -2.0f, -2.0f, -2.0f, 0.0f }; + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = XMVectorMultiply(P, NegativeTwo); + + XMVECTOR A = XMVectorSplatX(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR D = XMVectorSplatW(P); + 
+ XMMATRIX M; + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) +{ + static const XMVECTORU32 Select0001 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1 }; + + assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ShadowPlane)); + + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + XMVECTOR D = XMVectorSplatW(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + + XMMATRIX M; + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) +{ + XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + 
FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ + assert(!XMVector3Equal(EyeDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(EyeDirection)); + assert(!XMVector3Equal(UpDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + XMVECTOR R1 = XMVector3Cross(R2, R0); + + XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); + + XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); + + XMMATRIX M; + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +#ifdef _MSC_VER +# ifdef _PREFAST_ +# pragma prefast(push) +# pragma prefast(disable:28931, "PREfast noise: Esp:1266") +# endif +#endif + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + 
assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = 
_mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = 
_mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif 
defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] 
= fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 
0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 
0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = 
XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + 
return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ-NearZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + 
M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ-FarZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - 
ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + 
vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float 
fReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+    float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+    float fRange = 1.0f / (NearZ-FarZ);
+    // Note: This is recorded on the stack
+    XMVECTOR rMem = {
+        fReciprocalWidth,
+        fReciprocalHeight,
+        fRange,
+        1.0f
+    };
+    XMVECTOR rMem2 = {
+        -(ViewLeft + ViewRight),
+        -(ViewTop + ViewBottom),
+        NearZ,
+        1.0f
+    };
+    // Copy from memory to SSE register
+    XMVECTOR vValues = rMem;
+    XMVECTOR vTemp = _mm_setzero_ps();
+    // Copy x only
+    vTemp = _mm_move_ss(vTemp,vValues);
+    // fReciprocalWidth*2,0,0,0
+    vTemp = _mm_add_ss(vTemp,vTemp);
+    M.r[0] = vTemp;
+    // 0,fReciprocalHeight*2,0,0
+    vTemp = vValues;
+    vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+    vTemp = _mm_add_ps(vTemp,vTemp);
+    M.r[1] = vTemp;
+    // 0,0,fRange,0.0f
+    vTemp = vValues;
+    vTemp = _mm_and_ps(vTemp,g_XMMaskZ);
+    M.r[2] = vTemp;
+    // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f
+    vValues = _mm_mul_ps(vValues,rMem2);
+    M.r[3] = vValues;
+    return M;
+#endif
+}
+
+#ifdef _MSC_VER
+#   ifdef _PREFAST_
+#       pragma prefast(pop)
+#   endif
+#endif
+
+/****************************************************************************
+ *
+ * XMMATRIX operators and methods
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Builds the matrix from 16 scalars: m00..m03 fill r[0], m10..m13 fill r[1], and so on.
+inline XMMATRIX::XMMATRIX
+(
+    float m00, float m01, float m02, float m03,
+    float m10, float m11, float m12, float m13,
+    float m20, float m21, float m22, float m23,
+    float m30, float m31, float m32, float m33
+)
+{
+    r[0] = XMVectorSet(m00, m01, m02, m03);
+    r[1] = XMVectorSet(m10, m11, m12, m13);
+    r[2] = XMVectorSet(m20, m21, m22, m23);
+    r[3] = XMVectorSet(m30, m31, m32, m33);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX::XMMATRIX
+(
+    const float* pArray // 16 floats, read as four consecutive 4-float rows (offsets 0, 4, 8, 12)
+)
+{
+    assert( pArray != nullptr );
+    r[0] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)); // XMLoadFloat4 takes a const XMFLOAT4*
+    r[1] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 4));
+    r[2] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 8));
+    r[3] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 12));
+}
+
+//------------------------------------------------------------------------------
+// Unary minus: returns a matrix whose four rows are the negated rows of this one.
+inline XMMATRIX XMMATRIX::operator- () const
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorNegate( r[0] );
+    R.r[1] = XMVectorNegate( r[1] );
+    R.r[2] = XMVectorNegate( r[2] );
+    R.r[3] = XMVectorNegate( r[3] );
+    return R;
+}
+
+//------------------------------------------------------------------------------
+// Adds M to this matrix row by row, in place, and returns *this.
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M)
+{
+    r[0] = XMVectorAdd( r[0], M.r[0] );
+    r[1] = XMVectorAdd( r[1], M.r[1] );
+    r[2] = XMVectorAdd( r[2], M.r[2] );
+    r[3] = XMVectorAdd( r[3], M.r[3] );
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+// Subtracts M from this matrix row by row, in place, and returns *this.
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M)
+{
+    r[0] = XMVectorSubtract( r[0], M.r[0] );
+    r[1] = XMVectorSubtract( r[1], M.r[1] );
+    r[2] = XMVectorSubtract( r[2], M.r[2] );
+    r[3] = XMVectorSubtract( r[3], M.r[3] );
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+// Replaces this matrix with XMMatrixMultiply(*this, M) and returns *this.
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M)
+{
+    *this = XMMatrixMultiply( *this, M );
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+// Scales every row by the scalar S, in place, and returns *this.
+inline XMMATRIX& XMMATRIX::operator*= (float S)
+{
+    r[0] = XMVectorScale( r[0], S );
+    r[1] = XMVectorScale( r[1], S );
+    r[2] = XMVectorScale( r[2], S );
+    r[3] = XMVectorScale( r[3], S );
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+// Divides every row by S in place; the non-ARM64 NEON path multiplies by a refined reciprocal estimate instead.
+inline XMMATRIX& XMMATRIX::operator/= (float S)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vS = XMVectorReplicate( S );
+    r[0] = XMVectorDivide( r[0], vS );
+    r[1] = XMVectorDivide( r[1], vS );
+    r[2] = XMVectorDivide( r[2], vS );
+    r[3] = XMVectorDivide( r[3], vS );
+    return *this;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+    float32x4_t vS = vdupq_n_f32( S );
+    r[0] = vdivq_f32( r[0], vS );
+    r[1] = vdivq_f32( r[1], vS );
+    r[2] = vdivq_f32( r[2], vS );
+    r[3] = vdivq_f32( r[3], vS );
+#else
+    // 2 iterations of Newton-Raphson refinement of reciprocal
+    float32x2_t vS = vdup_n_f32( S );
+    float32x2_t R0 = vrecpe_f32( vS );
+    float32x2_t S0 = vrecps_f32( R0, vS );
+    R0 = vmul_f32( S0, R0 );
+    S0 = vrecps_f32( R0, vS );
+    R0 = vmul_f32( S0, R0 );
+    float32x4_t Reciprocal = vcombine_f32(R0, R0); // R0 is float32x2_t; the u32 combine mismatches on GCC/Clang
+    r[0] = vmulq_f32( r[0], Reciprocal );
+    r[1] = vmulq_f32( r[1], Reciprocal );
+    r[2] = vmulq_f32( r[2], Reciprocal );
+    r[3] = vmulq_f32( r[3], Reciprocal );
+#endif
+    return *this;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 vS = _mm_set_ps1( S );
+    r[0] = _mm_div_ps( r[0], vS );
+    r[1] = _mm_div_ps( r[1], vS );
+    r[2] = _mm_div_ps( r[2], vS );
+    r[3] = _mm_div_ps( r[3], vS );
+    return *this;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Returns the row-by-row sum of this matrix and M.
+inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorAdd( r[0], M.r[0] );
+    R.r[1] = XMVectorAdd( r[1], M.r[1] );
+    R.r[2] = XMVectorAdd( r[2], M.r[2] );
+    R.r[3] = XMVectorAdd( r[3], M.r[3] );
+    return R;
+}
+
+//------------------------------------------------------------------------------
+// Returns the row-by-row difference of this matrix and M.
+inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorSubtract( r[0], M.r[0] );
+    R.r[1] = XMVectorSubtract( r[1], M.r[1] );
+    R.r[2] = XMVectorSubtract( r[2], M.r[2] );
+    R.r[3] = XMVectorSubtract( r[3], M.r[3] );
+    return R;
+}
+
+//------------------------------------------------------------------------------
+// Returns XMMatrixMultiply(*this, M).
+inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const
+{
+    return XMMatrixMultiply(*this, M);
+}
+
+//------------------------------------------------------------------------------
+// Returns a copy of this matrix with every row scaled by S.
+inline XMMATRIX XMMATRIX::operator* (float S) const
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorScale( r[0], S );
+    R.r[1] = XMVectorScale( r[1], S );
+    R.r[2] = XMVectorScale( r[2], S );
+    R.r[3] = XMVectorScale( r[3], S );
+    return R;
+}
+
+//------------------------------------------------------------------------------
+// Returns this matrix divided by S; the non-ARM64 NEON path multiplies by a refined reciprocal estimate instead.
+inline XMMATRIX XMMATRIX::operator/ (float S) const
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vS = XMVectorReplicate( S );
+    XMMATRIX R;
+    R.r[0] = XMVectorDivide( r[0], vS );
+    R.r[1] = XMVectorDivide( r[1], vS );
+    R.r[2] = XMVectorDivide( r[2], vS );
+    R.r[3] = XMVectorDivide( r[3], vS );
+    return R;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+    float32x4_t vS = vdupq_n_f32( S );
+    XMMATRIX R;
+    R.r[0] = vdivq_f32( r[0], vS );
+    R.r[1] = vdivq_f32( r[1], vS );
+    R.r[2] = vdivq_f32( r[2], vS );
+    R.r[3] = vdivq_f32( r[3], vS );
+#else
+    // 2 iterations of Newton-Raphson refinement of reciprocal
+    float32x2_t vS = vdup_n_f32( S );
+    float32x2_t R0 = vrecpe_f32( vS );
+    float32x2_t S0 = vrecps_f32( R0, vS );
+    R0 = vmul_f32( S0, R0 );
+    S0 = vrecps_f32( R0, vS );
+    R0 = vmul_f32( S0, R0 );
+    float32x4_t Reciprocal = vcombine_f32(R0, R0); // R0 is float32x2_t; the u32 combine mismatches on GCC/Clang
+    XMMATRIX R;
+    R.r[0] = vmulq_f32( r[0], Reciprocal );
+    R.r[1] = vmulq_f32( r[1], Reciprocal );
+    R.r[2] = vmulq_f32( r[2], Reciprocal );
+    R.r[3] = vmulq_f32( r[3], Reciprocal );
+#endif
+    return R;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 vS = _mm_set_ps1( S );
+    XMMATRIX R;
+    R.r[0] = _mm_div_ps( r[0], vS );
+    R.r[1] = _mm_div_ps( r[1], vS );
+    R.r[2] = _mm_div_ps( r[2], vS );
+    R.r[3] = _mm_div_ps( r[3], vS );
+    return R;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Scalar-on-the-left product: returns M with every row scaled by S.
+inline XMMATRIX XM_CALLCONV operator*
+(
+    float S,
+    FXMMATRIX M
+)
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorScale( M.r[0], S );
+    R.r[1] = XMVectorScale( M.r[1], S );
+    R.r[2] = XMVectorScale( M.r[2], S );
+    R.r[3] = XMVectorScale( M.r[3], S );
+    return R;
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3X3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT3X3::XMFLOAT3X3
+(
+    const float* pArray // 9 floats; element (Row, Column) is read from pArray[Row * 3 + Column]
+)
+{
+    assert( pArray != nullptr );
+    for (size_t Row = 0; Row < 3; Row++)
+    {
+        for (size_t Column = 0; Column < 3; Column++)
+        {
+            m[Row][Column] = pArray[Row * 3 + Column];
+        }
+    }
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT4X3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT4X3::XMFLOAT4X3
+(
+    const float* pArray // 12 floats: four rows of three, stored row after row
+)
+{
+    assert( pArray != nullptr );
+
+    m[0][0] = pArray[0];
+    m[0][1] = pArray[1];
+    m[0][2] = pArray[2];
+
+    m[1][0] = pArray[3];
+    m[1][1] = pArray[4];
+    m[1][2] = pArray[5];
+
+    m[2][0] = pArray[6];
+    m[2][1] = pArray[7];
+    m[2][2] = pArray[8];
+
+    m[3][0] = pArray[9];
+    m[3][1] = pArray[10];
+    m[3][2] = pArray[11];
+}
+
+/****************************************************************************
+*
+* XMFLOAT3X4 operators
+*
+****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT3X4::XMFLOAT3X4
+(
+    const float* pArray // 12 floats: three rows of four, stored row after row
+)
+{
+    assert(pArray != nullptr);
+
+    m[0][0] = pArray[0];
+    m[0][1] = pArray[1];
+    m[0][2] = pArray[2];
+    m[0][3] = pArray[3];
+
+    m[1][0] = pArray[4];
+    m[1][1] = pArray[5];
+    m[1][2] = pArray[6];
+    m[1][3] = pArray[7];
+
+    m[2][0] = pArray[8];
+    m[2][1] = pArray[9];
+    m[2][2] = pArray[10];
+    m[2][3] = pArray[11];
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT4X4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT4X4::XMFLOAT4X4
+(
+    const float* pArray // 16 floats: four rows of four, stored row after row
+)
+{
+    assert( pArray != nullptr );
+
+    m[0][0] = pArray[0];
+    m[0][1] = pArray[1];
+    m[0][2] = pArray[2];
+    m[0][3] = pArray[3];
+
+    m[1][0] = pArray[4];
+    m[1][1] = pArray[5];
+    m[1][2] = pArray[6];
+    m[1][3] = pArray[7];
+
+    m[2][0] = pArray[8];
+    m[2][1] = pArray[9];
+    m[2][2] = pArray[10];
+    m[2][3] = pArray[11];
+
+    m[3][0] = pArray[12];
+    m[3][1] = pArray[13];
+    m[3][2] = pArray[14];
+    m[3][3] = pArray[15];
+}
+
diff --git a/WickedEngine/Utility/DirectXMathMisc.inl b/WickedEngine/Utility/DirectXMathMisc.inl
new file mode 100644
index 000000000..a805d8c54
--- /dev/null
+++ b/WickedEngine/Utility/DirectXMathMisc.inl
@@ -0,0 +1,2516 @@
+//-------------------------------------------------------------------------------------
+// DirectXMathMisc.inl -- SIMD C++ Math library
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Quaternion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Quaternion equality test; forwards both operands to XMVector4Equal.
+inline bool XM_CALLCONV XMQuaternionEqual
+(
+    FXMVECTOR Q1,
+    FXMVECTOR Q2
+)
+{
+    return XMVector4Equal(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+// Quaternion inequality test; forwards both operands to XMVector4NotEqual.
+inline bool XM_CALLCONV XMQuaternionNotEqual
+(
+    FXMVECTOR Q1,
+    FXMVECTOR Q2
+)
+{
+    return XMVector4NotEqual(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+// NaN check on a quaternion; forwards to XMVector4IsNaN.
+inline bool XM_CALLCONV XMQuaternionIsNaN
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4IsNaN(Q);
+}
+
+//------------------------------------------------------------------------------
+// Infinity check on a quaternion; forwards to XMVector4IsInfinite.
+inline bool XM_CALLCONV XMQuaternionIsInfinite
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4IsInfinite(Q);
+}
+
+//------------------------------------------------------------------------------
+// Returns whether Q compares equal (XMVector4Equal) to the constant g_XMIdentityR3.
+inline bool XM_CALLCONV XMQuaternionIsIdentity
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4Equal(Q, g_XMIdentityR3.v);
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Quaternion dot product; forwards both operands to XMVector4Dot and returns its vector result.
+inline XMVECTOR XM_CALLCONV XMQuaternionDot
+(
+    FXMVECTOR Q1,
+    FXMVECTOR Q2
+)
+{
+    return XMVector4Dot(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+// Quaternion product Q2*Q1; all three code paths implement the component formula written out below.
+inline XMVECTOR XM_CALLCONV XMQuaternionMultiply
+(
+    FXMVECTOR Q1,
+    FXMVECTOR Q2
+)
+{
+    // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2)
+
+    // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y),
+    //   (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x),
+    //   (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w),
+    //   (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ]
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = {
+        (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]),
+        (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]),
+        (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]),
+        (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2])
+    };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ControlWZYX = { 1.0f, -1.0f, 1.0f, -1.0f };
+    static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f, -1.0f, -1.0f };
+    static const XMVECTORF32 ControlYXWZ = { -1.0f, 1.0f, 1.0f, -1.0f };
+
+    float32x2_t Q2L = vget_low_f32(Q2);
+    float32x2_t Q2H = vget_high_f32(Q2);
+
+    float32x4_t Q2X = vdupq_lane_f32( Q2L, 0 );
+    float32x4_t Q2Y = vdupq_lane_f32( Q2L, 1 );
+    float32x4_t Q2Z = vdupq_lane_f32( Q2H, 0 );
+    XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1);
+
+    // Mul by Q1WZYX
+    float32x4_t vTemp = vrev64q_f32(Q1);
+    vTemp = vcombine_f32( vget_high_f32(vTemp), vget_low_f32(vTemp) );
+    Q2X = vmulq_f32(Q2X,vTemp);
+    vResult = vmlaq_f32( vResult, Q2X, ControlWZYX );
+
+    // Mul by Q1ZWXY
+    vTemp = vrev64q_f32(vTemp); // vTemp is float32x4_t: same lane swap as the u32 form, without the type mismatch
+    Q2Y = vmulq_f32(Q2Y,vTemp);
+    vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);
+
+    // Mul by Q1YXWZ
+    vTemp = vrev64q_f32(vTemp); // f32 variant for the same reason as above
+    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
+    Q2Z = vmulq_f32(Q2Z,vTemp);
+    vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ControlWZYX = { 1.0f, -1.0f, 1.0f, -1.0f };
+    static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f, -1.0f, -1.0f };
+    static const XMVECTORF32 ControlYXWZ = { -1.0f, 1.0f, 1.0f, -1.0f };
+    // Copy to SSE registers and use as few as possible for x86
+    XMVECTOR Q2X = Q2;
+    XMVECTOR Q2Y = Q2;
+    XMVECTOR Q2Z = Q2;
+    XMVECTOR vResult = Q2;
+    // Splat with one instruction
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3));
+    Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0));
+    Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1));
+    Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2));
+    // Retire Q1 and perform Q1*Q2W
+    vResult = _mm_mul_ps(vResult,Q1);
+    XMVECTOR Q1Shuffle = Q1;
+    // Shuffle the copies of Q1
+    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
+    // Mul by Q1WZYX
+    Q2X = _mm_mul_ps(Q2X,Q1Shuffle);
+    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
+    // Flip the signs on y and w
+    Q2X = _mm_mul_ps(Q2X,ControlWZYX);
+    // Mul by Q1ZWXY
+    Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle);
+    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
+    // Flip the signs on z and w
+    Q2Y = _mm_mul_ps(Q2Y,ControlZWXY);
+    // Mul by Q1YXWZ
+    Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle);
+    vResult = _mm_add_ps(vResult,Q2X);
+    // Flip the signs on x and w
+    Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ);
+    Q2Y = _mm_add_ps(Q2Y,Q2Z);
+    vResult = _mm_add_ps(vResult,Q2Y);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Squared length of Q; forwards to XMVector4LengthSq.
+inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4LengthSq(Q);
+}
+
+//------------------------------------------------------------------------------
+// Reciprocal of the length of Q; forwards to XMVector4ReciprocalLength.
+inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4ReciprocalLength(Q);
+}
+
+//------------------------------------------------------------------------------
+// Length of Q; forwards to XMVector4Length.
+inline XMVECTOR XM_CALLCONV XMQuaternionLength
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4Length(Q);
+}
+
+//------------------------------------------------------------------------------
+// Estimated normalization of Q; forwards to XMVector4NormalizeEst.
+inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4NormalizeEst(Q);
+}
+
+//------------------------------------------------------------------------------
+// Normalization of Q; forwards to XMVector4Normalize.
+inline XMVECTOR XM_CALLCONV XMQuaternionNormalize
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4Normalize(Q);
+}
+
+//------------------------------------------------------------------------------
+// Conjugate of Q: negates x, y and z and leaves w unchanged.
+inline XMVECTOR XM_CALLCONV XMQuaternionConjugate
+(
+    FXMVECTOR Q
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = {
+        -Q.vector4_f32[0],
+        -Q.vector4_f32[1],
+        -Q.vector4_f32[2],
+        Q.vector4_f32[3]
+    };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 NegativeOne3 = { -1.0f, -1.0f, -1.0f, 1.0f };
+    return vmulq_f32(Q, NegativeOne3.v );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 NegativeOne3 = { -1.0f, -1.0f, -1.0f, 1.0f };
+    return _mm_mul_ps(Q,NegativeOne3);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Inverse of Q: conjugate divided by squared length; lanes whose squared length is <= g_XMEpsilon become zero.
+inline XMVECTOR XM_CALLCONV XMQuaternionInverse
+(
+    FXMVECTOR Q
+)
+{
+    const XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR L = XMVector4LengthSq(Q);
+    XMVECTOR Conjugate = XMQuaternionConjugate(Q);
+
+    XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v);
+
+    XMVECTOR Result = XMVectorDivide(Conjugate, L);
+
+    Result = XMVectorSelect(Result, Zero, Control);
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+// Quaternion log: xyz scaled by acos(w)/sin(acos(w)) while w is within +/-(1 - 0.00001), else xyz unscaled.
+inline XMVECTOR XM_CALLCONV XMQuaternionLn
+(
+    FXMVECTOR Q
+)
+{
+    static const XMVECTORF32 OneMinusEpsilon = { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f };
+
+    XMVECTOR QW = XMVectorSplatW(Q);
+    XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); // x, y, z from Q; w comes from the select mask itself
+
+    XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v);
+
+    XMVECTOR Theta = XMVectorACos(QW);
+    XMVECTOR SinTheta = XMVectorSin(Theta);
+
+    XMVECTOR S = XMVectorDivide(Theta,SinTheta);
+
+    XMVECTOR Result = XMVectorMultiply(Q0, S);
+    Result = XMVectorSelect(Q0, Result, ControlW); // scaled value only where w passed the bounds test
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+// Quaternion exp: Theta = |Q.xyz|, xyz scaled by sin(Theta)/Theta (unscaled when Theta is near zero), w = cos(Theta).
+inline XMVECTOR XM_CALLCONV XMQuaternionExp
+(
+    FXMVECTOR Q
+)
+{
+    XMVECTOR Theta = XMVector3Length(Q);
+
+    XMVECTOR SinTheta, CosTheta;
+    XMVectorSinCos(&SinTheta, &CosTheta, Theta);
+
+    XMVECTOR S = XMVectorDivide(SinTheta, Theta);
+
+    XMVECTOR Result = XMVectorMultiply(Q, S);
+
+    const XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v);
+    Result = XMVectorSelect(Result, Q, Control); // near-zero Theta: keep Q rather than the 0/0 quotient
+
+    Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); // keep x, y, z; take w from CosTheta
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+// Scalar-parameter slerp: replicates t into a vector and calls XMQuaternionSlerpV.
+inline XMVECTOR XM_CALLCONV XMQuaternionSlerp
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    float t
+)
+{
+    XMVECTOR T = XMVectorReplicate(t);
+    return XMQuaternionSlerpV(Q0, Q1, T);
+}
+
+//------------------------------------------------------------------------------
+// Spherical interpolation between Q0 and Q1; T must hold the same value in all four lanes (asserted).
+inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR T
+)
+{
+    assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));
+
+    // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    const XMVECTORF32 OneMinusEpsilon = { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f };
+
+    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);
+
+    const XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
+    XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); // -1 where the dot product is negative, else +1
+
+    CosOmega = XMVectorMultiply(CosOmega, Sign);
+
+    Control = XMVectorLess(CosOmega, OneMinusEpsilon);
+
+    XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v);
+    SinOmega = XMVectorSqrt(SinOmega);
+
+    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
+
+    XMVECTOR SignMask = XMVectorSplatSignMask();
+    XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2);
+    SignMask = XMVectorShiftLeft(SignMask, Zero, 3);
+    V01 = XMVectorXorInt(V01, SignMask);
+    V01 = XMVectorAdd(g_XMIdentityR0.v, V01); // V01 ends up as (1 - t, t, 0, 0)
+
+    XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega);
+
+    XMVECTOR S0 = XMVectorMultiply(V01, Omega);
+    S0 = XMVectorSin(S0);
+    S0 = XMVectorMultiply(S0, InvSinOmega);
+
+    S0 = XMVectorSelect(V01, S0, Control); // when CosOmega >= 1 - epsilon the plain weights (1 - t, t) are used
+
+    XMVECTOR S1 = XMVectorSplatY(S0);
+    S0 = XMVectorSplatX(S0);
+
+    S1 = XMVectorMultiply(S1, Sign);
+
+    XMVECTOR Result = XMVectorMultiply(Q0, S0);
+    Result = XMVectorMultiplyAdd(Q1, S1, Result);
+
+    return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 OneMinusEpsilon = { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f };
+    static const XMVECTORU32 SignMask2 = { 0x80000000, 0x00000000, 0x00000000, 0x00000000 };
+
+    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);
+
+    const XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
+    XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); // -1 where the dot product is negative, else +1
+
+    CosOmega = _mm_mul_ps(CosOmega, Sign);
+
+    Control = XMVectorLess(CosOmega, OneMinusEpsilon);
+
+    XMVECTOR SinOmega = _mm_mul_ps(CosOmega,CosOmega);
+    SinOmega = _mm_sub_ps(g_XMOne,SinOmega);
+    SinOmega = _mm_sqrt_ps(SinOmega);
+
+    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
+
+    XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1));
+    V01 = _mm_and_ps(V01,g_XMMaskXY);
+    V01 = _mm_xor_ps(V01,SignMask2);
+    V01 = _mm_add_ps(g_XMIdentityR0, V01); // V01 ends up as (1 - t, t, 0, 0)
+
+    XMVECTOR S0 = _mm_mul_ps(V01, Omega);
+    S0 = XMVectorSin(S0);
+    S0 = _mm_div_ps(S0, SinOmega);
+
+    S0 = XMVectorSelect(V01, S0, Control); // when CosOmega >= 1 - epsilon the plain weights (1 - t, t) are used
+
+    XMVECTOR S1 = XMVectorSplatY(S0);
+    S0 = XMVectorSplatX(S0);
+
+    S1 = _mm_mul_ps(S1, Sign);
+    XMVECTOR Result = _mm_mul_ps(Q0, S0);
+    S1 = _mm_mul_ps(S1, Q1);
+    Result = _mm_add_ps(Result,S1);
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Scalar-parameter squad: replicates t into a vector and calls XMQuaternionSquadV.
+inline XMVECTOR XM_CALLCONV XMQuaternionSquad
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    GXMVECTOR Q3,
+    float t
+)
+{
+    XMVECTOR T = XMVectorReplicate(t);
+    return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T);
+}
+
+//------------------------------------------------------------------------------
+// Squad: Slerp(Slerp(Q0, Q3, t), Slerp(Q1, Q2, t), 2t(1 - t)); T must be replicated across lanes (asserted).
+inline XMVECTOR XM_CALLCONV XMQuaternionSquadV
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    GXMVECTOR Q3,
+    HXMVECTOR T
+)
+{
+    assert( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) );
+
+    XMVECTOR TP = T;
+    const XMVECTOR Two = XMVectorSplatConstant(2, 0);
+
+    XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T);
+    XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T);
+
+    TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); // TP = t - t*t
+    TP = XMVectorMultiply(TP, Two);
+
+    XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP);
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMQuaternionSquadSetup
+(
+    XMVECTOR* pA, // out: Q1 * exp(-(ln(Q1^-1 * Q0') + ln(Q1^-1 * Q2')) / 4), primes marking sign-adjusted inputs
+    XMVECTOR* pB, // out: Q2' * exp(-(ln(Q2'^-1 * Q1) + ln(Q2'^-1 * Q3')) / 4)
+    XMVECTOR* pC, // out: Q2', i.e. Q2 or its negation
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    GXMVECTOR Q3
+)
+{
+    assert(pA);
+    assert(pB);
+    assert(pC);
+
+    XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
+    XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
+    XMVECTOR SQ2 = XMVectorNegate(Q2);
+
+    XMVECTOR Control1 = XMVectorLess(LS12, LD12);
+    SQ2 = XMVectorSelect(Q2, SQ2, Control1); // use -Q2 when |Q1 + Q2|^2 < |Q1 - Q2|^2
+
+    XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
+    XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
+    XMVECTOR SQ0 = XMVectorNegate(Q0);
+
+    XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
+    XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
+    XMVECTOR SQ3 = XMVectorNegate(Q3);
+
+    XMVECTOR Control0 = XMVectorLess(LS01, LD01);
+    XMVECTOR Control2 = XMVectorLess(LS23, LD23);
+
+    SQ0 = XMVectorSelect(Q0, SQ0, Control0); // same sum-versus-difference test, against Q1
+    SQ3 = XMVectorSelect(Q3, SQ3, Control2); // same test, against the adjusted Q2
+
+    XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
+    XMVECTOR InvQ2 = XMQuaternionInverse(SQ2);
+
+    XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
+    XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
+    XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
+    XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));
+
+    const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2);
+
+    XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
+    XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
+    ExpQ02 = XMQuaternionExp(ExpQ02);
+    ExpQ13 = XMQuaternionExp(ExpQ13);
+
+    *pA = XMQuaternionMultiply(Q1, ExpQ02);
+    *pB = XMQuaternionMultiply(SQ2, ExpQ13);
+    *pC = SQ2;
+}
+
+//------------------------------------------------------------------------------
+// Barycentric blend: Q0 when |f + g| < 0.00001, else Slerp(Slerp(Q0, Q1, s), Slerp(Q0, Q2, s), g / s) with s = f + g.
+inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    float f,
+    float g
+)
+{
+    float s = f + g;
+
+    XMVECTOR Result;
+    if ((s < 0.00001f) && (s > -0.00001f)) // guards the g / s division below
+    {
+        Result = Q0;
+    }
+    else
+    {
+        XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s);
+        XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s);
+
+        Result = XMQuaternionSlerp(Q01, Q02, g / s);
+    }
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+// Vector form of XMQuaternionBaryCentric; F and G must each be replicated across all four lanes (asserted).
+inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    GXMVECTOR F,
+    HXMVECTOR G
+)
+{
+    assert( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
+    assert( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );
+
+    const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16);
+
+    XMVECTOR S = XMVectorAdd(F, G);
+
+    XMVECTOR Result;
+    if (XMVector4InBounds(S, Epsilon)) // F + G within +/-Epsilon: return Q0 and skip the reciprocal of S
+    {
+        Result = Q0;
+    }
+    else
+    {
+        XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S);
+        XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S);
+        XMVECTOR GS = XMVectorReciprocal(S);
+        GS = XMVectorMultiply(G, GS);
+
+        Result = XMQuaternionSlerpV(Q01, Q02, GS);
+    }
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+// Transformation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Returns the identity quaternion constant g_XMIdentityR3.
+inline XMVECTOR XM_CALLCONV XMQuaternionIdentity()
+{
+    return g_XMIdentityR3.v;
+}
+
+//------------------------------------------------------------------------------
+// Packs (Pitch, Yaw, Roll, 0) into a vector and calls XMQuaternionRotationRollPitchYawFromVector.
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw
+(
+    float Pitch,
+    float Yaw,
+    float Roll
+)
+{
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+// Rotation quaternion from half-angle sines/cosines: Q0 + Q1 below, e.g. x = sp*cy*cr + cp*sy*sr.
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
+)
+{
+    static const XMVECTORF32 Sign = { 1.0f, -1.0f, -1.0f, 1.0f };
+
+    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+
+    XMVECTOR SinAngles, CosAngles;
+    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles); // (sp, cp, cp, cp)
+    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles); // (cy, sy, cy, cy)
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles); // (cr, cr, sr, cr)
+    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles); // (cp, sp, sp, sp)
+    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles); // (sy, cy, sy, sy)
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles); // (sr, sr, cr, sr)
+
+    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
+    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
+    Q1 = XMVectorMultiply(Q1, Y1);
+    Q0 = XMVectorMultiply(Q0, R0);
+    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+// Returns (NormalAxis.xyz * sin(Angle/2), cos(Angle/2)); the axis is used as passed, with no normalization here.
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v); // (x, y, z, 1)
+
+    float SinV, CosV;
+    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
+
+    XMVECTOR Scale = XMVectorSet( SinV, SinV, SinV, CosV );
+    return XMVectorMultiply(N, Scale);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
+    N = _mm_or_ps(N,g_XMIdentityR3);
+    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+    XMVECTOR vSine;
+    XMVECTOR vCosine;
+    XMVectorSinCos(&vSine,&vCosine,Scale);
+    Scale = _mm_and_ps(vSine,g_XMMask3);
+    vCosine = _mm_and_ps(vCosine,g_XMMaskW);
+    Scale = _mm_or_ps(Scale,vCosine);
+    N = _mm_mul_ps(N,Scale);
+    return N;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Normalizes Axis (asserted non-zero and finite) and calls XMQuaternionRotationNormal.
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+)
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline 
XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 q; + float r22 = M.m[2][2]; + if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2 + { + float dif10 = M.m[1][1] - M.m[0][0]; + float omr22 = 1.f - r22; + if (dif10 <= 0.f) // x^2 >= y^2 + { + float fourXSqr = omr22 - dif10; + float inv4x = 0.5f / sqrtf(fourXSqr); + q.f[0] = fourXSqr*inv4x; + q.f[1] = (M.m[0][1] + M.m[1][0])*inv4x; + q.f[2] = (M.m[0][2] + M.m[2][0])*inv4x; + q.f[3] = (M.m[1][2] - M.m[2][1])*inv4x; + } + else // y^2 >= x^2 + { + float fourYSqr = omr22 + dif10; + float inv4y = 0.5f / sqrtf(fourYSqr); + q.f[0] = (M.m[0][1] + M.m[1][0])*inv4y; + q.f[1] = fourYSqr*inv4y; + q.f[2] = (M.m[1][2] + M.m[2][1])*inv4y; + q.f[3] = (M.m[2][0] - M.m[0][2])*inv4y; + } + } + else // z^2 + w^2 >= x^2 + y^2 + { + float sum10 = M.m[1][1] + M.m[0][0]; + float opr22 = 1.f + r22; + if (sum10 <= 0.f) // z^2 >= w^2 + { + float fourZSqr = opr22 - sum10; + float inv4z = 0.5f / sqrtf(fourZSqr); + q.f[0] = (M.m[0][2] + M.m[2][0])*inv4z; + q.f[1] = (M.m[1][2] + M.m[2][1])*inv4z; + q.f[2] = fourZSqr*inv4z; + q.f[3] = (M.m[0][1] - M.m[1][0])*inv4z; + } + else // w^2 >= z^2 + { + float fourWSqr = opr22 + sum10; + float inv4w = 0.5f / sqrtf(fourWSqr); + q.f[0] = (M.m[1][2] - M.m[2][1])*inv4w; + q.f[1] = (M.m[2][0] - M.m[0][2])*inv4w; + q.f[2] = (M.m[0][1] - M.m[1][0])*inv4w; + q.f[3] = fourWSqr*inv4w; + } + } + return q.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { +1.0f, -1.0f, -1.0f, +1.0f }; + static const XMVECTORF32 XMMPMP = { -1.0f, +1.0f, -1.0f, +1.0f }; + static const XMVECTORF32 XMMMPP = { -1.0f, -1.0f, +1.0f, +1.0f }; + static const XMVECTORU32 Select0110 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 }; + static const XMVECTORU32 Select0010 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; + + XMVECTOR r0 = M.r[0]; + XMVECTOR r1 = M.r[1]; + XMVECTOR r2 = M.r[2]; + + XMVECTOR r00 = 
vdupq_lane_f32(vget_low_f32(r0), 0); + XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1); + XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + XMVECTOR r11mr00 = vsubq_f32(r11, r00); + XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + XMVECTOR r11pr00 = vaddq_f32(r11, r00); + XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR t0 = vmulq_f32( XMPMMP, r00 ); + XMVECTOR x2y2z2w2 = vmlaq_f32( t0, XMMPMP, r11 ); + x2y2z2w2 = vmlaq_f32( x2y2z2w2, XMMMPP, r22 ); + x2y2z2w2 = vaddq_f32( x2y2z2w2, g_XMOne ); + + // (r01, r02, r12, r11) + t0 = vextq_f32(r0, r0, 1); + XMVECTOR t1 = vextq_f32(r1, r1, 1); + t0 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_low_f32( t1 ) ) ); + + // (r10, r20, r21, r10) + t1 = vextq_f32(r2, r2, 3); + XMVECTOR r10 = vdupq_lane_f32( vget_low_f32(r1), 0 ); + t1 = vbslq_f32( Select0110, t1, r10 ); + + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = vaddq_f32(t0, t1); + + // (r21, r20, r10, r10) + t0 = vcombine_f32( vrev64_f32( vget_low_f32(r2) ), vget_low_f32(r10) ); + + // (r12, r02, r01, r12) + XMVECTOR t2 = vcombine_f32( vrev64_f32( vget_high_f32(r0) ), vrev64_f32( vget_low_f32(r0) ) ); + XMVECTOR t3 = vdupq_lane_f32( vget_high_f32(r1), 0 ); + t1 = vbslq_f32( Select0110, t2, t3 ); + + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = vsubq_f32(t0, t1); + xwywzw = vmulq_f32(XMMPMP, xwywzw); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + t0 = vextq_f32( xyxzyz, xyxzyz, 3 ); + t1 = vbslq_f32( Select0110, t0, x2y2z2w2 ); + t2 = vdupq_lane_f32( vget_low_f32(xwywzw), 0 ); + XMVECTOR tensor0 = vbslq_f32( g_XMSelect1110, t1, t2 ); + + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + t0 = vbslq_f32( g_XMSelect1011, xyxzyz, x2y2z2w2 ); + t1 = vdupq_lane_f32( vget_low_f32(xwywzw), 1 ); + XMVECTOR tensor1 = vbslq_f32( 
g_XMSelect1110, t0, t1 ); + + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 1); + t1 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_high_f32(xwywzw) ) ); + XMVECTOR tensor2 = vbslq_f32( Select0010, x2y2z2w2, t1 ); + + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = vbslq_f32( g_XMSelect1110, xwywzw, x2y2z2w2 ); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = vbslq_f32( x2gey2, tensor0, tensor1 ); + t1 = vbslq_f32( z2gew2, tensor2, tensor3 ); + t2 = vbslq_f32( x2py2gez2pw2, t0, t1 ); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return XMVectorDivide(t2, t0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { +1.0f, -1.0f, -1.0f, +1.0f }; + static const XMVECTORF32 XMMPMP = { -1.0f, +1.0f, -1.0f, +1.0f }; + static const XMVECTORF32 XMMMPP = { -1.0f, -1.0f, +1.0f, +1.0f }; + + XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) + XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) + XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) + + // (r00, r00, r00, r00) + XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0)); + // (r11, r11, r11, r11) + XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1)); + // (r22, r22, r22, r22) + XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2)); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) + XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); + XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) + XMVECTOR r11pr00 = _mm_add_ps(r11, r00); + XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); + + // (+r00, -r00, -r00, +r00) + XMVECTOR t0 = _mm_mul_ps(XMPMMP, r00); + + // (-r11, 
+r11, -r11, +r11) + XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); + + // (-r22, -r22, +r22, +r22) + XMVECTOR t2 = _mm_mul_ps(XMMMPP, r22); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR x2y2z2w2 = _mm_add_ps(t0, t1); + x2y2z2w2 = _mm_add_ps(t2, x2y2z2w2); + x2y2z2w2 = _mm_add_ps(x2y2z2w2, g_XMOne); + + // (r01, r02, r12, r11) + t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1,2,2,1)); + // (r10, r10, r20, r21) + t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0)); + // (r10, r20, r21, r10) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = _mm_add_ps(t0, t1); + + // (r21, r20, r10, r10) + t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0,0,0,1)); + // (r12, r12, r02, r01) + t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2)); + // (r12, r02, r01, r12) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = _mm_sub_ps(t0, t1); + xwywzw = _mm_mul_ps(XMMPMP, xwywzw); + + // (4*x^2, 4*y^2, 4*x*y, unused) + t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0,0,1,0)); + // (4*z^2, 4*w^2, 4*z*w, unused) + t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0,2,3,2)); + // (4*x*z, 4*y*z, 4*x*w, 4*y*w) + t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1,0,2,1)); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0)); + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,1,2)); + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,0,1,0)); + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1,2,3,2)); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. 
+ t0 = _mm_and_ps(x2gey2, tensor0); + t1 = _mm_andnot_ps(x2gey2, tensor1); + t0 = _mm_or_ps(t0, t1); + t1 = _mm_and_ps(z2gew2, tensor2); + t2 = _mm_andnot_ps(z2gew2, tensor3); + t1 = _mm_or_ps(t1, t2); + t0 = _mm_and_ps(x2py2gez2pw2, t0); + t1 = _mm_andnot_ps(x2py2gez2pw2, t1); + t2 = _mm_or_ps(t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return _mm_div_ps(t2, t0); +#endif +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + float* pAngle, + FXMVECTOR Q +) +{ + assert(pAxis); + assert(pAngle); + + *pAxis = Q; + + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + 
+//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsNaN +( + FXMVECTOR P +) +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsInfinite +( + FXMVECTOR P +) +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) +{ + return XMVector4Dot(P, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotCoord +( + FXMVECTOR P, + FXMVECTOR V +) +{ + // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] + + XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMVector4Dot(P, V3); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotNormal +( + FXMVECTOR P, + FXMVECTOR V +) +{ + return XMVector3Dot(P, V); +} + +//------------------------------------------------------------------------------ +// XMPlaneNormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
+ +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3ReciprocalLengthEst(P); + return XMVectorMultiply(P, Result); + +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, P); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(P,P); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_mul_ps(vDot,P); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2])); + // Prevent divide by zero + if (fLengthSq > 0) + { + fLengthSq = 1.0f/fLengthSq; + } + XMVECTORF32 vResult = { + P.vector4_f32[0] * fLengthSq, + P.vector4_f32[1] * fLengthSq, + P.vector4_f32[2] * fLengthSq, + P.vector4_f32[3] * fLengthSq + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLength = XMVector3ReciprocalLength(P); + return XMVectorMultiply( P, vLength ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // 
Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(P,P); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) +{ + XMVECTOR V1 = XMVector3Dot(P, LinePoint1); + XMVECTOR V2 = XMVector3Dot(P, LinePoint2); + XMVECTOR D = XMVectorSubtract(V1, V2); + + XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorDivide(VT, D); + + XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + return XMVectorSelect(Point, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + assert(pLinePoint1); + assert(pLinePoint2); + + 
XMVECTOR V1 = XMVector3Cross(P2, P1); + + XMVECTOR LengthSq = XMVector3LengthSq(V1); + + XMVECTOR V2 = XMVector3Cross(P2, V1); + + XMVECTOR P1W = XMVectorSplatW(P1); + XMVECTOR Point = XMVectorMultiply(V2, P1W); + + XMVECTOR V3 = XMVector3Cross(V1, P1); + + XMVECTOR P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); + + XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); + + XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneTransform +( + FXMVECTOR P, + FXMMATRIX M +) +{ + XMVECTOR W = XMVectorSplatW(P); + XMVECTOR Z = XMVectorSplatZ(P); + XMVECTOR Y = XMVectorSplatY(P); + XMVECTOR X = XMVectorSplatX(P); + + XMVECTOR Result = XMVectorMultiply(W, M.r[3]); + Result = XMVectorMultiplyAdd(Z, M.r[2], Result); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + FXMMATRIX M +) +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + M); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) +{ + 
XMVECTOR W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + return XMVectorSelect(W, Normal, g_XMSelect1110.v); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) +{ + XMVECTOR V21 = XMVectorSubtract(Point1, Point2); + XMVECTOR V31 = XMVectorSubtract(Point1, Point3); + + XMVECTOR N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + XMVECTOR D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4GreaterOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLess +( + FXMVECTOR C1, + 
FXMVECTOR C2 +) +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsNaN +( + FXMVECTOR C +) +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsInfinite +( + FXMVECTOR C +) +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorNegative +( + FXMVECTOR vColor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vTemp = veorq_u32(vColor,g_XMNegate3); + return vaddq_f32(vTemp,g_XMOne3); +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. 
+ XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp,g_XMOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation +( + FXMVECTOR vColor, + float fSaturation +) +{ + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + + const XMVECTORF32 gvLuminance = { 0.2125f, 0.7154f, 0.0721f, 0.0f }; +#if defined(_XM_NO_INTRINSICS_) + float fLuminance = (vColor.vector4_f32[0]*gvLuminance.f[0])+(vColor.vector4_f32[1]*gvLuminance.f[1])+(vColor.vector4_f32[2]*gvLuminance.f[2]); + XMVECTOR vResult; + vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[3] = vColor.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); + XMVECTOR vResult = vsubq_f32(vColor, vLuminance); + vResult = vmlaq_n_f32( vLuminance, vResult, fSaturation ); + return vbslq_f32( g_XMSelect1110, vResult, vColor ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); +// Splat fSaturation + XMVECTOR vSaturation = _mm_set_ps1(fSaturation); +// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance); + vResult = _mm_mul_ps(vResult,vSaturation); + vResult = _mm_add_ps(vResult,vLuminance); +// Retain w from the source color + vLuminance = 
_mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustContrast +( + FXMVECTOR vColor, + float fContrast +) +{ + // Result = (vColor - 0.5f) * fContrast + 0.5f; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + ((vColor.vector4_f32[0] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2] - 0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); + vResult = vmlaq_n_f32( g_XMOneHalf.v, vResult, fContrast ); + return vbslq_f32( g_XMSelect1110, vResult, vColor ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = _mm_mul_ps(vResult,vScale); // Mul by scale + vResult = _mm_add_ps(vResult,g_XMOneHalf); // Add 0.5f +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ) +{ + XMVECTOR r = XMVectorSplatX( rgb ); + XMVECTOR g = XMVectorSplatY( rgb ); + XMVECTOR b = XMVectorSplatZ( rgb ); + + XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) ); + XMVECTOR max = XMVectorMax( r, XMVectorMax( g, b ) 
); + + XMVECTOR l = XMVectorMultiply( XMVectorAdd( min, max ), g_XMOneHalf ); + + XMVECTOR d = XMVectorSubtract( max, min ); + + XMVECTOR la = XMVectorSelect( rgb, l, g_XMSelect1110 ); + + if ( XMVector3Less( d, g_XMEpsilon ) ) + { + // Achromatic, assume H and S of 0 + return XMVectorSelect( la, g_XMZero, g_XMSelect1100 ); + } + else + { + XMVECTOR s, h; + + XMVECTOR d2 = XMVectorAdd( min, max ); + + if ( XMVector3Greater( l, g_XMOneHalf ) ) + { + // d / (2-max-min) + s = XMVectorDivide( d, XMVectorSubtract( g_XMTwo, d2 ) ); + } + else + { + // d / (max+min) + s = XMVectorDivide( d, d2 ); + } + + if ( XMVector3Equal( r, max ) ) + { + // Red is max + h = XMVectorDivide( XMVectorSubtract( g, b ), d ); + } + else if ( XMVector3Equal( g, max ) ) + { + // Green is max + h = XMVectorDivide( XMVectorSubtract( b, r ), d ); + h = XMVectorAdd( h, g_XMTwo ); + } + else + { + // Blue is max + h = XMVectorDivide( XMVectorSubtract( r, g ), d ); + h = XMVectorAdd( h, g_XMFour ); + } + + h = XMVectorDivide( h, g_XMSix ); + + if ( XMVector3Less( h, g_XMZero ) ) + h = XMVectorAdd( h, g_XMOne ); + + XMVECTOR lha = XMVectorSelect( la, h, g_XMSelect1100 ); + return XMVectorSelect( s, lha, g_XMSelect1011 ); + } +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + +inline XMVECTOR XM_CALLCONV XMColorHue2Clr( FXMVECTOR p, FXMVECTOR q, FXMVECTOR h ) +{ + static const XMVECTORF32 oneSixth = { 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f }; + static const XMVECTORF32 twoThirds = { 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f }; + + XMVECTOR t = h; + + if ( XMVector3Less( t, g_XMZero ) ) + t = XMVectorAdd( t, g_XMOne ); + + if ( XMVector3Greater( t, g_XMOne ) ) + t = XMVectorSubtract( t, g_XMOne ); + + if ( XMVector3Less( t, oneSixth ) ) + { + // p + (q - p) * 6 * t + XMVECTOR t1 = XMVectorSubtract( q, p ); + XMVECTOR t2 = XMVectorMultiply( g_XMSix, t ); + return XMVectorMultiplyAdd( t1, t2, p ); + } + + if ( 
XMVector3Less( t, g_XMOneHalf ) ) + return q; + + if ( XMVector3Less( t, twoThirds ) ) + { + // p + (q - p) * 6 * (2/3 - t) + XMVECTOR t1 = XMVectorSubtract( q, p ); + XMVECTOR t2 = XMVectorMultiply( g_XMSix, XMVectorSubtract( twoThirds, t ) ); + return XMVectorMultiplyAdd( t1, t2, p ); + } + + return p; +} + +} // namespace Internal + +inline XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ) +{ + static const XMVECTORF32 oneThird = { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f }; + + XMVECTOR s = XMVectorSplatY( hsl ); + XMVECTOR l = XMVectorSplatZ( hsl ); + + if ( XMVector3NearEqual( s, g_XMZero, g_XMEpsilon ) ) + { + // Achromatic + return XMVectorSelect( hsl, l, g_XMSelect1110 ); + } + else + { + XMVECTOR h = XMVectorSplatX( hsl ); + + XMVECTOR q; + if ( XMVector3Less( l, g_XMOneHalf ) ) + { + q = XMVectorMultiply( l, XMVectorAdd ( g_XMOne, s ) ); + } + else + { + q = XMVectorSubtract( XMVectorAdd( l, s ), XMVectorMultiply( l, s ) ); + } + + XMVECTOR p = XMVectorSubtract( XMVectorMultiply( g_XMTwo, l ), q ); + + XMVECTOR r = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorAdd( h, oneThird ) ); + XMVECTOR g = DirectX::Internal::XMColorHue2Clr( p, q, h ); + XMVECTOR b = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorSubtract( h, oneThird ) ); + + XMVECTOR rg = XMVectorSelect( g, r, g_XMSelect1000 ); + XMVECTOR ba = XMVectorSelect( hsl, b, g_XMSelect1110 ); + + return XMVectorSelect( ba, rg, g_XMSelect1100 ); + } +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb ) +{ + XMVECTOR r = XMVectorSplatX( rgb ); + XMVECTOR g = XMVectorSplatY( rgb ); + XMVECTOR b = XMVectorSplatZ( rgb ); + + XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) ); + XMVECTOR v = XMVectorMax( r, XMVectorMax( g, b ) ); + + XMVECTOR d = XMVectorSubtract( v, min ); + + XMVECTOR s = ( XMVector3NearEqual( v, g_XMZero, g_XMEpsilon ) ) ? 
g_XMZero : XMVectorDivide( d, v ); + + if ( XMVector3Less( d, g_XMEpsilon ) ) + { + // Achromatic, assume H of 0 + XMVECTOR hv = XMVectorSelect( v, g_XMZero, g_XMSelect1000 ); + XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 ); + return XMVectorSelect( s, hva, g_XMSelect1011 ); + } + else + { + XMVECTOR h; + + if ( XMVector3Equal( r, v ) ) + { + // Red is max + h = XMVectorDivide( XMVectorSubtract( g, b ), d ); + + if ( XMVector3Less( g, b ) ) + h = XMVectorAdd( h, g_XMSix ); + } + else if ( XMVector3Equal( g, v ) ) + { + // Green is max + h = XMVectorDivide( XMVectorSubtract( b, r ), d ); + h = XMVectorAdd( h, g_XMTwo ); + } + else + { + // Blue is max + h = XMVectorDivide( XMVectorSubtract( r, g ), d ); + h = XMVectorAdd( h, g_XMFour ); + } + + h = XMVectorDivide( h, g_XMSix ); + + XMVECTOR hv = XMVectorSelect( v, h, g_XMSelect1000 ); + XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 ); + return XMVectorSelect( s, hva, g_XMSelect1011 ); + } +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv ) +{ + XMVECTOR h = XMVectorSplatX( hsv ); + XMVECTOR s = XMVectorSplatY( hsv ); + XMVECTOR v = XMVectorSplatZ( hsv ); + + XMVECTOR h6 = XMVectorMultiply( h, g_XMSix ); + + XMVECTOR i = XMVectorFloor( h6 ); + XMVECTOR f = XMVectorSubtract( h6, i ); + + // p = v* (1-s) + XMVECTOR p = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, s ) ); + + // q = v*(1-f*s) + XMVECTOR q = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( f, s ) ) ); + + // t = v*(1 - (1-f)*s) + XMVECTOR t = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( XMVectorSubtract( g_XMOne, f ), s ) ) ); + + auto ii = static_cast( XMVectorGetX( XMVectorMod( i, g_XMSix ) ) ); + + XMVECTOR _rgb; + + switch (ii) + { + case 0: // rgb = vtp + { + XMVECTOR vt = XMVectorSelect( t, v, g_XMSelect1000 ); + _rgb = XMVectorSelect( p, vt, g_XMSelect1100 ); + } + break; + case 1: // rgb = 
qvp + { + XMVECTOR qv = XMVectorSelect( v, q, g_XMSelect1000 ); + _rgb = XMVectorSelect( p, qv, g_XMSelect1100 ); + } + break; + case 2: // rgb = pvt + { + XMVECTOR pv = XMVectorSelect( v, p, g_XMSelect1000 ); + _rgb = XMVectorSelect( t, pv, g_XMSelect1100 ); + } + break; + case 3: // rgb = pqv + { + XMVECTOR pq = XMVectorSelect( q, p, g_XMSelect1000 ); + _rgb = XMVectorSelect( v, pq, g_XMSelect1100 ); + } + break; + case 4: // rgb = tpv + { + XMVECTOR tp = XMVectorSelect( p, t, g_XMSelect1000 ); + _rgb = XMVectorSelect( v, tp, g_XMSelect1100 ); + } + break; + default: // rgb = vpq + { + XMVECTOR vp = XMVectorSelect( p, v, g_XMSelect1000 ); + _rgb = XMVectorSelect( q, vp, g_XMSelect1100 ); + } + break; + } + + return XMVectorSelect( hsv, _rgb, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.299f, -0.147f, 0.615f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.587f, -0.289f, -0.515f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.114f, 0.436f, -0.100f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( rgb, M ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ) +{ + static const XMVECTORF32 Scale1 = { 0.0f, -0.395f, 2.032f, 0.0f }; + static const XMVECTORF32 Scale2 = { 1.140f, -0.581f, 0.0f, 0.0f }; + + XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( yuv, M ); + + return XMVectorSelect( yuv, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.2126f, -0.0997f, 0.6150f, 0.0f }; + static const XMVECTORF32 
Scale1 = { 0.7152f, -0.3354f, -0.5586f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.0722f, 0.4351f, -0.0564f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( rgb, M ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ) +{ + static const XMVECTORF32 Scale1 = { 0.0f, -0.2153f, 2.1324f, 0.0f }; + static const XMVECTORF32 Scale2 = { 1.2803f, -0.3806f, 0.0f, 0.0f }; + + XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( yuv, M ); + + return XMVectorSelect( yuv, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f }; + static const XMVECTORF32 Scale = { 1.f / 0.17697f, 1.f / 0.17697f, 1.f / 0.17697f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVectorMultiply( XMVector3Transform( rgb, M ), Scale ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +inline XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ) +{ + static const XMVECTORF32 Scale0 = { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f }; + static const XMVECTORF32 Scale1 = { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f }; + static const XMVECTORF32 Scale2 = { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f }; + static const XMVECTORF32 Scale = { 0.17697f, 0.17697f, 0.17697f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( XMVectorMultiply( xyz, Scale ), M ); + + return XMVectorSelect( xyz, clr, g_XMSelect1110 ); +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ) +{ + static const XMVECTORF32 Scale0 = { 3.2406f, -0.9689f, 0.0557f, 0.0f }; + static const XMVECTORF32 Scale1 = { -1.5372f, 1.8758f, -0.2040f, 0.0f }; + static const XMVECTORF32 Scale2 = { -0.4986f, 0.0415f, 1.0570f, 0.0f }; + static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f }; + static const XMVECTORF32 Exp = { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR lclr = XMVector3Transform( xyz, M ); + + XMVECTOR sel = XMVectorGreater( lclr, Cutoff ); + + // clr = 12.92 * lclr for lclr <= 0.0031308f + XMVECTOR smallC = XMVectorMultiply( lclr, g_XMsrgbScale ); + + // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) + XMVECTOR largeC = XMVectorSubtract( XMVectorMultiply( g_XMsrgbA1, XMVectorPow( lclr, Exp ) ), g_XMsrgbA ); + + XMVECTOR clr = XMVectorSelect( smallC, largeC, sel ); + + return XMVectorSelect( xyz, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ) +{ + static const XMVECTORF32 Scale0 = { 0.4124f, 0.2126f, 0.0193f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.3576f, 0.7152f, 0.1192f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.1805f, 0.0722f, 0.9505f, 0.0f }; + static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 0.0f }; + static const XMVECTORF32 Exp = { 2.4f, 2.4f, 2.4f, 1.0f }; + + XMVECTOR sel = XMVectorGreater( srgb, Cutoff ); + + // lclr = clr / 12.92 + XMVECTOR smallC = XMVectorDivide( srgb, g_XMsrgbScale ); + + // lclr = pow( (clr + a) / (1+a), 2.4 ) + XMVECTOR largeC = XMVectorPow( XMVectorDivide( XMVectorAdd( srgb, g_XMsrgbA ), g_XMsrgbA1 ), Exp ); + + XMVECTOR lclr = XMVectorSelect( smallC, largeC, sel ); + + XMMATRIX M( Scale0, Scale1, Scale2, 
g_XMZero ); + XMVECTOR clr = XMVector3Transform( lclr, M ); + + return XMVectorSelect( srgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 1.f }; + static const XMVECTORF32 Linear = { 12.92f, 12.92f, 12.92f, 1.f }; + static const XMVECTORF32 Scale = { 1.055f, 1.055f, 1.055f, 1.f }; + static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; + static const XMVECTORF32 InvGamma = { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.f }; + + XMVECTOR V = XMVectorSaturate(rgb); + XMVECTOR V0 = XMVectorMultiply( V, Linear ); + XMVECTOR V1 = XMVectorSubtract( XMVectorMultiply( Scale, XMVectorPow( V, InvGamma ) ), Bias ); + XMVECTOR select = XMVectorLess( V, Cutoff ); + V = XMVectorSelect( V1, V0, select ); + return XMVectorSelect( rgb, V, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ) +{ + static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 1.f }; + static const XMVECTORF32 ILinear = { 1.f / 12.92f, 1.f / 12.92f, 1.f / 12.92f, 1.f }; + static const XMVECTORF32 Scale = { 1.f / 1.055f, 1.f / 1.055f, 1.f / 1.055f, 1.f }; + static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; + static const XMVECTORF32 Gamma = { 2.4f, 2.4f, 2.4f, 1.f }; + + XMVECTOR V = XMVectorSaturate(srgb); + XMVECTOR V0 = XMVectorMultiply( V, ILinear ); + XMVECTOR V1 = XMVectorPow( XMVectorMultiply( XMVectorAdd( V, Bias ), Scale ), Gamma ); + XMVECTOR select = XMVectorGreater( V, Cutoff ); + V = XMVectorSelect( V0, V1, select ); + return XMVectorSelect( srgb, V, g_XMSelect1110 ); +} + +/**************************************************************************** + * + * Miscellaneous + * + 
****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline bool XMVerifyCPUSupport() +{ +#if defined(_MSC_VER) && defined(_WIN32) +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int CPUInfo[4] = { -1 }; + __cpuid(CPUInfo, 0); + +#ifdef __AVX2__ + if (CPUInfo[0] < 7) + return false; +#else + if (CPUInfo[0] < 1) + return false; +#endif + + __cpuid(CPUInfo, 1); + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) + // The compiler can emit FMA3 instructions even without explicit intrinsics use + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) + if ((CPUInfo[2] & 0x18081001) != 0x18081001) + return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38080001) != 0x38080001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) + if ((CPUInfo[2] & 0x18080001) != 0x18080001) + return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(_XM_SSE4_INTRINSICS_) + if ((CPUInfo[2] & 0x80001) != 0x80001) + return false; // No SSE3/SSE4.1 support +#elif defined(_XM_SSE3_INTRINSICS_) + if (!(CPUInfo[2] & 0x1)) + return false; // No SSE3 support +#endif + + // The x64 processor model requires SSE2 support, but no harm in checking + if ((CPUInfo[3] & 0x6000000) != 0x6000000) + return false; // No SSE2/SSE support + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) + __cpuidex(CPUInfo, 7, 0); + if (!(CPUInfo[1] & 0x20)) + return false; // No AVX2 support +#endif + + return true; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && 
!defined(_XM_NO_INTRINSICS_) + // ARM-NEON support is required for the Windows on ARM platform + return true; +#else + // No intrinsics path always supported + return true; +#endif +#else + // TODO: Don't just return true + return true; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) +{ + assert(!XMVector4IsInfinite(CosIncidentAngle)); + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); + XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); + + XMVECTOR V0 = XMVectorMultiply(D, D); + XMVECTOR V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + XMVECTOR Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle); + G = _mm_sub_ps(G,g_XMOne); + vTemp = _mm_add_ps(vTemp,G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is needed to deal with 
refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G,vTemp); + G = _mm_max_ps(G,vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC); + vTemp = _mm_mul_ps(GAddC,GAddC); + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + vResult = _mm_div_ps(vResult,vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC,CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC,CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC,g_XMOne); + GSubC = _mm_add_ps(GSubC,g_XMOne); + GAddC = _mm_mul_ps(GAddC,GAddC); + GSubC = _mm_mul_ps(GSubC,GSubC); + GAddC = _mm_div_ps(GAddC,GSubC); + GAddC = _mm_add_ps(GAddC,g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult,GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMScalarNearEqual +( + float S1, + float S2, + float Epsilon +) +{ + float Delta = S1 - S2; + return (fabsf(Delta) <= Epsilon); +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +inline float XMScalarModAngle +( + float Angle +) +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to PI + + // Normalize the range from 0.0f to XM_2PI + Angle = Angle + XM_PI; + // Perform the modulo, unsigned + float fTemp = fabsf(Angle); + fTemp = fTemp - (XM_2PI * static_cast(static_cast(fTemp/XM_2PI))); + // Restore the number to the range of -XM_PI to XM_PI-epsilon + fTemp = fTemp - XM_PI; + // If the modulo'd value was 
negative, restore negation + if (Angle<0.0f) { + fTemp = -fTemp; + } + return fTemp; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarSin +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + if (y > XM_PIDIV2) + { + y = XM_PI - y; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + } + + // 11-degree minimax approximation + float y2 = y * y; + return ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarSinEst +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + if (y > XM_PIDIV2) + { + y = XM_PI - y; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + } + + // 7-degree minimax approximation + float y2 = y * y; + return ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarCos +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. 
+ float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 10-degree minimax approximation + float y2 = y*y; + float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; + return sign*p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarCosEst +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 6-degree minimax approximation + float y2 = y * y; + float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; + return sign*p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCos +( + float* pSin, + float* pCos, + float Value +) +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. 
+ float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 11-degree minimax approximation + *pSin = ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; + + // 10-degree minimax approximation + float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; + *pCos = sign*p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCosEst +( + float* pSin, + float* pCos, + float Value +) +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). 
+ float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 7-degree minimax approximation + *pSin = ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; + + // 6-degree minimax approximation + float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; + *pCos = sign*p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASin +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASinEst +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f*x+0.0742610f)*x-0.2121144f)*x+1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos +( + float Value +) +{ + // Clamp input to [-1,1]. 
+ bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ( ( -0.0187293f * x + 0.0742610f ) * x - 0.2121144f ) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + diff --git a/WickedEngine/Utility/DirectXMathVector.inl b/WickedEngine/Utility/DirectXMathVector.inl new file mode 100644 index 000000000..6e848d7c8 --- /dev/null +++ b/WickedEngine/Utility/DirectXMathVector.inl @@ -0,0 +1,14643 @@ +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_XM_NO_INTRINSICS_) +#define XMISNAN(x) isnan(x) +#define XMISINF(x) isinf(x) +#endif + +#if defined(_XM_SSE_INTRINSICS_) + +#define XM3UNPACK3INTO4(l1,l2,l3) \ + XMVECTOR V3 = _mm_shuffle_ps(l2,l3,_MM_SHUFFLE(0,0,3,2));\ + XMVECTOR V2 = _mm_shuffle_ps(l2,l1,_MM_SHUFFLE(3,3,1,0));\ + V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,1,0,2));\ + XMVECTOR V4 = _mm_castsi128_ps( _mm_srli_si128(_mm_castps_si128(L3),32/8) ); + +#define XM3PACK4INTO3(v2x) \ + v2x = _mm_shuffle_ps(V2,V3,_MM_SHUFFLE(1,0,2,1));\ + V2 = _mm_shuffle_ps(V2,V1,_MM_SHUFFLE(2,2,0,0));\ + V1 = _mm_shuffle_ps(V1,V2,_MM_SHUFFLE(0,2,1,0));\ + V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(0,0,2,2));\ + V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(2,1,2,0));\ + +#endif + +/**************************************************************************** + * + * General Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Assignment operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Return a vector with all elements equaling zero +inline XMVECTOR XM_CALLCONV XMVectorZero() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { 0.0f, 0.0f, 0.0f, 0.0f }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four floating point values +inline XMVECTOR XM_CALLCONV XMVectorSet +( + float x, + float y, + float z, + float w +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { x, y, z, w 
}; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t V0 = vcreate_f32( + static_cast(*reinterpret_cast(&x)) + | (static_cast(*reinterpret_cast(&y)) << 32)); + float32x2_t V1 = vcreate_f32( + static_cast(*reinterpret_cast(&z)) + | (static_cast(*reinterpret_cast(&w)) << 32)); + return vcombine_f32(V0, V1); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps( w, z, y, x ); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four integer values +inline XMVECTOR XM_CALLCONV XMVectorSetInt +( + uint32_t x, + uint32_t y, + uint32_t z, + uint32_t w +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = { x, y, z, w }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t V0 = vcreate_u32(static_cast(x) | (static_cast(y) << 32)); + uint32x2_t V1 = vcreate_u32(static_cast(z) | (static_cast(w) << 32)); + return vcombine_u32(V0, V1); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set_epi32(static_cast(w), static_cast(z), static_cast(y), static_cast(x)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value +inline XMVECTOR XM_CALLCONV XMVectorReplicate +( + float Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32( Value ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps1( Value ); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr +( + const float *pValue +) +{ +#if defined(_XM_NO_INTRINSICS_) + float Value = pValue[0]; + XMVECTORF32 
vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_dup_f32( pValue ); +#elif defined(_XM_AVX_INTRINSICS_) + return _mm_broadcast_ss( pValue ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1( pValue ); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value +inline XMVECTOR XM_CALLCONV XMVectorReplicateInt +( + uint32_t Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32( Value ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_set1_epi32(static_cast(Value)); + return _mm_castsi128_ps(vTemp); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr +( + const uint32_t *pValue +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t Value = pValue[0]; + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_dup_u32(pValue); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1(reinterpret_cast(pValue)); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits set (true mask) +inline XMVECTOR XM_CALLCONV XMVectorTrueInt() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_s32(-1); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(-1); + return 
_mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits clear (false mask) +inline XMVECTOR XM_CALLCONV XMVectorFalseInt() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { 0.0f, 0.0f, 0.0f, 0.0f }; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the x component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatX +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[0]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_low_f32( V ), 0 ); +#elif defined(_XM_AVX2_INTRINSICS_) + return _mm_broadcastss_ps( V ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatY +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[1]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_low_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatZ +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[2]; + return vResult.v; +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatW +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[3]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatOne() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = 1.0f; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7F800000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7FC00000; + 
return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7FC00000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x34000000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x34000000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMEpsilon; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x80000000U; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x80000000U); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(static_cast(0x80000000)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Return a floating point value via an index. This is not a recommended +// function to use due to performance loss. +inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[i]; +#else + XMVECTORF32 U; + U.v = V; + return U.f[i]; +#endif +} + +//------------------------------------------------------------------------------ +// Return the X component in an FPU register. 
+inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cvtss_f32(V); +#endif +} + +// Return the Y component in an FPU register. +inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the Z component in an FPU register. +inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the W component in an FPU register. +inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + return _mm_cvtss_f32(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +// Store a component indexed by i into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i) +{ + assert( f != nullptr ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + *f = V.vector4_f32[i]; +#else + XMVECTORF32 U; + U.v = V; + *f = U.f[i]; +#endif +} + +//------------------------------------------------------------------------------ + +// Store the X component into a 32 bit float location in memory. 
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetXPtr(float *x, FXMVECTOR V)
+{
+    // x must be non-null; lane 0 of V is written to *x.
+    assert( x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_f32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(x,V,0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(x,V);
+#endif
+}
+
+// Store the Y component into a 32 bit float location in memory.
+// y must be non-null; lane 1 of V is written to *y.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V)
+{
+    assert( y != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(y,V,1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    // _mm_extract_ps yields the lane's bit pattern as an int, so the store
+    // goes through an int* to keep the bits unchanged.
+    *(reinterpret_cast<int*>(y)) = _mm_extract_ps( V, 1 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Broadcast lane 1, then store the low lane.
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(y,vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit float location in memory.
+// z must be non-null; lane 2 of V is written to *z.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V)
+{
+    assert( z != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(z,V,2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    // _mm_extract_ps yields the lane's bit pattern as an int, so the store
+    // goes through an int* to keep the bits unchanged.
+    *(reinterpret_cast<int*>(z)) = _mm_extract_ps( V, 2 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Broadcast lane 2, then store the low lane.
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(z,vResult);
+#endif
+}
+
+// Store the W component into a 32 bit float location in memory.
+_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V) +{ + assert( w != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(w,V,3); +#elif defined(_XM_SSE4_INTRINSICS_) + *(reinterpret_cast(w)) = _mm_extract_ps( V, 3 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + _mm_store_ss(w,vResult); +#endif +} + +//------------------------------------------------------------------------------ + +// Return an integer value via an index. This is not a recommended +// function to use due to performance loss. +inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[i]; +#else + XMVECTORU32 U; + U.v = V; + return U.u[i]; +#endif +} + +//------------------------------------------------------------------------------ + +// Return the X component in an integer register. +inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_u32(V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return static_cast(_mm_cvtsi128_si32(_mm_castps_si128(V))); +#endif +} + +// Return the Y component in an integer register. +inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_u32(V, 1); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128( V ); + return static_cast( _mm_extract_epi32( V1, 1 ) ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1)); + return static_cast(_mm_cvtsi128_si32(vResulti)); +#endif +} + +// Return the Z component in an integer register. 
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_u32(V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128( V ); + return static_cast( _mm_extract_epi32( V1, 2 ) ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2)); + return static_cast(_mm_cvtsi128_si32(vResulti)); +#endif +} + +// Return the W component in an integer register. +inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_u32(V, 3); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128( V ); + return static_cast( _mm_extract_epi32( V1, 3 ) ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3)); + return static_cast(_mm_cvtsi128_si32(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ + +// Store a component indexed by i into a 32 bit integer location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i) +{ + assert( x != nullptr ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + *x = V.vector4_u32[i]; +#else + XMVECTORU32 U; + U.v = V; + *x = U.u[i]; +#endif +} + +//------------------------------------------------------------------------------ + +// Store the X component into a 32 bit integer location in memory. 
+_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V) +{ + assert( x != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + *x = V.vector4_u32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(x,*reinterpret_cast(&V),0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(reinterpret_cast(x),V); +#endif +} + +// Store the Y component into a 32 bit integer location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V) +{ + assert( y != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + *y = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(y,*reinterpret_cast(&V),1); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128( V ); + *y = static_cast( _mm_extract_epi32( V1, 1 ) ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + _mm_store_ss(reinterpret_cast(y),vResult); +#endif +} + +// Store the Z component into a 32 bit integer locaCantion in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V) +{ + assert( z != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + *z = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(z,*reinterpret_cast(&V),2); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128( V ); + *z = static_cast( _mm_extract_epi32( V1, 2 ) ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss(reinterpret_cast(z),vResult); +#endif +} + +// Store the W component into a 32 bit integer location in memory. 
+_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V) +{ + assert( w != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(w,*reinterpret_cast(&V),3); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128( V ); + *w = static_cast( _mm_extract_epi32( V1, 3 ) ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + _mm_store_ss(reinterpret_cast(w),vResult); +#endif +} + +//------------------------------------------------------------------------------ + +// Set a single indexed floating point component +inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); + XMVECTORF32 U; + U.v = V; + U.f[i] = f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + V.vector4_f32[0], + y, + V.vector4_f32[2], + V.vector4_f32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(y,V,1); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(y); + vResult = _mm_insert_ps( V, vResult, 0x10 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult 
= XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif +} +// Sets the Z component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + V.vector4_f32[0], + V.vector4_f32[1], + z, + V.vector4_f32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(z,V,2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(z); + vResult = _mm_insert_ps( V, vResult, 0x20 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + w + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w,V,3); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps( V, vResult, 0x30 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + 
+//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i) +{ + assert( f != nullptr ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); + XMVECTORF32 U; + U.v = V; + U.f[i] = *f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float *x) +{ + assert( x != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + *x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float *y) +{ + assert( y != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + V.vector4_f32[0], + *y, + V.vector4_f32[2], + V.vector4_f32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y,V,1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, 
const float *z) +{ + assert( z != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + V.vector4_f32[0], + V.vector4_f32[1], + *z, + V.vector4_f32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(z,V,2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float *w) +{ + assert( w != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + *w + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) +{ +#if defined(_XM_NO_INTRINSICS_) 
+ XMVECTORU32 U = { + x, + V.vector4_u32[1], + V.vector4_u32[2], + V.vector4_u32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cvtsi32_si128(static_cast(x)); + XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp)); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { + V.vector4_u32[0], + y, + V.vector4_u32[2], + V.vector4_u32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(y,V,1); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast(y), 1 ); + return _mm_castsi128_ps( vResult ); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(y)); + // Replace the x component + vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { + V.vector4_u32[0], + V.vector4_u32[1], + z, + V.vector4_u32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(z,V,2); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast(z), 2 ); + return _mm_castsi128_ps( vResult ); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + __m128i vTemp = 
_mm_cvtsi32_si128(static_cast(z)); + // Replace the x component + vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { + V.vector4_u32[0], + V.vector4_u32[1], + V.vector4_u32[2], + w + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(w,V,3); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast(w), 3 ); + return _mm_castsi128_ps( vResult ); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(w)); + // Replace the x component + vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i) +{ + assert( x != nullptr ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = *x; + return tmp; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x) +{ + assert( x != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { + *x, + V.vector4_u32[1], + V.vector4_u32[2], + V.vector4_u32[3] + }; + 
return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_u32(x,*reinterpret_cast(&V),0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(x)); + XMVECTOR vResult = _mm_move_ss(V,vTemp); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y) +{ + assert( y != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { + V.vector4_u32[0], + *y, + V.vector4_u32[2], + V.vector4_u32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_u32(y,*reinterpret_cast(&V),1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(y)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z) +{ + assert( z != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { + V.vector4_u32[0], + V.vector4_u32[1], + *z, + V.vector4_u32[3] + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_u32(z,*reinterpret_cast(&V),2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(z)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to an integer value passed by pointer +_Use_decl_annotations_ 
+inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w) +{ + assert( w != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { + V.vector4_u32[0], + V.vector4_u32[1], + V.vector4_u32[2], + *w + }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_u32(w,*reinterpret_cast(&V),3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(w)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle +( + FXMVECTOR V, + uint32_t E0, + uint32_t E1, + uint32_t E2, + uint32_t E3 +) +{ + assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + V.vector4_f32[E0], + V.vector4_f32[E1], + V.vector4_f32[E2], + V.vector4_f32[E3] + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const uint32_t ControlElement[ 4 ] = + { + 0x03020100, // XM_SWIZZLE_X + 0x07060504, // XM_SWIZZLE_Y + 0x0B0A0908, // XM_SWIZZLE_Z + 0x0F0E0D0C, // XM_SWIZZLE_W + }; + + int8x8x2_t tbl; + tbl.val[0] = vget_low_f32(V); + tbl.val[1] = vget_high_f32(V); + + uint32x2_t idx = vcreate_u32(static_cast(ControlElement[E0]) | (static_cast(ControlElement[E1]) << 32)); + const uint8x8_t rL = vtbl2_u8( tbl, idx ); + + idx = vcreate_u32(static_cast(ControlElement[E2]) | (static_cast(ControlElement[E3]) << 32)); + const uint8x8_t rH = vtbl2_u8( tbl, idx ); + + return vcombine_f32( rL, rH ); +#elif defined(_XM_AVX_INTRINSICS_) + unsigned int elem[4] = { E0, E1, E2, E3 }; + __m128i vControl = _mm_loadu_si128( 
reinterpret_cast(&elem[0]) ); + return _mm_permutevar_ps( V, vControl ); +#else + auto aPtr = reinterpret_cast(&V); + + XMVECTOR Result; + auto pWork = reinterpret_cast(&Result); + + pWork[0] = aPtr[E0]; + pWork[1] = aPtr[E1]; + pWork[2] = aPtr[E2]; + pWork[3] = aPtr[E3]; + + return Result; +#endif +} + +//------------------------------------------------------------------------------ +inline XMVECTOR XM_CALLCONV XMVectorPermute +( + FXMVECTOR V1, + FXMVECTOR V2, + uint32_t PermuteX, + uint32_t PermuteY, + uint32_t PermuteZ, + uint32_t PermuteW +) +{ + assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const uint32_t ControlElement[ 8 ] = + { + 0x03020100, // XM_PERMUTE_0X + 0x07060504, // XM_PERMUTE_0Y + 0x0B0A0908, // XM_PERMUTE_0Z + 0x0F0E0D0C, // XM_PERMUTE_0W + 0x13121110, // XM_PERMUTE_1X + 0x17161514, // XM_PERMUTE_1Y + 0x1B1A1918, // XM_PERMUTE_1Z + 0x1F1E1D1C, // XM_PERMUTE_1W + }; + + int8x8x4_t tbl; + tbl.val[0] = vget_low_f32(V1); + tbl.val[1] = vget_high_f32(V1); + tbl.val[2] = vget_low_f32(V2); + tbl.val[3] = vget_high_f32(V2); + + uint32x2_t idx = vcreate_u32(static_cast(ControlElement[PermuteX]) | (static_cast(ControlElement[PermuteY]) << 32)); + const uint8x8_t rL = vtbl4_u8( tbl, idx ); + + idx = vcreate_u32(static_cast(ControlElement[PermuteZ]) | (static_cast(ControlElement[PermuteW]) << 32)); + const uint8x8_t rH = vtbl4_u8( tbl, idx ); + + return vcombine_f32( rL, rH ); +#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORU32 three = { 3, 3, 3, 3 }; + + alignas(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW }; + __m128i vControl = _mm_load_si128( reinterpret_cast(&elem[0]) ); + + __m128i vSelect = _mm_cmpgt_epi32( vControl, three ); + vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( 
vControl ), three ) ); + + __m128 shuffled1 = _mm_permutevar_ps( V1, vControl ); + __m128 shuffled2 = _mm_permutevar_ps( V2, vControl ); + + __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 ); + __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 ); + + return _mm_or_ps( masked1, masked2 ); +#else + + const uint32_t *aPtr[2]; + aPtr[0] = reinterpret_cast(&V1); + aPtr[1] = reinterpret_cast(&V2); + + XMVECTOR Result; + auto pWork = reinterpret_cast(&Result); + + const uint32_t i0 = PermuteX & 3; + const uint32_t vi0 = PermuteX >> 2; + pWork[0] = aPtr[vi0][i0]; + + const uint32_t i1 = PermuteY & 3; + const uint32_t vi1 = PermuteY >> 2; + pWork[1] = aPtr[vi1][i1]; + + const uint32_t i2 = PermuteZ & 3; + const uint32_t vi2 = PermuteZ >> 2; + pWork[2] = aPtr[vi2][i2]; + + const uint32_t i3 = PermuteW & 3; + const uint32_t vi3 = PermuteW >> 2; + pWork[3] = aPtr[vi3][i3]; + + return Result; +#endif +} + +//------------------------------------------------------------------------------ +// Define a control vector to be used in XMVectorSelect +// operations. The four integers specified in XMVectorSelectControl +// serve as indices to select between components in two vectors. +// The first index controls selection for the first component of +// the vectors involved in a select operation, the second index +// controls selection for the second component etc. A value of +// zero for an index causes the corresponding component from the first +// vector to be selected whereas a one causes the component from the +// second vector to be selected instead. 
+ +inline XMVECTOR XM_CALLCONV XMVectorSelectControl +( + uint32_t VectorIndex0, + uint32_t VectorIndex1, + uint32_t VectorIndex2, + uint32_t VectorIndex3 +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // x=Index0,y=Index1,z=Index2,w=Index3 + __m128i vTemp = _mm_set_epi32(static_cast(VectorIndex3), static_cast(VectorIndex2), static_cast(VectorIndex1), static_cast(VectorIndex0)); + // Any non-zero entries become 0xFFFFFFFF else 0 + vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); + return _mm_castsi128_ps(vTemp); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int32x2_t V0 = vcreate_s32(static_cast(VectorIndex0) | (static_cast(VectorIndex1) << 32)); + int32x2_t V1 = vcreate_s32(static_cast(VectorIndex2) | (static_cast(VectorIndex3) << 32)); + int32x4_t vTemp = vcombine_s32(V0, V1); + // Any non-zero entries become 0xFFFFFFFF else 0 + return vcgtq_s32(vTemp,g_XMZero); +#else + XMVECTOR ControlVector; + const uint32_t ControlElement[] = + { + XM_SELECT_0, + XM_SELECT_1 + }; + + assert(VectorIndex0 < 2); + assert(VectorIndex1 < 2); + assert(VectorIndex2 < 2); + assert(VectorIndex3 < 2); + _Analysis_assume_(VectorIndex0 < 2); + _Analysis_assume_(VectorIndex1 < 2); + _Analysis_assume_(VectorIndex2 < 2); + _Analysis_assume_(VectorIndex3 < 2); + + ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; + ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; + ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; + ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; + + return ControlVector; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSelect +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Control +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { + (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]), + (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | 
(V2.vector4_u32[1] & Control.vector4_u32[1]), + (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]), + (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]), + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vbslq_f32( Control, V2, V1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); + XMVECTOR vTemp2 = _mm_and_ps(V2,Control); + return _mm_or_ps(vTemp1,vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMergeXY +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { + V1.vector4_u32[0], + V2.vector4_u32[0], + V1.vector4_u32[1], + V2.vector4_u32[1], + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 ).val[0]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpacklo_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMergeZW +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { + V1.vector4_u32[2], + V2.vector4_u32[2], + V1.vector4_u32[3], + V2.vector4_u32[3] + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 ).val[1]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpackhi_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV 
XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control ); +} + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { + (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 
0xFFFFFFFF : 0, + }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vceqq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpeq_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { ux, uy, uz, uw }; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Treat the components of the vectors as unsigned integers and +// compare 
// individual bits between the two.  This is useful for
// comparing control vectors and result vectors returned from
// other comparison operations.
//
// Returns a mask: each component is 0xFFFFFFFF where the 32-bit integer
// values of V1 and V2 are equal, and 0 where they differ.

inline XMVECTOR XM_CALLCONV XMVectorEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0,
        (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0,
        (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0,
        (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0,
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vceqq_u32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
    return _mm_castsi128_ps(V);
#endif
}

//------------------------------------------------------------------------------

// Same per-component integer equality mask as XMVectorEqualInt, and
// additionally writes a summary to *pCR (must not be null):
//   XM_CRMASK_CR6TRUE  - all four components are equal
//   XM_CRMASK_CR6FALSE - no component is equal
//   0                  - mixed result
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMVectorEqualIntR
(
    uint32_t*    pCR,
    FXMVECTOR    V1,
    FXMVECTOR    V2
)
{
    assert( pCR != nullptr );
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control = XMVectorEqualInt(V1, V2);

    *pCR = 0;
    if (XMVector4EqualInt(Control, XMVectorTrueInt()))
    {
        // All elements are equal
        *pCR |= XM_CRMASK_CR6TRUE;
    }
    else if (XMVector4EqualInt(Control, XMVectorFalseInt()))
    {
        // No element is equal
        *pCR |= XM_CRMASK_CR6FALSE;
    }
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vResult = vceqq_u32( V1, V2 );
    // Interleave the mask bytes so that a single 32-bit lane holds one byte
    // taken from each of the four comparison lanes.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // No element is equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
    // One bit per lane: 0x0F means every lane matched, 0 means none did
    int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V));
    uint32_t CR = 0;
    if (iTemp==0x0F)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTemp)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return _mm_castsi128_ps(V);
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: each component is 0xFFFFFFFF where
// |V1 - V2| <= Epsilon for that component, and 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorNearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)

    float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0];
    float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1];
    float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2];
    float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3];

    fDeltax = fabsf(fDeltax);
    fDeltay = fabsf(fDeltay);
    fDeltaz = fabsf(fDeltaz);
    fDeltaw = fabsf(fDeltaw);

    XMVECTORU32 Control = {
        (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
        (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
        (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
        (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0,
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vDelta = vsubq_f32(V1,V2);
    return vacleq_f32( vDelta, Epsilon );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference as max(-delta, delta)
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    return vTemp;
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where V1 != V2 (float compare),
// 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorNotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmvnq_u32(vceqq_f32(V1, V2));
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpneq_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where the 32-bit integer values
// of V1 and V2 differ, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0,
        (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0,
        (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0,
        (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmvnq_u32(vceqq_u32(V1, V2));
#elif defined(_XM_SSE_INTRINSICS_)
    // Compare for equality, then flip every bit of the result
    __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
    return _mm_xor_ps(_mm_castsi128_ps(V),g_XMNegOneMask);
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where V1 > V2, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorGreater
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcgtq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpgt_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Same per-component V1 > V2 mask as XMVectorGreater, and additionally
// writes a summary to *pCR (must not be null):
//   XM_CRMASK_CR6TRUE  - all four components are greater
//   XM_CRMASK_CR6FALSE - no component is greater
//   0                  - mixed result
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMVectorGreaterR
(
    uint32_t*    pCR,
    FXMVECTOR    V1,
    FXMVECTOR    V2
)
{
    assert( pCR != nullptr );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // No element is greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTORU32 Control = { ux, uy, uz, uw };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vResult = vcgtq_f32( V1, V2 );
    // Gather one mask byte from each lane into a single 32-bit value
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // No element is greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // No element is greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where V1 >= V2, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcgeq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpge_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Same per-component V1 >= V2 mask as XMVectorGreaterOrEqual, and
// additionally writes a summary to *pCR (must not be null):
//   XM_CRMASK_CR6TRUE  - all four components are greater or equal
//   XM_CRMASK_CR6FALSE - no component is greater or equal
//   0                  - mixed result
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR
(
    uint32_t*    pCR,
    FXMVECTOR    V1,
    FXMVECTOR    V2
)
{
    assert( pCR != nullptr );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // No element is greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTORU32 Control = { ux, uy, uz, uw };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vResult = vcgeq_f32( V1, V2 );
    // Gather one mask byte from each lane into a single 32-bit value
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // No element is greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // No element is greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where V1 < V2, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorLess
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcltq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmplt_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where V1 <= V2, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
        (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcleq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmple_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where
// -Bounds <= V <= Bounds, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorInBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0,
        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0,
        (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0,
        (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    return vTemp1;
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    return vTemp1;
#endif
}

//------------------------------------------------------------------------------

// Same per-component bounds mask as XMVectorInBounds, and additionally
// writes a summary to *pCR (must not be null): XM_CRMASK_CR6BOUNDS when all
// four components are within bounds, 0 otherwise.
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMVectorInBoundsR
(
    uint32_t*    pCR,
    FXMVECTOR    V,
    FXMVECTOR    Bounds
)
{
    assert( pCR != nullptr );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0;

    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;

    XMVECTORU32 Control = { ux, uy, uz, uw };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    // Gather one mask byte from each lane into a single 32-bit value
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);

    uint32_t CR = 0;
    if (_mm_movemask_ps(vTemp1)==0xf) {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where V is NaN, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorIsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
        XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
        XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
        XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test against itself. NaN is always not equal
    uint32x4_t vTempNan = vceqq_f32( V, V );
    // Flip results
    return vmvnq_u32( vTempNan );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    return _mm_cmpneq_ps(V,V);
#endif
}

//------------------------------------------------------------------------------

// Returns a mask: 0xFFFFFFFF per component where V is +infinity or
// -infinity, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorIsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Control = {
        XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
        XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
        XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
        XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0
    };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    uint32x4_t vTemp = vandq_u32(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = vceqq_f32(vTemp,g_XMInfinity);
    // Lanes that were +/-infinity are now all ones; the rest are zero.
    return vTemp;
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // Lanes that were +/-infinity are now all ones; the rest are zero.
    return vTemp;
#endif
}

//------------------------------------------------------------------------------
// Rounding and clamping operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns the per-component minimum of V1 and V2.
inline XMVECTOR XM_CALLCONV XMVectorMin
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = {
        (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0],
        (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1],
        (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2],
        (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vminq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_min_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Returns the per-component maximum of V1 and V2.
inline XMVECTOR XM_CALLCONV XMVectorMax
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = {
        (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0],
        (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1],
        (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2],
        (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmaxq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_max_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

namespace Internal
{
    // Round to nearest (even) a.k.a. banker's rounding
    // A fractional part of exactly 0.5 rounds to whichever neighbouring
    // integer is even; every other value rounds to the closest integer.
    inline float round_to_nearest( float x )
    {
        float i = floorf(x);
        x -= i;                 // x now holds the fractional part, in [0, 1)
        if(x < 0.5f)
            return i;
        if(x > 0.5f)
            return i + 1.f;

        // Exactly half-way: keep i when it is even, otherwise step up to i + 1
        float int_part;
        (void)modff( i / 2.f, &int_part );
        if ( (2.f*int_part) == i )
        {
            return i;
        }

        return i + 1.f;
    }
}

// NOTE(review): precise float semantics are forced here for compilers other
// than clang/GCC/Intel, presumably so the add/subtract rounding trick in
// XMVectorRound is not optimized away - confirm.
#if !defined(_XM_NO_INTRINSICS_) && !defined(__clang__) && !defined(__GNUC__) && !defined(__INTEL_COMPILER)
#pragma float_control(push)
#pragma float_control(precise, on)
#endif

// Rounds each component of V to the nearest integer value.
inline XMVECTOR XM_CALLCONV XMVectorRound
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = {
        Internal::round_to_nearest(V.vector4_f32[0]),
        Internal::round_to_nearest(V.vector4_f32[1]),
        Internal::round_to_nearest(V.vector4_f32[2]),
        Internal::round_to_nearest(V.vector4_f32[3])
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
    return vrndnq_f32(V);
#else
    // Add and then subtract a sign-matched copy of g_XMNoFraction so the
    // FPU performs the rounding; keep V unchanged where |V| > g_XMNoFraction.
    uint32x4_t sign = vandq_u32( V, g_XMNegativeZero );
    uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign );
    float32x4_t R1 = vaddq_f32( V, sMagic );
    R1 = vsubq_f32( R1, sMagic );
    float32x4_t R2 = vabsq_f32( V );
    uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
    XMVECTOR vResult = vbslq_f32( mask, R1, V );
    return vResult;
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
    // Add and then subtract a sign-matched copy of g_XMNoFraction so the
    // FPU performs the rounding.
    __m128 sign = _mm_and_ps( V, g_XMNegativeZero );
    __m128 sMagic = _mm_or_ps( g_XMNoFraction, sign );
    __m128 R1 = _mm_add_ps( V, sMagic );
    R1 = _mm_sub_ps( R1, sMagic );
    // Select the rounded value where |V| <= g_XMNoFraction, else the original V
    __m128 R2 = _mm_and_ps( V, g_XMAbsMask );
    __m128 mask = _mm_cmple_ps( R2, g_XMNoFraction );
    R2 = _mm_andnot_ps(mask,V);
    R1 = _mm_and_ps(R1,mask);
    XMVECTOR vResult = _mm_xor_ps(R1, R2);
    return vResult;
#endif
}

#if !defined(_XM_NO_INTRINSICS_) && !defined(__clang__) && !defined(__GNUC__) && !defined(__INTEL_COMPILER)
#pragma float_control(pop)
#endif

+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTruncate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + uint32_t i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (fabsf(V.vector4_f32[i]) < 8388608.0f) + { + Result.vector4_f32[i] = static_cast(static_cast(V.vector4_f32[i])); + } + else + { + Result.vector4_f32[i] = V.vector4_f32[i]; + } + } + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + return vrndq_f32(V); +#else + float32x4_t vTest = vabsq_f32( V ); + vTest = vcltq_f32( vTest, g_XMNoFraction ); + + int32x4_t vInt = vcvtq_s32_f32( V ); + XMVECTOR vResult = vcvtq_f32_s32( vInt ); + + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32( vTest, vResult, V ); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); + // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Convert to int and back to float for rounding with truncation + __m128i vInt = _mm_cvttps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); + vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); + return vResult; +#endif +} + 
//------------------------------------------------------------------------------

// Rounds each component of V down to the nearest integer value
// (toward negative infinity).
inline XMVECTOR XM_CALLCONV XMVectorFloor
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = {
        floorf(V.vector4_f32[0]),
        floorf(V.vector4_f32[1]),
        floorf(V.vector4_f32[2]),
        floorf(V.vector4_f32[3])
    };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
    return vrndmq_f32(V);
#else
    float32x4_t vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );
    // Truncate
    int32x4_t vInt = vcvtq_s32_f32( V );
    XMVECTOR vResult = vcvtq_f32_s32( vInt );
    // Truncation rounded up for negative non-integers; flag those lanes
    XMVECTOR vLarger = vcgtq_f32( vResult, V );
    // 0 -> 0, 0xffffffff -> -1.0f
    vLarger = vcvtq_f32_s32( vLarger );
    vResult = vaddq_f32( vResult, vLarger );
    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_floor_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Truncate
    __m128i vInt = _mm_cvttps_epi32(V);
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // Truncation rounded up for negative non-integers; flag those lanes
    __m128 vLarger = _mm_cmpgt_ps( vResult, V );
    // 0 -> 0, 0xffffffff -> -1.0f
    vLarger = _mm_cvtepi32_ps( _mm_castps_si128( vLarger ) );
    vResult = _mm_add_ps( vResult, vLarger );
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest));
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest));
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Rounds each component of V up to the nearest integer value
// (toward positive infinity).
inline XMVECTOR XM_CALLCONV XMVectorCeiling
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = {
        ceilf(V.vector4_f32[0]),
        ceilf(V.vector4_f32[1]),
        ceilf(V.vector4_f32[2]),
        ceilf(V.vector4_f32[3])
    };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
    return vrndpq_f32(V);
#else
    float32x4_t vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );
    // Truncate
    int32x4_t vInt = vcvtq_s32_f32( V );
    XMVECTOR vResult = vcvtq_f32_s32( vInt );
    // Truncation rounded down for positive non-integers; flag those lanes
    XMVECTOR vSmaller = vcltq_f32( vResult, V );
    // 0 -> 0, 0xffffffff -> -1.0f
    vSmaller = vcvtq_f32_s32( vSmaller );
    // Subtracting -1.0f adds one to the flagged lanes
    vResult = vsubq_f32( vResult, vSmaller );
    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_ceil_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Truncate
    __m128i vInt = _mm_cvttps_epi32(V);
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // Truncation rounded down for positive non-integers; flag those lanes
    __m128 vSmaller = _mm_cmplt_ps( vResult, V );
    // 0 -> 0, 0xffffffff -> -1.0f
    vSmaller = _mm_cvtepi32_ps( _mm_castps_si128( vSmaller ) );
    // Subtracting -1.0f adds one to the flagged lanes
    vResult = _mm_sub_ps( vResult, vSmaller );
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest));
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest));
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Clamps each component of V to the range [Min, Max].
// Asserts that Min <= Max in every component.
inline XMVECTOR XM_CALLCONV XMVectorClamp
(
    FXMVECTOR V,
    FXMVECTOR Min,
    FXMVECTOR Max
)
{
    assert(XMVector4LessOrEqual(Min, Max));

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVectorMax(Min, V);
    Result = XMVectorMin(Max, Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vResult;
    vResult = vmaxq_f32(Min, V);
    vResult = vminq_f32(Max, vResult);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult;
    vResult = _mm_max_ps(Min, V);
    vResult = _mm_min_ps(Max, vResult);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Clamps each component of V to the range [0, 1].
inline XMVECTOR XM_CALLCONV XMVectorSaturate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    return XMVectorClamp(V, Zero, g_XMOne.v);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = vmaxq_f32(V, vdupq_n_f32(0) );
    // Set>1 to 1
    return vminq_f32(vResult, vdupq_n_f32(1.0f) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Set>1 to 1
    return _mm_min_ps(vResult,g_XMOne);
#endif
}

//------------------------------------------------------------------------------
// Bitwise logical operations
//------------------------------------------------------------------------------

// Returns the bitwise AND of V1 and V2 (V1 & V2), component by component.
inline XMVECTOR XM_CALLCONV XMVectorAndInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Result = {
        V1.vector4_u32[0] & V2.vector4_u32[0],
        V1.vector4_u32[1] & V2.vector4_u32[1],
        V1.vector4_u32[2] & V2.vector4_u32[2],
        V1.vector4_u32[3] & V2.vector4_u32[3]
    };
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vandq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_and_ps(V1,V2);
#endif
}

//------------------------------------------------------------------------------

// Returns V1 AND the complement of V2 (V1 & ~V2), component by component.
inline XMVECTOR XM_CALLCONV XMVectorAndCInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Result = {
        V1.vector4_u32[0] & ~V2.vector4_u32[0],
        V1.vector4_u32[1] & ~V2.vector4_u32[1],
        V1.vector4_u32[2] & ~V2.vector4_u32[2],
        V1.vector4_u32[3] & ~V2.vector4_u32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vbicq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_andnot_si128(a, b) computes (~a) & b, hence the swapped operands
    __m128i V = _mm_andnot_si128( _mm_castps_si128(V2), _mm_castps_si128(V1) );
    return _mm_castsi128_ps(V);
#endif
}

//------------------------------------------------------------------------------

// Returns the bitwise OR of V1 and V2 (V1 | V2), component by component.
inline XMVECTOR XM_CALLCONV XMVectorOrInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Result = {
        V1.vector4_u32[0] | V2.vector4_u32[0],
        V1.vector4_u32[1] | V2.vector4_u32[1],
        V1.vector4_u32[2] | V2.vector4_u32[2],
        V1.vector4_u32[3] | V2.vector4_u32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vorrq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return _mm_castsi128_ps(V);
#endif
}

//------------------------------------------------------------------------------

// Returns the bitwise NOR of V1 and V2 (~(V1 | V2)), component by component.
inline XMVECTOR XM_CALLCONV XMVectorNorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Result = {
        ~(V1.vector4_u32[0] | V2.vector4_u32[0]),
        ~(V1.vector4_u32[1] | V2.vector4_u32[1]),
        ~(V1.vector4_u32[2] | V2.vector4_u32[2]),
        ~(V1.vector4_u32[3] | V2.vector4_u32[3])
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t Result = vorrq_u32(V1,V2);
    return vbicq_u32(g_XMNegOneMask, Result);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i Result;
    Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    // AND-NOT against g_XMNegOneMask inverts the OR result
    Result = _mm_andnot_si128( Result,g_XMNegOneMask);
    return _mm_castsi128_ps(Result);
#endif
}

//------------------------------------------------------------------------------

// Returns the bitwise XOR of V1 and V2 (V1 ^ V2), component by component.
inline XMVECTOR XM_CALLCONV XMVectorXorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORU32 Result = {
        V1.vector4_u32[0] ^ V2.vector4_u32[0],
        V1.vector4_u32[1] ^ V2.vector4_u32[1],
        V1.vector4_u32[2] ^ V2.vector4_u32[2],
        V1.vector4_u32[3] ^ V2.vector4_u32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return veorq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return _mm_castsi128_ps(V);
#endif
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns -V, component by component.
inline XMVECTOR XM_CALLCONV XMVectorNegate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = {
        -V.vector4_f32[0],
        -V.vector4_f32[1],
        -V.vector4_f32[2],
        -V.vector4_f32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vnegq_f32(V);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Z;

    Z = _mm_setzero_ps();

    // Computed as 0 - V
    return _mm_sub_ps( Z, V );
#endif
}

//------------------------------------------------------------------------------

// Returns V1 + V2, component by component.
inline XMVECTOR XM_CALLCONV XMVectorAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = {
        V1.vector4_f32[0] + V2.vector4_f32[0],
        V1.vector4_f32[1] + V2.vector4_f32[1],
        V1.vector4_f32[2] + V2.vector4_f32[2],
        V1.vector4_f32[3] + V2.vector4_f32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vaddq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_add_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Returns a vector whose four components all hold the horizontal sum
// V.x + V.y + V.z + V.w.
inline XMVECTOR XM_CALLCONV XMVectorSum
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result;
    Result.f[0] =
    Result.f[1] =
    Result.f[2] =
    Result.f[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3];
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
    XMVECTOR vTemp = vpaddq_f32(V, V);
    return vpaddq_f32(vTemp,vTemp);
#else
    float32x2_t v1 = vget_low_f32(V);
    float32x2_t v2 = vget_high_f32(V);
    v1 = vadd_f32(v1, v2);
    v1 = vpadd_f32(v1, v1);
    return vcombine_f32(v1, v1);
#endif
#elif defined(_XM_SSE3_INTRINSICS_)
    XMVECTOR vTemp = _mm_hadd_ps(V, V);
    return _mm_hadd_ps(vTemp,vTemp);
#elif defined(_XM_SSE_INTRINSICS_)
    // Add neighbouring pairs, then add the two pair sums together
    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1));
    XMVECTOR vTemp2 = _mm_add_ps(V, vTemp);
    vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2));
    return _mm_add_ps(vTemp, vTemp2);
#endif
}

//------------------------------------------------------------------------------

// Adds two vectors of angles and wraps each result by one turn:
// 2Pi is added where the sum is below -Pi and 2Pi is subtracted where it is
// at or above Pi.
inline XMVECTOR XM_CALLCONV XMVectorAddAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Add the given angles together.  If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorAdd(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = vaddq_f32(V1,V2);
    // Less than -Pi?
    uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi from all entries greater than or equal to Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_add_ps(V1,V2);
    // Less than -Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi from all entries greater than or equal to Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Returns V1 - V2, component by component.
inline XMVECTOR XM_CALLCONV XMVectorSubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = {
        V1.vector4_f32[0] - V2.vector4_f32[0],
        V1.vector4_f32[1] - V2.vector4_f32[1],
        V1.vector4_f32[2] - V2.vector4_f32[2],
        V1.vector4_f32[3] - V2.vector4_f32[3]
    };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsubq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_sub_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Subtracts two vectors of angles and wraps each result by one turn:
// 2Pi is added where the difference is below -Pi and 2Pi is subtracted where
// it is at or above Pi.
inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Subtract the given angles.  If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorSubtract(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = vsubq_f32(V1,V2);
    // Less than -Pi?
    uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi from all entries greater than or equal to Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_sub_ps(V1,V2);
    // Less than -Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi from all entries greater than or equal to Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Returns V1 * V2, component by component.
inline XMVECTOR XM_CALLCONV XMVectorMultiply
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = {
        V1.vector4_f32[0] * V2.vector4_f32[0],
        V1.vector4_f32[1] * V2.vector4_f32[1],
        V1.vector4_f32[2] * V2.vector4_f32[2],
        V1.vector4_f32[3] * V2.vector4_f32[3]
    };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmulq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_mul_ps( V1, V2 );
#endif
}

//------------------------------------------------------------------------------

// Returns V1 * V2 + V3, component by component.
inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = {
        V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0],
        V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1],
        V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2],
        V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]
    };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
    return vfmaq_f32( V3, V1, V2 );
#else
    return vmlaq_f32( V3, V1, V2 );
#endif
#elif defined(_XM_FMA3_INTRINSICS_)
    return _mm_fmadd_ps( V1, V2, V3 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_mul_ps( V1, V2 );
    return _mm_add_ps(vResult, V3 );
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorDivide
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = {
        V1.vector4_f32[0] / V2.vector4_f32[0],
        V1.vector4_f32[1] / V2.vector4_f32[1],
        V1.vector4_f32[2] /
V2.vector4_f32[2], + V1.vector4_f32[3] / V2.vector4_f32[3] + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + return vdivq_f32( V1, V2 ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(V2); + float32x4_t S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + return vmulq_f32( V1, Reciprocal ); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]), + V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]), + V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]), + V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]) + }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + return vfmsq_f32( V3, V1, V2 ); +#else + return vmlsq_f32( V3, V1, V2 ); +#endif +#elif defined(_XM_FMA3_INTRINSICS_) + return _mm_fnmadd_ps(V1, V2, V3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R = _mm_mul_ps( V1, V2 ); + return _mm_sub_ps( V3, R ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorScale +( + FXMVECTOR V, + float ScaleFactor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + V.vector4_f32[0] * ScaleFactor, + V.vector4_f32[1] * ScaleFactor, + V.vector4_f32[2] * ScaleFactor, + V.vector4_f32[3] * ScaleFactor + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_n_f32( V, ScaleFactor ); +#elif 
defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ps1(ScaleFactor); + return _mm_mul_ps(vResult,V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + 1.f / V.vector4_f32[0], + 1.f / V.vector4_f32[1], + 1.f / V.vector4_f32[2], + 1.f / V.vector4_f32[3] + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrecpeq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rcp_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + 1.f / V.vector4_f32[0], + 1.f / V.vector4_f32[1], + 1.f / V.vector4_f32[2], + 1.f / V.vector4_f32[3] + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + float32x4_t one = vdupq_n_f32(1.0f); + return vdivq_f32(one,V); +#else + // 2 iterations of Newton-Raphson refinement + float32x4_t Reciprocal = vrecpeq_f32(V); + float32x4_t S = vrecpsq_f32( Reciprocal, V ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V ); + return vmulq_f32( S, Reciprocal ); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(g_XMOne,V); +#endif +} + +//------------------------------------------------------------------------------ +// Return an estimated square root +inline XMVECTOR XM_CALLCONV XMVectorSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + sqrtf(V.vector4_f32[0]), + sqrtf(V.vector4_f32[1]), + sqrtf(V.vector4_f32[2]), + sqrtf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 1 iteration of Newton-Raphson refinment of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32( V, S0 ); 
+ float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + XMVECTOR Result = vmulq_f32( V, S1 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + sqrtf(V.vector4_f32[0]), + sqrtf(V.vector4_f32[1]), + sqrtf(V.vector4_f32[2]), + sqrtf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 3 iterations of Newton-Raphson refinment of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( V, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + float32x4_t P2 = vmulq_f32( V, S2 ); + float32x4_t R2 = vrsqrtsq_f32( P2, S2 ); + float32x4_t S3 = vmulq_f32( S2, R2 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + XMVECTOR Result = vmulq_f32( V, S3 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + }; + return 
Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrsqrteq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t S0 = vrsqrteq_f32(V); + + float32x4_t P0 = vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( V, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + + return vmulq_f32( S1, R1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp2 +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + powf(2.0f, V.vector4_f32[0]), + powf(2.0f, V.vector4_f32[1]), + powf(2.0f, V.vector4_f32[2]), + powf(2.0f, V.vector4_f32[3]) + }; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t itrunc = vcvtq_s32_f32(V); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(V, ftrunc); + + float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); + poly = vmlaq_f32( g_XMExpEst5, poly, y ); + poly = vmlaq_f32( g_XMExpEst4, poly, y ); + poly = vmlaq_f32( g_XMExpEst3, poly, y ); + poly = vmlaq_f32( g_XMExpEst2, poly, y ); + poly = vmlaq_f32( g_XMExpEst1, poly, y ); + poly = vmlaq_f32( g_XMOne, poly, y ); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + 
float32x4_t result0 = XMVectorDivide(biased, poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(biased, poly); + result1 = vmulq_f32(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + int32x4_t comp = vcltq_s32( V, g_XMBin128); + float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); + + comp = vcltq_s32(itrunc, g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32( comp, result1, result0 ); + + comp = vcltq_s32(V, g_XMBinNeg150); + float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); + + int32x4_t sign = vandq_s32(V, g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32( comp, result4, result2 ); + + int32x4_t t0 = vandq_s32(V, g_XMQNaNTest); + int32x4_t t1 = vandq_s32(V, g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i itrunc = _mm_cvttps_epi32(V); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(V, ftrunc); + __m128 poly = _mm_mul_ps(g_XMExpEst7, y); + poly = _mm_add_ps(g_XMExpEst6, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst5, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst4, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst3, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst2, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst1, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMOne, poly); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = 
_mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(V), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExpE +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + expf(V.vector4_f32[0]), + expf(V.vector4_f32[1]), + expf(V.vector4_f32[2]), + expf(V.vector4_f32[3]) + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // expE(V) = exp2(vin*log2(e)) + float32x4_t Ve = vmulq_f32(g_XMLgE, V); + + int32x4_t itrunc = vcvtq_s32_f32(Ve); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(Ve, ftrunc); + + + float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); + poly = vmlaq_f32( g_XMExpEst5, poly, y ); + poly = vmlaq_f32( g_XMExpEst4, poly, y ); + poly = vmlaq_f32( g_XMExpEst3, poly, y ); + poly = vmlaq_f32( g_XMExpEst2, poly, y ); + poly = vmlaq_f32( g_XMExpEst1, poly, y ); + poly = vmlaq_f32( g_XMOne, poly, y ); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(biased, poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(biased, poly); + result1 = vmulq_f32(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + int32x4_t comp = vcltq_s32( Ve, g_XMBin128); + float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); + + comp = vcltq_s32(itrunc, g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32( comp, result1, result0 ); + + comp = vcltq_s32(Ve, g_XMBinNeg150); + float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); + + int32x4_t sign = vandq_s32(Ve, g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32( comp, result4, result2 ); + + int32x4_t t0 
= vandq_s32(Ve, g_XMQNaNTest); + int32x4_t t1 = vandq_s32(Ve, g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // expE(V) = exp2(vin*log2(e)) + __m128 Ve = _mm_mul_ps(g_XMLgE, V); + + __m128i itrunc = _mm_cvttps_epi32(Ve); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(Ve, ftrunc); + __m128 poly = _mm_mul_ps(g_XMExpEst7, y); + poly = _mm_add_ps(g_XMExpEst6, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst5, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst4, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst3, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst2, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst1, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMOne, poly); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(Ve), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = 
_mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(Ve), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(Ve), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(Ve), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(Ve), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp +( + FXMVECTOR V +) +{ + return XMVectorExp2(V); +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) + +namespace Internal +{ + inline __m128i multi_sll_epi32(__m128i value, __m128i count) + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_sll_epi32(v, c); + + v = 
_mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_sll_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); + return _mm_castps_si128(result); + } + + inline __m128i multi_srl_epi32(__m128i value, __m128i count) + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_srl_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); + return _mm_castps_si128(result); + } + + inline __m128i GetLeadingBit(const __m128i value) + { + static const XMVECTORI32 g_XM0000FFFF = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF }; + static const XMVECTORI32 g_XM000000FF = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF }; + static const XMVECTORI32 g_XM0000000F = { 0x0000000F, 
0x0000000F, 0x0000000F, 0x0000000F }; + static const XMVECTORI32 g_XM00000003 = { 0x00000003, 0x00000003, 0x00000003, 0x00000003 }; + + __m128i v = value, r, c, b, s; + + c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + r = _mm_slli_epi32(b, 4); // r = (b << 4) + v = multi_srl_epi32(v, r); // v = (v >> r) + + c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 3); // s = (b << 3) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 2); // s = (b << 2) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 1); // s = (b << 1) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + s = _mm_srli_epi32(v, 1); + r = _mm_or_si128(r, s); + return r; + } +} // namespace Internal + +#endif // _XM_SSE_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) + +namespace Internal +{ + inline int32x4_t GetLeadingBit(const int32x4_t value) + { + static const XMVECTORI32 g_XM0000FFFF = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF }; + static const XMVECTORI32 g_XM000000FF = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF }; + static const XMVECTORI32 g_XM0000000F = { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F }; + static const XMVECTORI32 g_XM00000003 = { 0x00000003, 0x00000003, 0x00000003, 0x00000003 }; + + int32x4_t v = value, r, c, b, s; + + c = vcgtq_s32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = vshrq_n_u32(c, 31); // b = (c ? 
1 : 0) + r = vshlq_n_s32(b, 4); // r = (b << 4) + r = vnegq_s32( r ); + v = vshlq_u32( v, r ); // v = (v >> r) + + c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 3); // s = (b << 3) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 2); // s = (b << 2) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 1); // s = (b << 1) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + s = vshrq_n_u32(v, 1); + r = vorrq_s32(r, s); + return r; + } + +} // namespace Internal + +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog2 +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); + + XMVECTORF32 Result = { + logf(V.vector4_f32[0])*fScale, + logf(V.vector4_f32[1])*fScale, + logf(V.vector4_f32[2])*fScale, + logf(V.vector4_f32[3])*fScale + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); + int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); + int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_u32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
+ int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_u32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); + int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(tmp, g_XMOne); + + float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); + log2 = vmlaq_f32( g_XMLogEst5, log2, y ); + log2 = vmlaq_f32( g_XMLogEst4, log2, y ); + log2 = vmlaq_f32( g_XMLogEst3, log2, y ); + log2 = vmlaq_f32( g_XMLogEst2, log2, y ); + log2 = vmlaq_f32( g_XMLogEst1, log2, y ); + log2 = vmlaq_f32( g_XMLogEst0, log2, y ); + log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); + isInfinite = vceqq_s32(isInfinite, g_XMInfinity); + + int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); + int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); + int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); + + int32x4_t isZero = vandq_s32((V), g_XMAbsMask); + isZero = vceqq_s32(isZero, g_XMZero); + + int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32((V), g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); + tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); + result = vbslq_f32(isPositive, result, tmp); + result = vbslq_f32(isNaN, g_XMQNaN, result ); + return result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = 
_mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. + __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. 
+ __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); + log2 = _mm_add_ps(g_XMLogEst6, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst5, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst4, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst3, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst2, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst1, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst0, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = 
_mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLogE +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + logf(V.vector4_f32[0]), + logf(V.vector4_f32[1]), + logf(V.vector4_f32[2]), + logf(V.vector4_f32[3]) + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); + int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); + int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_u32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_u32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); + int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); + + // Compute the approximation. 
+ int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(tmp, g_XMOne); + + float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); + log2 = vmlaq_f32( g_XMLogEst5, log2, y ); + log2 = vmlaq_f32( g_XMLogEst4, log2, y ); + log2 = vmlaq_f32( g_XMLogEst3, log2, y ); + log2 = vmlaq_f32( g_XMLogEst2, log2, y ); + log2 = vmlaq_f32( g_XMLogEst1, log2, y ); + log2 = vmlaq_f32( g_XMLogEst0, log2, y ); + log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); + + log2 = vmulq_f32(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); + isInfinite = vceqq_s32(isInfinite, g_XMInfinity); + + int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); + int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); + int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); + + int32x4_t isZero = vandq_s32((V), g_XMAbsMask); + isZero = vceqq_s32(isZero, g_XMZero); + + int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32((V), g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); + tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); + result = vbslq_f32(isPositive, result, tmp); + result = vbslq_f32(isNaN, g_XMQNaN, result ); + return result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
+ __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); + log2 = _mm_add_ps(g_XMLogEst6, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst5, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst4, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst3, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst2, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst1, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst0, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); 
+ isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog +( + FXMVECTOR V +) +{ + return XMVectorLog2(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorPow +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + powf(V1.vector4_f32[0], V2.vector4_f32[0]), + powf(V1.vector4_f32[1], V2.vector4_f32[1]), + powf(V1.vector4_f32[2], V2.vector4_f32[2]), + powf(V1.vector4_f32[3], V2.vector4_f32[3]) + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), + powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), + powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), + powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + alignas(16) float a[4]; + alignas(16) float b[4]; + _mm_store_ps( a, V1 ); + _mm_store_ps( b, V2 ); 
+ XMVECTOR vResult = _mm_setr_ps( + powf(a[0],b[0]), + powf(a[1],b[1]), + powf(a[2],b[2]), + powf(a[3],b[3])); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAbs +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + fabsf(V.vector4_f32[0]), + fabsf(V.vector4_f32[1]), + fabsf(V.vector4_f32[2]), + fabsf(V.vector4_f32[3]) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32( V ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult,V); + vResult = _mm_max_ps(vResult,V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32( V1, vResult, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + vResult = _mm_mul_ps(vResult,V2); + vResult = _mm_sub_ps(V1,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorModAngles +( + FXMVECTOR Angles +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; 
+ +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32( Angles, vResult, g_XMTwoPi ); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + vResult = _mm_mul_ps(vResult,g_XMTwoPi); + vResult = _mm_sub_ps(Angles,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSin +( + FXMVECTOR V +) +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
+ uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
+ __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCos +( + FXMVECTOR V +) +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0 ); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + }; + + XMVECTORF32 Cos = { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), 
+ cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the 
value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = 
XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTan +( + FXMVECTOR V +) +{ + // Cody and Waite algorithm to compute tangent. + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + tanf(V.vector4_f32[0]), + tanf(V.vector4_f32[1]), + tanf(V.vector4_f32[2]), + tanf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 TanCoefficients0 = { 1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f }; + static const XMVECTORF32 TanCoefficients1 = { 4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f }; + static const XMVECTORF32 TanConstants = { 1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; + static const XMVECTORU32 Mask = { 0x1, 0x1, 0x1, 0x1 }; + + XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); + + XMVECTOR Zero = XMVectorZero(); + + XMVECTOR C0 = XMVectorSplatX(TanConstants.v); + XMVECTOR C1 = XMVectorSplatY(TanConstants.v); + XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); + + XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); + + VA = XMVectorRound(VA); + + XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); + + XMVECTOR VB = XMVectorAbs(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + VB = vcvtq_u32_f32( VB ); +#elif defined(_XM_SSE_INTRINSICS_) && 
!defined(_XM_NO_INTRINSICS_) + reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); +#else + for (size_t i = 0; i < 4; i++) + { + VB.vector4_u32[i] = static_cast<uint32_t>(VB.vector4_f32[i]); + } +#endif + + XMVECTOR VC2 = XMVectorMultiply(VC, VC); + + XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); + XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); + XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); + XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); + XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v); + XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); + XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); + XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); + + XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); + VBIsEven = XMVectorEqualInt(VBIsEven, Zero); + + XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); + XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); + N = XMVectorMultiplyAdd(VC2, N, T5); + D = XMVectorMultiplyAdd(VC2, D, T2); + N = XMVectorMultiply(VC2, N); + D = XMVectorMultiplyAdd(VC2, D, T1); + N = XMVectorMultiplyAdd(VC, N, VC); + XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); + D = XMVectorMultiplyAdd(VC2, D, T0); + + N = XMVectorSelect(N, VC, VCNearZero); + D = XMVectorSelect(D, g_XMOne.v, VCNearZero); + + XMVECTOR R0 = XMVectorNegate(N); + XMVECTOR R1 = XMVectorDivide(N,D); + R0 = XMVectorDivide(D,R0); + + XMVECTOR VIsZero = XMVectorEqual(V, Zero); + + XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); + + Result = XMVectorSelect(Result, Zero, VIsZero); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + sinhf(V.vector4_f32[0]), + sinhf(V.vector4_f32[1]), + sinhf(V.vector4_f32[2]), + sinhf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.442695040888963f, 1.442695040888963f, 
1.442695040888963f, 1.442695040888963f }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, Scale.v ); + XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V, Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + XMVECTOR V2 = _mm_mul_ps(V, Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + coshf(V.vector4_f32[0]), + coshf(V.vector4_f32[1]), + coshf(V.vector4_f32[2]), + coshf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return vaddq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V,Scale.v); + V1 = _mm_add_ps(V1,g_XMNegativeOne.v); + XMVECTOR V2 = _mm_mul_ps(V, Scale.v); + V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR 
XM_CALLCONV XMVectorTanH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + tanhf(V.vector4_f32[0]), + tanhf(V.vector4_f32[1]), + tanhf(V.vector4_f32[2]), + tanhf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f }; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f }; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = _mm_mul_ps(E,g_XMOneHalf.v); + E = _mm_add_ps(E,g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v,E); + return _mm_sub_ps(g_XMOne.v,E); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASin +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + asinf(V.vector4_f32[0]), + asinf(V.vector4_f32[1]), + asinf(V.vector4_f32[2]), + asinf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACos +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute 
(1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan +( + FXMVECTOR V +) +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocal(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + 
uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + uint32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(TC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + Result = vmlaq_f32( g_XMOne, Result, x2 ); + Result = vmulq_f32( Result, x ); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // 
Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]) + 
}; + return Result.v; +#else + + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + static const XMVECTORF32 ATan2Constants = { XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f }; + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = 
XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR V = XMVectorDivide(Y, X); + + XMVECTOR R0 = XMVectorATan(V); + + R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + return XMVectorSelect(Result, R2, ATanResultValid); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinEst +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
+ __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosEst +( + FXMVECTOR V +) +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = 
_mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + }; + + XMVECTORF32 Cos = { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, 
x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + tanf(V.vector4_f32[0]), + tanf(V.vector4_f32[1]), + tanf(V.vector4_f32[2]), + tanf(V.vector4_f32[3]) + }; + return Result.v; +#else + + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASinEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result; + Result.f[0] = asinf( V.vector4_f32[0] ); + Result.f[1] = asinf( V.vector4_f32[1] ); + Result.f[2] = asinf( V.vector4_f32[2] ); + Result.f[3] = asinf( V.vector4_f32[3] ); + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACosEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATanEst +( + FXMVECTOR V +) +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocalEst(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne ); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign ); + uint32x4_t x = vbslq_f32(comp, V, invV ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32( vConstants, 
Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 ); + Result = vmulq_f32( Result, x ); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = _mm_add_ps(Result, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, 
Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]), + }; + return Result.v; +#else + + static const XMVECTORF32 ATan2Constants = { XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */ }; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + 
XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_n_f32( V0, L, t ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps( V1, V0 ); + XMVECTOR S = _mm_set_ps1( t ); + XMVECTOR Result = _mm_mul_ps( L, S ); + return _mm_add_ps( Result, V0 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerpV +( + FXMVECTOR V0, + FXMVECTOR V1, + FXMVECTOR T +) +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_f32( V0, L, T ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps( V1, V0 ); + XMVECTOR Result = _mm_mul_ps( Length, T ); + return _mm_add_ps( Result, V0 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * 
Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; + float t0 = t3 - 2.0f * t2 + t; + float p1 = -2.0f * t3 + 3.0f * t2; + float t1 = t3 - t2; + + XMVECTOR vResult = vmulq_n_f32(Position0, p0 ); + vResult = vmlaq_n_f32( vResult, Tangent0, t0 ); + vResult = vmlaq_n_f32( vResult, Position1, p1 ); + vResult = vmlaq_n_f32( vResult, Tangent1, t1 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(P1, Position1); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(T1, Tangent1); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + HXMVECTOR T +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if 
defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T , T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { -3.0f, -2.0f, 3.0f, -1.0f }; + static const XMVECTORF32 CatMulT3 = { 2.0f, 1.0f, -2.0f, 1.0f }; + + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3 ); + // T3 now has the pre-result. 
+ // I need to add t.y only + T2 = vandq_u32(T,g_XMMaskY); + T3 = vaddq_f32(T3,T2); + // Add 1.0f to x + T3 = vaddq_f32(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vmulq_lane_f32( Position0, vget_low_f32( T3 ), 0 ); // T3[0] + // Mul the y constant to Tangent0 + vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32( T3 ), 1 ); // T3[1] + // Mul the z constant to Position1 + vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32( T3 ), 0 ); // T3[2] + // Mul the w constant to Tangent1 + vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32( T3 ), 1 ); // T3[3] + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { -3.0f, -2.0f, 3.0f, -1.0f }; + static const XMVECTORF32 CatMulT3 = { 2.0f, 1.0f, -2.0f, 1.0f }; + + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = _mm_mul_ps(T3,CatMulT3); + // T3 now has the pre-result. 
+ T3 = _mm_add_ps(T3,T2); + // I need to add t.y only + T2 = _mm_and_ps(T,g_XMMaskY); + T3 = _mm_add_ps(T3,T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1)); + T2 = _mm_mul_ps(T2,Tangent0); + vResult = _mm_add_ps(vResult,T2); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2)); + T2 = _mm_mul_ps(T2,Position1); + vResult = _mm_add_ps(vResult,T2); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3)); + T3 = _mm_mul_ps(T3,Tangent1); + vResult = _mm_add_ps(vResult,T3); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; + float p1 = (3.0f * t3 - 5.0f * t2 
+ 2.0f) * 0.5f; + float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; + float p3 = (t3 - t2) * 0.5f; + + XMVECTOR P1 = vmulq_n_f32(Position1, p1); + XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); + XMVECTOR P3 = vmulq_n_f32(Position3, p3); + XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); + P0 = vaddq_f32(P0,P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P0 = _mm_mul_ps(P0, Position0); + P1 = _mm_mul_ps(P1, Position1); + P2 = _mm_mul_ps(P2, Position2); + P3 = _mm_mul_ps(P3, Position3); + P0 = _mm_add_ps(P0,P1); + P2 = _mm_add_ps(P2,P3); + P0 = _mm_add_ps(P0,P2); + return P0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + HXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTORF32 vResult = { + 0.5f*((-fx*fx*fx + 2 * fx*fx - fx)*Position0.vector4_f32[0] + + (3 * fx*fx*fx - 5 * fx*fx + 2)*Position1.vector4_f32[0] + + (-3 * fx*fx*fx + 4 * fx*fx + fx)*Position2.vector4_f32[0] + + (fx*fx*fx - fx*fx)*Position3.vector4_f32[0]), + + 0.5f*((-fy*fy*fy + 2 * fy*fy - fy)*Position0.vector4_f32[1] + + (3 * fy*fy*fy - 5 * fy*fy + 2)*Position1.vector4_f32[1] + + (-3 * fy*fy*fy + 4 * fy*fy + fy)*Position2.vector4_f32[1] + + (fy*fy*fy - fy*fy)*Position3.vector4_f32[1]), + + 0.5f*((-fz*fz*fz + 2 * fz*fz - fz)*Position0.vector4_f32[2] + + (3 * fz*fz*fz - 5 * fz*fz + 2)*Position1.vector4_f32[2] + + (-3 * fz*fz*fz + 4 * fz*fz + fz)*Position2.vector4_f32[2] + + (fz*fz*fz - 
fz*fz)*Position3.vector4_f32[2]), + + 0.5f*((-fw*fw*fw + 2 * fw*fw - fw)*Position0.vector4_f32[3] + + (3 * fw*fw*fw - 5 * fw*fw + 2)*Position1.vector4_f32[3] + + (-3 * fw*fw*fw + 4 * fw*fw + fw)*Position2.vector4_f32[3] + + (fw*fw*fw - fw*fw)*Position3.vector4_f32[3]) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { 2.0f, 2.0f, 2.0f, 2.0f }; + static const XMVECTORF32 Catmul3 = { 3.0f, 3.0f, 3.0f, 3.0f }; + static const XMVECTORF32 Catmul4 = { 4.0f, 4.0f, 4.0f, 4.0f }; + static const XMVECTORF32 Catmul5 = { 5.0f, 5.0f, 5.0f, 5.0f }; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2,T2); + vResult = vsubq_f32(vResult,T); + vResult = vsubq_f32(vResult,T3); + vResult = vmulq_f32(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3,Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp,Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2,Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp,T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3,T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult,g_XMOneHalf); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { 2.0f, 2.0f, 2.0f, 2.0f }; + static const XMVECTORF32 Catmul3 = { 3.0f, 3.0f, 3.0f, 3.0f }; + static const XMVECTORF32 Catmul4 = { 4.0f, 4.0f, 4.0f, 4.0f }; + static const XMVECTORF32 Catmul5 = { 5.0f, 5.0f, 5.0f, 5.0f }; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2,T2); + vResult = _mm_sub_ps(vResult,T); + vResult = _mm_sub_ps(vResult,T3); + 
vResult = _mm_mul_ps(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3); + XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,Catmul2); + vTemp = _mm_mul_ps(vTemp,Position1); + vResult = _mm_add_ps(vResult,vTemp); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2,Catmul4); + vTemp2 = _mm_mul_ps(T3,Catmul3); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,T); + vTemp = _mm_mul_ps(vTemp,Position2); + vResult = _mm_add_ps(vResult,vTemp); + // Position3 is the last term + T3 = _mm_sub_ps(T3,T2); + T3 = _mm_mul_ps(T3,Position3); + vResult = _mm_add_ps(vResult,T3); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + R1 = vmlaq_n_f32( Position0, R1, f); + return vmlaq_n_f32( R1, R2, g ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR SF = _mm_set_ps1(f); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + XMVECTOR SG = _mm_set_ps1(g); + R1 = _mm_mul_ps(R1,SF); + R2 = _mm_mul_ps(R2,SG); + R1 = _mm_add_ps(R1,Position0); + R1 
= _mm_add_ps(R1,R2); + return R1; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + HXMVECTOR G +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + R1 = vmlaq_f32( Position0, R1, F ); + return vmlaq_f32( R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + R1 = _mm_mul_ps(R1,F); + R2 = _mm_mul_ps(R2,G); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#endif +} + +/**************************************************************************** + * + * 2D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif 
defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0); +#endif +} + 
+//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t vDelta = vsub_f32(vget_low_u32(V1), vget_low_u32(V2)); + uint32x2_t vTemp = vacle_f32( vDelta, vget_low_u32(Epsilon) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + return ( r == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = 
_mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)!=3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif 
defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR +( + 
FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) 
== 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x2_t B = vget_low_f32( Bounds ); + // Test if less than or equal + uint32x2_t ivTemp1 = vcle_f32(VL,B); + // Negate the bounds + float32x2_t vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + uint32x2_t ivTemp2 = vcle_f32(vTemp2,VL); + // Blend answers + ivTemp1 = vand_u32(ivTemp1,ivTemp2); + // x and y in bounds? + return ( vget_lane_u64( ivTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + // Test against itself. 
NaN is always not equal + uint32x2_t vTempNan = vceq_f32( VL, VL ); + // If x or y are NaN, the mask is zero + return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x2_t vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) ); + // Compare to infinity + vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) ); + // If any are infinity, the signs are true. + return vget_lane_u64( vTemp, 0 ) != 0; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x or z are infinity, the signs are true. 
+ return ((_mm_movemask_ps(vTemp)&3) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Perform the dot product on x and y + float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); + vTemp = vpadd_f32( vTemp, vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps( V1, V2, 0x3f ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V1, V2); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_moveldup_ps(vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fCross; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate 
= { 1.f, -1.f, 0, 0 }; + + float32x2_t vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) ); + vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) ); + vTemp = vpadd_f32( vTemp, vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap x and y + XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1)); + // Perform the muls + vResult = _mm_mul_ps(vResult,V1); + // Splat y + XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + // Sub the values + vResult = _mm_sub_ss(vResult,vTemp); + // Splat the cross product + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq +( + FXMVECTOR V +) +{ + return XMVector2Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32( vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = 
XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32( vTemp ); + Result = vmul_f32( vTemp, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + const float32x2_t zero = 
vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( vTemp ); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( vTemp, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
+
+// The intrinsic paths scale V by a reciprocal-square-root estimate
+// (vrsqrte_f32 / _mm_rsqrt_ps / _mm_rsqrt_ss) of x*x + y*y; the no-intrinsics
+// path multiplies V by XMVector2ReciprocalLength(V), which is defined outside
+// this section.
+// NOTE(review): the upper two lanes differ by path - the NEON path returns the
+// scaled (x, y) pair twice, while the SSE paths also scale z and w.
+inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector2ReciprocalLength(V);
+    Result = XMVectorMultiply(V, Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Only the low half (x, y) of V takes part in the computation
+    float32x2_t VL = vget_low_f32(V);
+    // Dot2
+    float32x2_t vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    // Reciprocal sqrt (estimate)
+    vTemp = vrsqrte_f32( vTemp );
+    // Normalize
+    float32x2_t Result = vmul_f32( VL, vTemp );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    // Mask 0x3f: multiply only the x and y lanes, write the sum to all four lanes
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
+    return _mm_mul_ps(vResult, V);
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    // Horizontal add puts x*x + y*y in lane 0
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_rsqrt_ss(vLengthSq);
+    // Splat the reciprocal length to all four lanes before scaling V
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    vLengthSq = _mm_mul_ps(vLengthSq, V);
+    return vLengthSq;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = _mm_rsqrt_ss(vLengthSq);
+    // Splat the reciprocal length to all four lanes before scaling V
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    vLengthSq = _mm_mul_ps(vLengthSq,V);
+    return vLengthSq;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// XMVector2Normalize: scales V by the reciprocal of the length of (V.x, V.y).
+// In the NEON and SSE paths a zero length yields zero and an infinite length
+// yields g_XMQNaN. The no-intrinsics path only guards the zero case (the
+// scale factor then stays at 0) and multiplies all four components by it.
+// NOTE(review): z and w differ by path - the NEON path duplicates the
+// normalized (x, y) pair, while the SSE paths divide z and w by the length.
+
+inline XMVECTOR XM_CALLCONV XMVector2Normalize
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR vResult = XMVector2Length( V );
+    float fLength = vResult.vector4_f32[0];
+
+    // Prevent divide by zero
+    if (fLength > 0) {
+        fLength = 1.0f/fLength;
+    }
+
+    // fLength now holds the scale factor; it is applied to z and w as well
+    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
+    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
+    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
+    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Only the low half (x, y) of V takes part in the computation
+    float32x2_t VL = vget_low_f32(V);
+    // Dot2
+    float32x2_t vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    // Record the degenerate cases before vTemp is overwritten
+    uint32x2_t VEqualsZero = vceq_f32( vTemp, vdup_n_f32(0) );
+    uint32x2_t VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) );
+    // Reciprocal sqrt (2 iterations of Newton-Raphson)
+    float32x2_t S0 = vrsqrte_f32( vTemp );
+    float32x2_t P0 = vmul_f32( vTemp, S0 );
+    float32x2_t R0 = vrsqrts_f32( P0, S0 );
+    float32x2_t S1 = vmul_f32( S0, R0 );
+    float32x2_t P1 = vmul_f32( vTemp, S1 );
+    float32x2_t R1 = vrsqrts_f32( P1, S1 );
+    vTemp = vmul_f32( S1, R1 );
+    // Normalize
+    float32x2_t Result = vmul_f32( VL, vTemp );
+    // Zero squared length -> zero result; infinite squared length -> QNaN
+    Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result );
+    Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    // Mask 0x3f: multiply only the x and y lanes, write the sum to all four lanes
+    XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Build a mask that is all ones where the squared length is not infinity
+    // (vLengthSq holds this mask, not a length, from here on)
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide by the length to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Clear every lane where the length compared equal to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#elif defined(_XM_SSE3_INTRINSICS_)
+    // Perform the dot product on x and y only
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    // Duplicate the even lanes so x*x + y*y is present in all four lanes
+    vLengthSq = _mm_moveldup_ps(vLengthSq);
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Build a mask that is all ones where the squared length is not infinity
+    // (vLengthSq holds this mask, not a length, from here on)
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    // Divide by the length to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Clear every lane where the length compared equal to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y only
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    // Splat x*x + y*y to all four lanes
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Build a mask that is all ones where the squared length is not infinity
+    // (vLengthSq holds this mask, not a length, from here on)
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide by the length to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Clear every lane where the length compared equal to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// XMVector2ClampLength: scalar-bound front end for XMVector2ClampLengthV.
+
+inline XMVECTOR XM_CALLCONV XMVector2ClampLength
+(
+    FXMVECTOR V,
+    float    LengthMin,
+    float    LengthMax
+)
+{
+    // Replicate each scalar bound into a vector, then forward to the vector overload
+    const XMVECTOR UpperBound = XMVectorReplicate(LengthMax);
+    const XMVECTOR LowerBound = XMVectorReplicate(LengthMin);
+    return XMVector2ClampLengthV(V, LowerBound, UpperBound);
+}
+
+//------------------------------------------------------------------------------
+// XMVector2ClampLengthV: rescales V so that its 2D length lies between
+// LengthMin and LengthMax, returning V itself when the length is already
+// inside that range. The asserts require each bound to have equal x and y,
+// to be >= zero, and LengthMax >= LengthMin.
+
+inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV
+(
+    FXMVECTOR V,
+    FXMVECTOR LengthMin,
+    FXMVECTOR LengthMax
+)
+{
+    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
+    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
+    assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
+    assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
+    assert(XMVector2GreaterOrEqual(LengthMax, LengthMin));
+
+    const XMVECTOR SquaredLength = XMVector2LengthSq(V);
+    const XMVECTOR InverseLength = XMVectorReciprocalSqrt(SquaredLength);
+
+    // Masks for the two degenerate inputs; the masks compare equal only when neither is set
+    const XMVECTOR IsInfinite = XMVectorEqualInt(SquaredLength, g_XMInfinity.v);
+    const XMVECTOR IsZero = XMVectorEqual(SquaredLength, XMVectorZero());
+    const XMVECTOR IsRegular = XMVectorEqualInt(IsInfinite, IsZero);
+
+    // Length and unit direction; degenerate inputs fall back to the squared length
+    const XMVECTOR ScaledLength = XMVectorMultiply(SquaredLength, InverseLength);
+    const XMVECTOR ScaledVector = XMVectorMultiply(V, InverseLength);
+    const XMVECTOR Length = XMVectorSelect(SquaredLength, ScaledLength, IsRegular);
+    const XMVECTOR Direction = XMVectorSelect(SquaredLength, ScaledVector, IsRegular);
+
+    // Pick the bound that was crossed, if any (upper bound first, then lower)
+    const XMVECTOR AboveMax = XMVectorGreater(Length, LengthMax);
+    const XMVECTOR BelowMin = XMVectorLess(Length, LengthMin);
+    const XMVECTOR CappedLength = XMVectorSelect(Length, LengthMax, AboveMax);
+    const XMVECTOR TargetLength = XMVectorSelect(CappedLength, LengthMin, BelowMin);
+
+    const XMVECTOR Rescaled = XMVectorMultiply(Direction, TargetLength);
+
+    // Preserve the original vector (with no precision loss) if the length falls within the given range
+    const XMVECTOR InRange = XMVectorEqualInt(AboveMax, BelowMin);
+    return XMVectorSelect(Rescaled, V, InRange);
+}
+
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result; + Result = XMVector2Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector2RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +// Return the refraction of a 2D vector +inline XMVECTOR XM_CALLCONV XMVector2RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float RY = 1.0f-(IDotN*IDotN); + float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); + RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); + if (RX>=0.0f) { + RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); + } else { + RX = 0.0f; + } + if (RY>=0.0f) { + RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); + } else { + RY = 0.0f; 
+ } + + XMVECTOR vResult; + vResult.vector4_f32[0] = RX; + vResult.vector4_f32[1] = RY; + vResult.vector4_f32[2] = 0.0f; + vResult.vector4_f32[3] = 0.0f; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t IL = vget_low_f32( Incident ); + float32x2_t NL = vget_low_f32( Normal ); + float32x2_t RIL = vget_low_f32( RefractionIndex ); + // Get the 2D Dot product of Incident-Normal + float32x2_t vTemp = vmul_f32(IL, NL); + float32x2_t IDotN = vpadd_f32( vTemp, vTemp ); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN); + vTemp = vmul_f32(vTemp,RIL); + vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL ); + // If any terms are <=0, sqrt() will fail, punt to zero + uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) ); + // Sqrt(vTemp) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t S2 = vmul_f32( S1, R1 ); + vTemp = vmul_f32( vTemp, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = vmla_f32( vTemp, RIL, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + float32x2_t vResult = vmul_f32(RIL,IL); + vResult = vmls_f32( vResult, vTemp, NL ); + vResult = vand_u32(vResult,vMask); + return vcombine_f32(vResult, vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + // Get the 2D Dot product of Incident-Normal + XMVECTOR IDotN = XMVector2Dot(Incident, Normal); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + vTemp = 
_mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + // If any terms are <=0, sqrt() will fail, punt to zero + XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = _mm_sqrt_ps(vTemp); + XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); + vTemp = _mm_add_ps(vTemp,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex,Incident); + vTemp = _mm_mul_ps(vTemp,Normal); + vResult = _mm_sub_ps(vResult,vTemp); + vResult = _mm_and_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + -V.vector4_f32[1], + V.vector4_f32[0], + 0.f, + 0.f + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 }; + const float32x2_t zero = vdup_n_f32(0); + + float32x2_t VL = vget_low_f32( V ); + float32x2_t Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) ); + return vcombine_f32( Result, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + vResult = _mm_mul_ps(vResult,g_XMNegateX); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = 
XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR L1 = XMVector2ReciprocalLength(V1); + XMVECTOR L2 = XMVector2ReciprocalLength(V2); + + XMVECTOR Dot = XMVector2Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector2LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector2Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2IntersectLine +( + FXMVECTOR Line1Point1, + FXMVECTOR Line1Point2, + FXMVECTOR Line2Point1, + GXMVECTOR Line2Point2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || 
defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); + XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); + XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); + + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + + XMVECTOR Result; + const XMVECTOR Zero = XMVectorZero(); + if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) + { + if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) + { + // Coincident + Result = g_XMInfinity.v; + } + else + { + // Parallel + Result = g_XMQNaN.v; + } + } + else + { + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR Scale = XMVectorReciprocal(C1); + Scale = XMVectorMultiply(C2, Scale); + Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); + XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); + XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); + // Generate the cross products + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + // If C1 is not close to epsilon, use the calculated value + XMVECTOR vResultMask = _mm_setzero_ps(); + vResultMask = _mm_sub_ps(vResultMask,C1); + vResultMask = _mm_max_ps(vResultMask,C1); + // 0xFFFFFFFF if the calculated value is to be used + vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon); + // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
+ XMVECTOR vFailMask = _mm_setzero_ps(); + vFailMask = _mm_sub_ps(vFailMask,C2); + vFailMask = _mm_max_ps(vFailMask,C2); + vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon); + XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity); + vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN); + // vFail is NAN or INF + vFail = _mm_or_ps(vFail,vFailMask); + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR vResult = _mm_div_ps(C2,C1); + vResult = _mm_mul_ps(vResult,V1); + vResult = _mm_add_ps(vResult,Line1Point1); + // Use result, or failure value + vResult = _mm_and_ps(vResult,vResultMask); + vResultMask = _mm_andnot_ps(vResultMask,vFail); + vResult = _mm_or_ps(vResult,vResultMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x4_t Result = vmlaq_lane_f32( M.r[3], M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + 
assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #ifdef _MSC_VER + # ifdef _PREFAST_ + # pragma prefast(push) + # pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) + # endif + #endif + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + + #ifdef _MSC_VER + # ifdef _PREFAST_ + # pragma prefast(pop) + # endif + #endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + 
+ __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32( reinterpret_cast(pOutputVector), R ); + pOutputVector += sizeof(XMFLOAT4)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + vst1q_f32( reinterpret_cast(pOutputVector), vResult ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if ( InputStride == sizeof(XMFLOAT2) ) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF) ) + { + // Packed input, aligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + XMVECTOR Y = 
XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 2; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !(reinterpret_cast(pInputVector) & 0xF) && !(InputStride & 0xF) ) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF) ) + { + // Aligned input, aligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = 
XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + else + { + // Aligned input, unaligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) +{ + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return 
XMVectorDivide( Result, W ); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + #ifdef _MSC_VER + # ifdef _PREFAST_ + # pragma prefast(push) + # pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) + # endif + #endif + + XMStoreFloat2(reinterpret_cast(pOutputVector), Result); + + #ifdef _MSC_VER + # ifdef _PREFAST_ + # pragma prefast(pop) + # endif + #endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && 
(OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + V.val[0] = vdivq_f32( vResult0, W ); + V.val[1] = vdivq_f32( vResult1, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); +#endif + + vst2q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT2)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + V = vget_high_f32( vResult ); + float32x2_t W = vdup_lane_f32( V, 1 ); + +#if 
defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + V = vget_low_f32( vResult ); + V = vdiv_f32( V, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x2_t Reciprocal = vrecpe_f32( W ); + float32x2_t S = vrecps_f32( Reciprocal, W ); + Reciprocal = vmul_f32( S, Reciprocal ); + S = vrecps_f32( Reciprocal, W ); + Reciprocal = vmul_f32( S, Reciprocal ); + + V = vget_low_f32( vResult ); + V = vmul_f32( V, Reciprocal ); +#endif + + vst1_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if ( InputStride == sizeof(XMFLOAT2) ) + { + if ( OutputStride == sizeof(XMFLOAT2) ) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V2 = _mm_div_ps( vTemp, W ); + + 
vTemp = _mm_movelh_ps( V1, V2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V2 = _mm_div_ps( vTemp, W ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + 
_mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !(reinterpret_cast(pInputVector) & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) ); + XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( 
vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Y, M.r[1]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x4_t Result = vmulq_lane_f32( M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = 
reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Y, row1); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #ifdef _MSC_VER + # ifdef _PREFAST_ + # pragma prefast(push) + # pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) + # endif + #endif + + XMStoreFloat2(reinterpret_cast(pOutputVector), Result); + + #ifdef _MSC_VER + # ifdef _PREFAST_ + # pragma prefast(pop) + # endif + #endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; + + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax + XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx + + __prefetch( pInputVector ); + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + V.val[0] = vResult0; + V.val[1] = vResult1; + + vst2q_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += sizeof(XMFLOAT2)*4; + + i 
+= 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32( row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + V = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if ( InputStride == sizeof(XMFLOAT2) ) + { + if ( OutputStride == sizeof(XMFLOAT2) ) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 
0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !(reinterpret_cast(pInputVector) & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = 
XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) ); + XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +/**************************************************************************** + * + * 3D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), 
vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&7; + uint32_t CR = 0; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = 
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + (V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7; + uint32_t CR = 0; + if (iTemp==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + dy = 
fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32( V1, V2 ); + uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // w is don't care + return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || 
(V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = 
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t 
vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 
0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1,ivTemp2); + // in bounds? + int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x,y and z in bounds? 
(w is don't care) + return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + // If x or y or z are NaN, the mask is zero + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); + // Compare to infinity + vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
+ return ((_mm_movemask_ps(vTemp)&7) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fValue; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32( V1, V2 ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + return vcombine_f32( v1, v1 ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps( V1, V2, 0x7f ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1,V2); + vTemp = _mm_and_ps(vTemp, g_XMMask3); + vTemp = _mm_hadd_ps(vTemp,vTemp); + return _mm_hadd_ps(vTemp,vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V1,V2); + // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.vector4_f32[0] = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.vector4_f32[2] + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.vector4_f32[0] = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] 
+ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), + (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), + (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v1xy = vget_low_f32(V1); + float32x2_t v2xy = vget_low_f32(V2); + + float32x2_t v1yx = vrev64_f32( v1xy ); + float32x2_t v2yx = vrev64_f32( v2xy ); + + float32x2_t v1zz = vdup_lane_f32( vget_high_f32(V1), 0 ); + float32x2_t v2zz = vdup_lane_f32( vget_high_f32(V2), 0 ); + + XMVECTOR vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) ); + vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) ); + vResult = veorq_u32( vResult, g_XMFlipY ); + return vandq_u32( vResult, g_XMMask3 ); +#elif defined(_XM_SSE_INTRINSICS_) + // y1,z1,x1,w1 + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1)); + // z2,x2,y2,w2 + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2)); + // Perform the left operation + XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2); + // z1,x1,y1,w1 + vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1)); + // y2,z2,x2,w2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2)); + // Perform the right operation + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + // Subract the right from left, and return answer + vResult = _mm_sub_ps(vResult,vTemp1); + // Set w to zero + return _mm_and_ps(vResult,g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq +( + FXMVECTOR V +) +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = 
XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_sqrt_ps(vDot); + vDot = _mm_div_ps(g_XMOne,vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); 
+ // Get the reciprocal + vDot = _mm_sqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_div_ps(g_XMOne,vDot); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32( v1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = 
XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + // Normalize + return vmulq_f32( V, vcombine_f32(v2,v2) ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot,V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Perform the 
normalization + vDot = _mm_mul_ps(vDot,V); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector3Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); + uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + v2 = vmul_f32( S1, R1 ); + // Normalize + XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); + vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); + return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe 
on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the 
division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector3ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + 
XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * 
RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex ); + + uint32x4_t vResult = vcleq_f32(R,g_XMZero); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32( R, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( R, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + R = vmulq_f32( R, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32( R, RefractionIndex, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + vResult = 
vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32( vResult, R, Normal ); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = _mm_mul_ps(IDotN, IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex,IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Orthogonal +( + FXMVECTOR V +) +{ + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR YZYY = XMVectorSwizzle(V); + + XMVECTOR NegativeV = XMVectorSubtract(Zero, V); + + XMVECTOR ZIsNegative = XMVectorLess(Z, Zero); + XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero); + + XMVECTOR S = XMVectorAdd(YZYY, Z); + XMVECTOR D = XMVectorSubtract(YZYY, Z); + + XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); + + XMVECTOR R0 = XMVectorPermute(NegativeV, S); + XMVECTOR R1 = XMVectorPermute(V, D); + + return XMVectorSelect(R1, R0, Select); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst +( + 
FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR L1 = XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector3LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = 
XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector3Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) +{ + assert(pParallel != nullptr); + assert(pPerpendicular != nullptr); + + XMVECTOR Scale = XMVector3Dot(V, Normal); + + XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + XMVECTOR Result = XMQuaternionMultiply(Q, A); + return XMQuaternionMultiply(Result, RotationQuaternion); +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + return XMQuaternionMultiply(Result, Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + 
+ XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmlaq_lane_f32( M.r[3], M.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif +#endif + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = 
XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 
); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32( reinterpret_cast(pOutputVector), R ); + pOutputVector += sizeof(XMFLOAT4)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + vst1q_f32( reinterpret_cast(pOutputVector), vResult ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF) ) + { + // Packed input, aligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( 
reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, 
vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 4; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( 
reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if ( !(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF) ) + { + // Aligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + 
pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) +{ + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide( Result, W ); +} + +//------------------------------------------------------------------------------ + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = 
XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); 
// Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + V.val[0] = vdivq_f32( vResult0, W ); + V.val[1] = vdivq_f32( vResult1, W ); + V.val[2] = vdivq_f32( vResult2, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + V.val[2] = vmulq_f32( vResult2, Reciprocal ); +#endif + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + vResult = vdivq_f32( vResult, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, 
Reciprocal ); +#endif + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( 
vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) 
); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( 
reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = 
_mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = 
XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = 
XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, row2); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax + XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx + + __prefetch( pInputVector ); + + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( 
vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + + vst3q_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = 
_mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V1 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V2 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V3 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V4 = _mm_add_ps( vTemp, vTemp3 ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 
2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V1 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V2 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V3 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V4 = _mm_add_ps( vTemp, vTemp3 ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + 
+ // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = 
XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +//------------------------------------------------------------------------------ + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const 
XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == 
sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); + XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); + XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); + + XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); + XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); + XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ); + + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( Transform.r[3] ); + float32x2_t r = vget_low_f32( Transform.r[0] ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( Transform.r[3] ); + r = vget_high_f32( Transform.r[0] ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( Transform.r[1] ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( Transform.r[1] ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( Transform.r[2] ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( Transform.r[2] ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, 
V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + vResult0 = vdivq_f32( vResult0, W ); + vResult1 = vdivq_f32( vResult1, W ); + vResult2 = vdivq_f32( vResult2, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult0 = vmulq_f32( vResult0, Reciprocal ); + vResult1 = vmulq_f32( vResult1, Reciprocal ); + vResult2 = vmulq_f32( vResult2, Reciprocal ); +#endif + + V.val[0] = vmlaq_f32( OffsetX, vResult0, ScaleX ); + V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY ); + V.val[2] = vmlaq_f32( OffsetZ, vResult2, ScaleZ ); + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + if ( i < VectorCount) + { + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + vResult = vdivq_f32( vResult, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = 
vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); +#endif + + vResult = vmlaq_f32( Offset, vResult, Scale ); + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + 
XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V1 = _mm_add_ps( vTemp, Offset ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V2 = _mm_add_ps( vTemp, Offset ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V3 = _mm_add_ps( vTemp, Offset ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( 
vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V4 = _mm_add_ps( vTemp, Offset ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V1 = _mm_add_ps( vTemp, Offset ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = 
_mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V2 = _mm_add_ps( vTemp, Offset ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V3 = _mm_add_ps( vTemp, Offset ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V4 = _mm_add_ps( vTemp, Offset ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // 
Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale 
); + vTemp = _mm_add_ps( vTemp, Offset ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + 
FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + return XMVector3TransformCoord(Result, Transform); +} + +//------------------------------------------------------------------------------ + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR 
Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + float sx = 1.f / (ViewportWidth * 0.5f); + float sy = 1.f / (-ViewportHeight * 0.5f); + float sz = 1.f / (ViewportMaxZ - ViewportMinZ); + + float ox = (-ViewportX * sx) - 1.f; + float oy = (-ViewportY * sy) + 1.f; + float oz = (-ViewportMinZ * sz); + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + XMVECTOR ScaleX = vdupq_n_f32(sx); + XMVECTOR OffsetX = vdupq_n_f32(ox); + XMVECTOR VX = vmlaq_f32( OffsetX, ScaleX, V.val[0] ); + + float32x2_t r3 = vget_low_f32( Transform.r[3] ); + float32x2_t r = vget_low_f32( Transform.r[0] ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M + XMVECTOR vResult1 = 
vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( Transform.r[3] ); + r = vget_high_f32( Transform.r[0] ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + XMVECTOR ScaleY = vdupq_n_f32(sy); + XMVECTOR OffsetY = vdupq_n_f32(oy); + XMVECTOR VY = vmlaq_f32( OffsetY, ScaleY, V.val[1] ); + + r = vget_low_f32( Transform.r[1] ); + vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( Transform.r[1] ); + vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + XMVECTOR ScaleZ = vdupq_n_f32(sz); + XMVECTOR OffsetZ = vdupq_n_f32(oz); + XMVECTOR VZ = vmlaq_f32( OffsetZ, ScaleZ, V.val[2] ); + + r = vget_low_f32( Transform.r[2] ); + vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( Transform.r[2] ); + vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + V.val[0] = vdivq_f32( vResult0, W ); + V.val[1] = vdivq_f32( vResult1, W ); + V.val[2] = vdivq_f32( vResult2, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( 
vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + V.val[2] = vmulq_f32( vResult2, Reciprocal ); +#endif + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + float32x2_t ScaleL = vcreate_f32( + static_cast(*reinterpret_cast(&sx)) + | (static_cast(*reinterpret_cast(&sy)) << 32)); + float32x2_t ScaleH = vcreate_f32(static_cast(*reinterpret_cast(&sz))); + + float32x2_t OffsetL = vcreate_f32( + static_cast(*reinterpret_cast(&ox)) + | (static_cast(*reinterpret_cast(&oy)) << 32)); + float32x2_t OffsetH = vcreate_f32(static_cast(*reinterpret_cast(&oz))); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + VL = vmla_f32( OffsetL, VL, ScaleL ); + VH = vmla_f32( OffsetH, VH, ScaleH ); + + XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) + vResult = vdivq_f32( vResult, W ); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); +#endif + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + static const 
XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = _mm_mul_ps(Scale, Offset); + Offset = _mm_add_ps(Offset, D); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !(reinterpret_cast(pOutputStream) & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, 
_MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for 
(size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + 
vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + 
XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, 
vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + V = _mm_mul_ps( V, Scale ); + V = _mm_add_ps( V, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + +/**************************************************************************** + * + * 4D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == 
V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2]) && + (V1.vector4_f32[3] == V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2]) && + (V1.vector4_f32[3] != V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + uint32_t CR = 0; + if (iTest==0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest==0) // All not equal? 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_u32[0] == V2.vector4_u32[0] && + V1.vector4_u32[1] == V2.vector4_u32[1] && + V1.vector4_u32[2] == V2.vector4_u32[2] && + V1.vector4_u32[3] == V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_u32[0] != V2.vector4_u32[0] && + V1.vector4_u32[1] != V2.vector4_u32[1] && + V1.vector4_u32[2] != V2.vector4_u32[2] && + V1.vector4_u32[3] != V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); + uint32_t CR = 0; + if (iTest==0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest==0) // All not equal? + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +inline bool XM_CALLCONV XMVector4NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz, dw; + + dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); + dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2]) && + (dw <= Epsilon.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32( V1, V2 ); + uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + return ((_mm_movemask_ps(vTemp)==0xf) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); 
+#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( 
vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_f32[0] > V2.vector4_f32[0] && + V1.vector4_f32[1] > V2.vector4_f32[1] && + V1.vector4_f32[2] > V2.vector4_f32[2] && + V1.vector4_f32[3] > V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && + V1.vector4_f32[1] <= V2.vector4_f32[1] && + V1.vector4_f32[2] <= V2.vector4_f32[2] && + V1.vector4_f32[3] <= V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2]) && + (V1.vector4_f32[3] >= V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2]) && + (V1.vector4_f32[3] < V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0x0f) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if 
defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= 
Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1,ivTemp2); + // in bounds? + int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // All in bounds? + return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + // If any are NaN, the mask is zero + return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan)!=0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); + // Compare to infinity + vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) != 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If any are infinity, the signs are true. 
+ return (_mm_movemask_ps(vTemp) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32( V1, V2 ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + return vcombine_f32( v1, v1 ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps( V1, V2, 0xff ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), + // 
((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), + // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), + // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + (((V2.vector4_f32[2] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[2]))*V1.vector4_f32[1]) - (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1]))*V1.vector4_f32[2]) + (((V2.vector4_f32[1] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[1]))*V1.vector4_f32[3]), + (((V2.vector4_f32[3] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[3]))*V1.vector4_f32[0]) - (((V2.vector4_f32[3] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[3]))*V1.vector4_f32[2]) + (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2]))*V1.vector4_f32[3]), + (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1]))*V1.vector4_f32[0]) - (((V2.vector4_f32[0] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[0]))*V1.vector4_f32[1]) + (((V2.vector4_f32[0] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[0]))*V1.vector4_f32[3]), + (((V2.vector4_f32[2] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[2]))*V1.vector4_f32[0]) - (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2]))*V1.vector4_f32[1]) + (((V2.vector4_f32[1] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[1]))*V1.vector4_f32[2]), + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float32x2_t select = vget_low_f32( g_XMMaskX ); + + // Term1: V2zwyz * V3wzwy + const float32x2_t v2xy = vget_low_f32(V2); + const float32x2_t v2zw = vget_high_f32(V2); + const float32x2_t v2yx = vrev64_f32(v2xy); + const float32x2_t v2wz = vrev64_f32(v2zw); + const float32x2_t v2yz = vbsl_f32( select, v2yx, 
v2wz ); + + const float32x2_t v3zw = vget_high_f32(V3); + const float32x2_t v3wz = vrev64_f32(v3zw); + const float32x2_t v3xy = vget_low_f32(V3); + const float32x2_t v3wy = vbsl_f32( select, v3wz, v3xy ); + + float32x4_t vTemp1 = vcombine_f32(v2zw,v2yz); + float32x4_t vTemp2 = vcombine_f32(v3wz,v3wy); + XMVECTOR vResult = vmulq_f32( vTemp1, vTemp2 ); + + // - V2wzwy * V3zwyz + const float32x2_t v2wy = vbsl_f32( select, v2wz, v2xy ); + + const float32x2_t v3yx = vrev64_f32(v3xy); + const float32x2_t v3yz = vbsl_f32( select, v3yx, v3wz ); + + vTemp1 = vcombine_f32(v2wz,v2wy); + vTemp2 = vcombine_f32(v3zw,v3yz); + vResult = vmlsq_f32( vResult, vTemp1, vTemp2 ); + + // term1 * V1yxxx + const float32x2_t v1xy = vget_low_f32(V1); + const float32x2_t v1yx = vrev64_f32(v1xy); + + vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) ); + vResult = vmulq_f32( vResult, vTemp1 ); + + // Term2: V2ywxz * V3wxwx + const float32x2_t v2yw = vrev64_f32(v2wy); + const float32x2_t v2xz = vbsl_f32( select, v2xy, v2wz ); + + const float32x2_t v3wx = vbsl_f32( select, v3wz, v3yx ); + + vTemp1 = vcombine_f32(v2yw,v2xz); + vTemp2 = vcombine_f32(v3wx,v3wx); + float32x4_t vTerm = vmulq_f32( vTemp1, vTemp2 ); + + // - V2wxwx * V3ywxz + const float32x2_t v2wx = vbsl_f32( select, v2wz, v2yx ); + + const float32x2_t v3yw = vrev64_f32(v3wy); + const float32x2_t v3xz = vbsl_f32( select, v3xy, v3wz ); + + vTemp1 = vcombine_f32(v2wx,v2wx); + vTemp2 = vcombine_f32(v3yw,v3xz); + vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); + + // vResult - term2 * V1zzyy + const float32x2_t v1zw = vget_high_f32(V1); + + vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) ); + vResult = vmlsq_f32( vResult, vTerm, vTemp1 ); + + // Term3: V2yzxy * V3zxyx + const float32x2_t v3zx = vrev64_f32(v3xz); + + vTemp1 = vcombine_f32(v2yz,v2xy); + vTemp2 = vcombine_f32(v3zx,v3yx); + vTerm = vmulq_f32( vTemp1, vTemp2 ); + + // - V2zxyx * V3yzxy + const float32x2_t v2zx = vrev64_f32(v2xz); + + vTemp1 = 
vcombine_f32(v2zx,v2yx); + vTemp2 = vcombine_f32(v3yz,v3xy); + vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); + + // vResult + term3 * V1wwwz + const float32x2_t v1wz = vrev64_f32(v1zw); + + vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz ); + return vmlaq_f32( vResult, vTerm, vTemp1 ); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); + vResult = _mm_mul_ps(vResult,vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp2); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); + vResult = _mm_mul_ps(vResult,vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp1); + vTemp3 = _mm_sub_ps(vTemp3,vTemp2); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp1); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + vTemp3 = _mm_sub_ps(vTemp3,vTemp1); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp1); + vResult = _mm_add_ps(vResult,vTemp3); + return vResult; +#endif +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq +( + FXMVECTOR V +) +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // 
??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! + vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32( v1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = 
_mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + // Normalize + return vmulq_f32( V, vcombine_f32(v2,v2) ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult,V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); + uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + v2 = vmul_f32( S1, R1 ); + // Normalize + XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); + vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); + return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); + // Prepare for the division + XMVECTOR vResult = 
_mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq 
= _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == 
XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + 
float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex ); + + uint32x4_t vResult = vcleq_f32(R,g_XMZero); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) + { + // Total internal reflection + vResult = g_XMZero; + } + 
else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32( R, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( R, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + R = vmulq_f32( R, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32( R, RefractionIndex, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + vResult = vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32( vResult, R, Normal ); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = _mm_mul_ps(IDotN,IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex, IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { + V.vector4_f32[2], + V.vector4_f32[3], + -V.vector4_f32[0], + -V.vector4_f32[1] + }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; + + float32x4_t Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); + return vmulq_f32( Result, Negate ); +#elif 
defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = { 1.0f, 1.0f, -1.0f, -1.0f }; + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); + vResult = _mm_mul_ps(vResult,FlipZW); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); + float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); + float fZ = 
(M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); + float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); + XMVECTORF32 vResult = { fX, fY, fZ, fW }; + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y + float32x2_t VH = vget_high_f32( V ); + vResult = vmlaq_lane_f32( vResult, M.r[2], VH, 0 ); // Z + return vmlaq_lane_f32( vResult, M.r[3], VH, 1 ); // W +#elif defined(_XM_SSE_INTRINSICS_) + // Splat x,y,z and w + XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + // Mul by the matrix + vTempX = _mm_mul_ps(vTempX,M.r[0]); + vTempY = _mm_mul_ps(vTempY,M.r[1]); + vTempZ = _mm_mul_ps(vTempZ,M.r[2]); + vTempW = _mm_mul_ps(vTempW,M.r[3]); + // Add them all together + vTempX = _mm_add_ps(vTempX,vTempY); + vTempZ = _mm_add_ps(vTempZ,vTempW); + vTempX = _mm_add_ps(vTempX,vTempZ); + return vTempX; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT4)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = 
reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat4(reinterpret_cast(pInputVector)); + XMVECTOR W = XMVectorSplatW(V); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(W, row3); + Result = XMVectorMultiplyAdd(Z, row2, Result); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) +#endif +#endif + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x4_t V = vld4q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT4)*4; + + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax + XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx + + __prefetch( pInputVector ); + + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx + XMVECTOR vResult3 = vmulq_lane_f32( V.val[0], r, 1 ); // Dx + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + 
vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy + vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz + vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + r = vget_low_f32( row3 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[3], r, 0 ); // Ax+Ey+Iz+Mw + vResult1 = vmlaq_lane_f32( vResult1, V.val[3], r, 1 ); // Bx+Fy+Jz+Nw + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*6) ); + + r = vget_high_f32( row3 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[3], r, 0 ); // Cx+Gy+Kz+Ow + vResult3 = vmlaq_lane_f32( vResult3, V.val[3], r, 1 ); // Dx+Hy+Lz+Pw + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*7) ); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + V.val[3] = vResult3; + + vst4q_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += sizeof(XMFLOAT4)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = vld1q_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y + float32x2_t VH = vget_high_f32( V ); + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + vResult = vmlaq_lane_f32( vResult, row3, VH, 1 ); // W + + vst1q_f32( 
reinterpret_cast(pOutputVector), vResult ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + if ( !(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF) ) + { + if ( !(reinterpret_cast(pInputStream) & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input, aligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_load_ps( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + + vTempX = _mm_mul_ps(vTempX,row0); + vTempY = _mm_mul_ps(vTempY,row1); + vTempZ = _mm_mul_ps(vTempZ,row2); + vTempW = _mm_mul_ps(vTempW,row3); + + vTempX = _mm_add_ps(vTempX,vTempY); + vTempZ = _mm_add_ps(vTempZ,vTempW); + vTempX = _mm_add_ps(vTempX,vTempZ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTempX ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input, aligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + + vTempX = _mm_mul_ps(vTempX,row0); + vTempY = _mm_mul_ps(vTempY,row1); + vTempZ = _mm_mul_ps(vTempZ,row2); + vTempW = _mm_mul_ps(vTempW,row3); + + vTempX = _mm_add_ps(vTempX,vTempY); + vTempZ = _mm_add_ps(vTempZ,vTempW); + vTempX = 
_mm_add_ps(vTempX,vTempZ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTempX ); + pOutputVector += OutputStride; + } + } + } + else + { + if ( !(reinterpret_cast(pInputStream) & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input, unaligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_load_ps( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + + vTempX = _mm_mul_ps(vTempX,row0); + vTempY = _mm_mul_ps(vTempY,row1); + vTempZ = _mm_mul_ps(vTempZ,row2); + vTempW = _mm_mul_ps(vTempW,row3); + + vTempX = _mm_add_ps(vTempX,vTempY); + vTempZ = _mm_add_ps(vTempZ,vTempW); + vTempX = _mm_add_ps(vTempX,vTempZ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTempX ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input, unaligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + + vTempX = _mm_mul_ps(vTempX,row0); + vTempY = _mm_mul_ps(vTempY,row1); + vTempZ = _mm_mul_ps(vTempZ,row2); + vTempW = _mm_mul_ps(vTempW,row3); + + vTempX = _mm_add_ps(vTempX,vTempY); + vTempZ = _mm_add_ps(vTempZ,vTempW); + vTempX = _mm_add_ps(vTempX,vTempZ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTempX ); + pOutputVector += OutputStride; + } + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +/**************************************************************************** + * + * XMVECTOR operators + * + 
****************************************************************************/ + +#ifndef _XM_NO_XMVECTOR_OVERLOADS_ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) +{ + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) +{ + return XMVectorNegate(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator+= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorAdd(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator-= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorSubtract(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator*= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorMultiply(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorDivide(V1,V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S +) +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S +) +{ + XMVECTOR vS = XMVectorReplicate( S ); + V = XMVectorDivide(V, vS); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorAdd(V1, V2); +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorDivide(V1,V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V, + const float S +) +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V, + const float S +) +{ + XMVECTOR vS = XMVectorReplicate( S ); + return XMVectorDivide(V, vS); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + float S, + FXMVECTOR V +) +{ + return XMVectorScale(V, S); +} + +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#undef XM3UNPACK3INTO4 +#undef XM3PACK4INTO3 +#endif + diff --git a/WickedEngine/Utility/DirectXPackedVector.h b/WickedEngine/Utility/DirectXPackedVector.h new file mode 100644 index 000000000..b5158de28 --- /dev/null +++ b/WickedEngine/Utility/DirectXPackedVector.h @@ -0,0 +1,1203 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#include "DirectXMathCommon.h"
+#include "DirectXMath.h"
+
+namespace DirectX
+{
+
+namespace PackedVector
+{
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4201 4365 4324 4996)
+// C4201: nonstandard extension used
+// C4365: Off by default noise
+// C4324: alignment padding warnings
+// C4996: deprecation warnings
+#endif
+
+//------------------------------------------------------------------------------
+// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into
+// a 32 bit integer. The normalized color is packed into 32 bits using 8 bit
+// unsigned, normalized integers for the alpha, red, green, and blue components.
+// The alpha component is stored in the most significant bits and the blue
+// component in the least significant bits (A8R8G8B8):
+// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0]
+struct XMCOLOR
+{
+    union
+    {
+        struct
+        {
+            uint8_t b; // Blue: 0/255 to 255/255
+            uint8_t g; // Green: 0/255 to 255/255
+            uint8_t r; // Red: 0/255 to 255/255
+            uint8_t a; // Alpha: 0/255 to 255/255
+        };
+        uint32_t c; // same storage as b/g/r/a, read as one packed 32-bit value
+    };
+
+    XMCOLOR() = default;
+
+    XMCOLOR(const XMCOLOR&) = default;
+    XMCOLOR& operator=(const XMCOLOR&) = default;
+
+    XMCOLOR(XMCOLOR&&) = default;
+    XMCOLOR& operator=(XMCOLOR&&) = default;
+
+    XM_CONSTEXPR XMCOLOR(uint32_t Color) : c(Color) {} // not explicit: a uint32_t converts implicitly to XMCOLOR
+    XMCOLOR(float _r, float _g, float _b, float _a); // declared only; the body is not in this chunk
+    explicit XMCOLOR(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return c; } // implicit conversion yields the packed value
+
+    XMCOLOR& operator= (const uint32_t Color) { c = Color; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 16 bit floating point number consisting of a sign bit, a 5 bit biased
+// exponent, and a 10 bit mantissa
+typedef uint16_t HALF;
+
+//------------------------------------------------------------------------------
+// 2D Vector; 16 bit floating point components
+struct XMHALF2
+{
+    union
+    {
+        struct
+        {
+            HALF x;
+            HALF y;
+        };
+        uint32_t v; // same storage as x/y, read as one packed 32-bit value
+    };
+
+    XMHALF2() = default;
+
+    XMHALF2(const XMHALF2&) = default;
+    XMHALF2& operator=(const XMHALF2&) = default;
+
+    XMHALF2(XMHALF2&&) = default;
+    XMHALF2& operator=(XMHALF2&&) = default;
+
+    explicit XM_CONSTEXPR XMHALF2(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMHALF2(HALF _x, HALF _y) : x(_x), y(_y) {} // takes the two 16-bit components as they are
+    explicit XMHALF2(_In_reads_(2) const HALF *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMHALF2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMHALF2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMHALF2& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 2D Vector; 16 bit signed normalized integer components
+struct XMSHORTN2
+{
+    union
+    {
+        struct
+        {
+            int16_t x;
+            int16_t y;
+        };
+        uint32_t v; // same storage as x/y, read as one packed 32-bit value
+    };
+
+    XMSHORTN2() = default;
+
+    XMSHORTN2(const XMSHORTN2&) = default;
+    XMSHORTN2& operator=(const XMSHORTN2&) = default;
+
+    XMSHORTN2(XMSHORTN2&&) = default;
+    XMSHORTN2& operator=(XMSHORTN2&&) = default;
+
+    explicit XM_CONSTEXPR XMSHORTN2(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMSHORTN2(int16_t _x, int16_t _y) : x(_x), y(_y) {}
+    explicit XMSHORTN2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMSHORTN2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMSHORTN2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 2D Vector; 16 bit signed integer components
+struct XMSHORT2
+{
+    union
+    {
+        struct
+        {
+            int16_t x;
+            int16_t y;
+        };
+        uint32_t v; // same storage as x/y, read as one packed 32-bit value
+    };
+
+    XMSHORT2() = default;
+
+    XMSHORT2(const XMSHORT2&) = default;
+    XMSHORT2& operator=(const XMSHORT2&) = default;
+
+    XMSHORT2(XMSHORT2&&) = default;
+    XMSHORT2& operator=(XMSHORT2&&) = default;
+
+    explicit XM_CONSTEXPR XMSHORT2(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMSHORT2(int16_t _x, int16_t _y) : x(_x), y(_y) {}
+    explicit XMSHORT2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMSHORT2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMSHORT2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 2D Vector; 16 bit unsigned normalized integer components
+struct XMUSHORTN2
+{
+    union
+    {
+        struct
+        {
+            uint16_t x;
+            uint16_t y;
+        };
+        uint32_t v; // same storage as x/y, read as one packed 32-bit value
+    };
+
+    XMUSHORTN2() = default;
+
+    XMUSHORTN2(const XMUSHORTN2&) = default;
+    XMUSHORTN2& operator=(const XMUSHORTN2&) = default;
+
+    XMUSHORTN2(XMUSHORTN2&&) = default;
+    XMUSHORTN2& operator=(XMUSHORTN2&&) = default;
+
+    explicit XM_CONSTEXPR XMUSHORTN2(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMUSHORTN2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {}
+    explicit XMUSHORTN2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMUSHORTN2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMUSHORTN2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 2D Vector; 16 bit unsigned integer components
+struct XMUSHORT2
+{
+    union
+    {
+        struct
+        {
+            uint16_t x;
+            uint16_t y;
+        };
+        uint32_t v; // same storage as x/y, read as one packed 32-bit value
+    };
+
+    XMUSHORT2() = default;
+
+    XMUSHORT2(const XMUSHORT2&) = default;
+    XMUSHORT2& operator=(const XMUSHORT2&) = default;
+
+    XMUSHORT2(XMUSHORT2&&) = default;
+    XMUSHORT2& operator=(XMUSHORT2&&) = default;
+
+    explicit XM_CONSTEXPR XMUSHORT2(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMUSHORT2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {}
+    explicit XMUSHORT2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMUSHORT2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMUSHORT2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 2D Vector; 8 bit
signed normalized integer components
+struct XMBYTEN2
+{
+    union
+    {
+        struct
+        {
+            int8_t x;
+            int8_t y;
+        };
+        uint16_t v; // same storage as x/y, read as one packed 16-bit value
+    };
+
+    XMBYTEN2() = default;
+
+    XMBYTEN2(const XMBYTEN2&) = default;
+    XMBYTEN2& operator=(const XMBYTEN2&) = default;
+
+    XMBYTEN2(XMBYTEN2&&) = default;
+    XMBYTEN2& operator=(XMBYTEN2&&) = default;
+
+    explicit XM_CONSTEXPR XMBYTEN2(uint16_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMBYTEN2(int8_t _x, int8_t _y) : x(_x), y(_y) {}
+    explicit XMBYTEN2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMBYTEN2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMBYTEN2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 2D Vector; 8 bit signed integer components
+struct XMBYTE2
+{
+    union
+    {
+        struct
+        {
+            int8_t x;
+            int8_t y;
+        };
+        uint16_t v; // same storage as x/y, read as one packed 16-bit value
+    };
+
+    XMBYTE2() = default;
+
+    XMBYTE2(const XMBYTE2&) = default;
+    XMBYTE2& operator=(const XMBYTE2&) = default;
+
+    XMBYTE2(XMBYTE2&&) = default;
+    XMBYTE2& operator=(XMBYTE2&&) = default;
+
+    explicit XM_CONSTEXPR XMBYTE2(uint16_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMBYTE2(int8_t _x, int8_t _y) : x(_x), y(_y) {}
+    explicit XMBYTE2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMBYTE2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMBYTE2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 2D Vector; 8 bit unsigned normalized integer components
+struct XMUBYTEN2
+{
+    union
+    {
+        struct
+        {
+            uint8_t x;
+            uint8_t y;
+        };
+        uint16_t v; // same storage as x/y, read as one packed 16-bit value
+    };
+
+    XMUBYTEN2() = default;
+
+    XMUBYTEN2(const XMUBYTEN2&) = default;
+    XMUBYTEN2& operator=(const XMUBYTEN2&) = default;
+
+    XMUBYTEN2(XMUBYTEN2&&) = default;
+    XMUBYTEN2& operator=(XMUBYTEN2&&) = default;
+
+    explicit XM_CONSTEXPR XMUBYTEN2(uint16_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMUBYTEN2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {}
+    explicit XMUBYTEN2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMUBYTEN2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMUBYTEN2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 2D Vector; 8 bit unsigned integer components
+struct XMUBYTE2
+{
+    union
+    {
+        struct
+        {
+            uint8_t x;
+            uint8_t y;
+        };
+        uint16_t v; // same storage as x/y, read as one packed 16-bit value
+    };
+
+    XMUBYTE2() = default;
+
+    XMUBYTE2(const XMUBYTE2&) = default;
+    XMUBYTE2& operator=(const XMUBYTE2&) = default;
+
+    XMUBYTE2(XMUBYTE2&&) = default;
+    XMUBYTE2& operator=(XMUBYTE2&&) = default;
+
+    explicit XM_CONSTEXPR XMUBYTE2(uint16_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMUBYTE2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {}
+    explicit XMUBYTE2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+    XMUBYTE2(float _x, float _y); // declared only; the body is not in this chunk
+    explicit XMUBYTE2(_In_reads_(2) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 3D vector: 5/6/5 unsigned integer components
+struct XMU565
+{
+    union
+    {
+        struct
+        {
+            uint16_t x : 5; // 0 to 31
+            uint16_t y : 6; // 0 to 63
+            uint16_t z : 5; // 0 to 31
+        };
+        uint16_t v; // same storage as the bit-fields, read as one packed 16-bit value
+    };
+
+    XMU565() = default;
+
+    XMU565(const XMU565&) = default;
+    XMU565& operator=(const XMU565&) = default;
+
+    XMU565(XMU565&&) = default;
+    XMU565& operator=(XMU565&&) = default;
+
+    explicit XM_CONSTEXPR XMU565(uint16_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMU565(uint8_t _x, uint8_t _y, uint8_t _z) : x(_x), y(_y), z(_z) {} // 8-bit inputs are stored into 5/6/5-bit fields
+    explicit XMU565(_In_reads_(3) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
+    XMU565(float _x, float _y, float _z); // declared only; the body is not in this chunk
+    explicit XMU565(_In_reads_(3) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint16_t () const { return v; } // implicit conversion yields the packed value
+
+    XMU565& operator= (uint16_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 3D vector: 11/11/10 floating-point components
+// The 3D vector is
packed into 32 bits as follows: a 5-bit biased exponent
+// and 6-bit mantissa for x component, a 5-bit biased exponent and
+// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit
+// mantissa for z. The z component is stored in the most significant bits
+// and the x component in the least significant bits. No sign bits so
+// all partial-precision numbers are positive.
+// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0]
+struct XMFLOAT3PK
+{
+    union
+    {
+        struct
+        {
+            uint32_t xm : 6; // x-mantissa
+            uint32_t xe : 5; // x-exponent
+            uint32_t ym : 6; // y-mantissa
+            uint32_t ye : 5; // y-exponent
+            uint32_t zm : 5; // z-mantissa
+            uint32_t ze : 5; // z-exponent
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMFLOAT3PK() = default;
+
+    XMFLOAT3PK(const XMFLOAT3PK&) = default;
+    XMFLOAT3PK& operator=(const XMFLOAT3PK&) = default;
+
+    XMFLOAT3PK(XMFLOAT3PK&&) = default;
+    XMFLOAT3PK& operator=(XMFLOAT3PK&&) = default;
+
+    explicit XM_CONSTEXPR XMFLOAT3PK(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMFLOAT3PK(float _x, float _y, float _z); // declared only; the body is not in this chunk
+    explicit XMFLOAT3PK(_In_reads_(3) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMFLOAT3PK& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent
+// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent
+// with 9-bit mantissa for the x, y, and z component. The shared exponent
+// is stored in the most significant bits and the x component mantissa is in
+// the least significant bits. No sign bits so all partial-precision numbers
+// are positive.
+// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0]
+struct XMFLOAT3SE
+{
+    union
+    {
+        struct
+        {
+            uint32_t xm : 9; // x-mantissa
+            uint32_t ym : 9; // y-mantissa
+            uint32_t zm : 9; // z-mantissa
+            uint32_t e : 5; // shared exponent
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMFLOAT3SE() = default;
+
+    XMFLOAT3SE(const XMFLOAT3SE&) = default;
+    XMFLOAT3SE& operator=(const XMFLOAT3SE&) = default;
+
+    XMFLOAT3SE(XMFLOAT3SE&&) = default;
+    XMFLOAT3SE& operator=(XMFLOAT3SE&&) = default;
+
+    explicit XM_CONSTEXPR XMFLOAT3SE(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMFLOAT3SE(float _x, float _y, float _z); // declared only; the body is not in this chunk
+    explicit XMFLOAT3SE(_In_reads_(3) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMFLOAT3SE& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 4D Vector; 16 bit floating point components
+struct XMHALF4
+{
+    union
+    {
+        struct
+        {
+            HALF x;
+            HALF y;
+            HALF z;
+            HALF w;
+        };
+        uint64_t v; // same storage as x/y/z/w, read as one packed 64-bit value
+    };
+
+    XMHALF4() = default;
+
+    XMHALF4(const XMHALF4&) = default;
+    XMHALF4& operator=(const XMHALF4&) = default;
+
+    XMHALF4(XMHALF4&&) = default;
+    XMHALF4& operator=(XMHALF4&&) = default;
+
+    explicit XM_CONSTEXPR XMHALF4(uint64_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XMHALF4(_In_reads_(4) const HALF *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMHALF4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMHALF4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMHALF4& operator= (uint64_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 4D Vector; 16 bit signed normalized integer components
+struct XMSHORTN4
+{
+    union
+    {
+        struct
+        {
+            int16_t x;
+            int16_t y;
+            int16_t z;
+            int16_t w;
+        };
+        uint64_t v; // same storage as x/y/z/w, read as one packed 64-bit value
+    };
+
+    XMSHORTN4() = default;
+
+    XMSHORTN4(const XMSHORTN4&) = default;
+    XMSHORTN4& operator=(const XMSHORTN4&) = default;
+
+    XMSHORTN4(XMSHORTN4&&) = default;
+    XMSHORTN4& operator=(XMSHORTN4&&) = default;
+
+    explicit XM_CONSTEXPR XMSHORTN4(uint64_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XMSHORTN4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMSHORTN4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMSHORTN4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 16 bit signed integer components
+struct XMSHORT4
+{
+    union
+    {
+        struct
+        {
+            int16_t x;
+            int16_t y;
+            int16_t z;
+            int16_t w;
+        };
+        uint64_t v; // same storage as x/y/z/w, read as one packed 64-bit value
+    };
+
+    XMSHORT4() = default;
+
+    XMSHORT4(const XMSHORT4&) = default;
+    XMSHORT4& operator=(const XMSHORT4&) = default;
+
+    XMSHORT4(XMSHORT4&&) = default;
+    XMSHORT4& operator=(XMSHORT4&&) = default;
+
+    explicit XM_CONSTEXPR XMSHORT4(uint64_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XMSHORT4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMSHORT4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMSHORT4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 16 bit unsigned normalized integer components
+struct XMUSHORTN4
+{
+    union
+    {
+        struct
+        {
+            uint16_t x;
+            uint16_t y;
+            uint16_t z;
+            uint16_t w;
+        };
+        uint64_t v; // same storage as x/y/z/w, read as one packed 64-bit value
+    };
+
+    XMUSHORTN4() = default;
+
+    XMUSHORTN4(const XMUSHORTN4&) = default;
+    XMUSHORTN4& operator=(const XMUSHORTN4&) = default;
+
+    XMUSHORTN4(XMUSHORTN4&&) = default;
+    XMUSHORTN4& operator=(XMUSHORTN4&&) = default;
+
+    explicit XM_CONSTEXPR XMUSHORTN4(uint64_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XMUSHORTN4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMUSHORTN4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMUSHORTN4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 16 bit unsigned integer components
+struct XMUSHORT4
+{
+    union
+    {
+        struct
+        {
+            uint16_t x;
+            uint16_t y;
+            uint16_t z;
+            uint16_t w;
+        };
+        uint64_t v; // same storage as x/y/z/w, read as one packed 64-bit value
+    };
+
+    XMUSHORT4() = default;
+
+    XMUSHORT4(const XMUSHORT4&) = default;
+    XMUSHORT4& operator=(const XMUSHORT4&) = default;
+
+    XMUSHORT4(XMUSHORT4&&) = default;
+    XMUSHORT4& operator=(XMUSHORT4&&) = default;
+
+    explicit XM_CONSTEXPR XMUSHORT4(uint64_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XMUSHORT4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMUSHORT4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMUSHORT4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } // 64-bit parameter, matching v and the packed constructor; a 32-bit parameter could never set the upper half of v
+};
+
+//------------------------------------------------------------------------------
+// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
+// normalized integer for the w component and 10 bit signed, normalized
+// integers for the z, y, and x components.
The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+struct XMXDECN4
+{
+    union
+    {
+        struct
+        {
+            int32_t x : 10; // -511/511 to 511/511
+            int32_t y : 10; // -511/511 to 511/511
+            int32_t z : 10; // -511/511 to 511/511
+            uint32_t w : 2; // 0/3 to 3/3
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMXDECN4() = default;
+
+    XMXDECN4(const XMXDECN4&) = default;
+    XMXDECN4& operator=(const XMXDECN4&) = default;
+
+    XMXDECN4(XMXDECN4&&) = default;
+    XMXDECN4& operator=(XMXDECN4&&) = default;
+
+    explicit XM_CONSTEXPR XMXDECN4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMXDECN4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMXDECN4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMXDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
+// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned
+// integer for the w component and 10 bit signed integers for the
+// z, y, and x components.
The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+struct XM_DEPRECATED XMXDEC4
+{
+    union
+    {
+        struct
+        {
+            int32_t x : 10; // -511 to 511
+            int32_t y : 10; // -511 to 511
+            int32_t z : 10; // -511 to 511
+            uint32_t w : 2; // 0 to 3
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMXDEC4() = default;
+
+    XMXDEC4(const XMXDEC4&) = default;
+    XMXDEC4& operator=(const XMXDEC4&) = default;
+
+    XMXDEC4(XMXDEC4&&) = default;
+    XMXDEC4& operator=(XMXDEC4&&) = default;
+
+    explicit XM_CONSTEXPR XMXDEC4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMXDEC4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMXDEC4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMXDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed,
+// normalized integer for the w component and 10 bit signed, normalized
+// integers for the z, y, and x components.
The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+struct XM_DEPRECATED XMDECN4
+{
+    union
+    {
+        struct
+        {
+            int32_t x : 10; // -511/511 to 511/511
+            int32_t y : 10; // -511/511 to 511/511
+            int32_t z : 10; // -511/511 to 511/511
+            int32_t w : 2; // -1/1 to 1/1
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMDECN4() = default;
+
+    XMDECN4(const XMDECN4&) = default;
+    XMDECN4& operator=(const XMDECN4&) = default;
+
+    XMDECN4(XMDECN4&&) = default;
+    XMDECN4& operator=(XMDECN4&&) = default;
+
+    explicit XM_CONSTEXPR XMDECN4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMDECN4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMDECN4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
+// The 4D Vector is packed into 32 bits as follows: a 2 bit signed,
+// integer for the w component and 10 bit signed integers for the
+// z, y, and x components.
The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+struct XM_DEPRECATED XMDEC4
+{
+    union
+    {
+        struct
+        {
+            int32_t x : 10; // -511 to 511
+            int32_t y : 10; // -511 to 511
+            int32_t z : 10; // -511 to 511
+            int32_t w : 2; // -1 to 1
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMDEC4() = default;
+
+    XMDEC4(const XMDEC4&) = default;
+    XMDEC4& operator=(const XMDEC4&) = default;
+
+    XMDEC4(XMDEC4&&) = default;
+    XMDEC4& operator=(XMDEC4&&) = default;
+
+    explicit XM_CONSTEXPR XMDEC4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMDEC4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMDEC4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
+// normalized integer for the w component and 10 bit unsigned, normalized
+// integers for the z, y, and x components.
The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+struct XMUDECN4
+{
+    union
+    {
+        struct
+        {
+            uint32_t x : 10; // 0/1023 to 1023/1023
+            uint32_t y : 10; // 0/1023 to 1023/1023
+            uint32_t z : 10; // 0/1023 to 1023/1023
+            uint32_t w : 2; // 0/3 to 3/3
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMUDECN4() = default;
+
+    XMUDECN4(const XMUDECN4&) = default;
+    XMUDECN4& operator=(const XMUDECN4&) = default;
+
+    XMUDECN4(XMUDECN4&&) = default;
+    XMUDECN4& operator=(XMUDECN4&&) = default;
+
+    explicit XM_CONSTEXPR XMUDECN4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMUDECN4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMUDECN4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMUDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
+// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
+// integer for the w component and 10 bit unsigned integers
+// for the z, y, and x components.
The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+struct XMUDEC4
+{
+    union
+    {
+        struct
+        {
+            uint32_t x : 10; // 0 to 1023
+            uint32_t y : 10; // 0 to 1023
+            uint32_t z : 10; // 0 to 1023
+            uint32_t w : 2; // 0 to 3
+        };
+        uint32_t v; // same storage as the bit-fields, read as one packed 32-bit value
+    };
+
+    XMUDEC4() = default;
+
+    XMUDEC4(const XMUDEC4&) = default;
+    XMUDEC4& operator=(const XMUDEC4&) = default;
+
+    XMUDEC4(XMUDEC4&&) = default;
+    XMUDEC4& operator=(XMUDEC4&&) = default;
+
+    explicit XM_CONSTEXPR XMUDEC4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    XMUDEC4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMUDEC4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint32_t () const { return v; } // implicit conversion yields the packed value
+
+    XMUDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 4D Vector; 8 bit signed normalized integer components
+struct XMBYTEN4
+{
+    union
+    {
+        struct
+        {
+            int8_t x;
+            int8_t y;
+            int8_t z;
+            int8_t w;
+        };
+        uint32_t v; // same storage as x/y/z/w, read as one packed 32-bit value
+    };
+
+    XMBYTEN4() = default;
+
+    XMBYTEN4(const XMBYTEN4&) = default;
+    XMBYTEN4& operator=(const XMBYTEN4&) = default;
+
+    XMBYTEN4(XMBYTEN4&&) = default;
+    XMBYTEN4& operator=(XMBYTEN4&&) = default;
+
+    XM_CONSTEXPR XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XM_CONSTEXPR XMBYTEN4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    explicit XMBYTEN4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMBYTEN4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMBYTEN4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 8 bit signed integer components
+struct XMBYTE4
+{
+    union
+    {
+        struct
+        {
+            int8_t x;
+            int8_t y;
+            int8_t z;
+            int8_t w;
+        };
+        uint32_t v; // same storage as x/y/z/w, read as one packed 32-bit value
+    };
+
+    XMBYTE4() = default;
+
+    XMBYTE4(const XMBYTE4&) = default;
+    XMBYTE4& operator=(const XMBYTE4&) = default;
+
+    XMBYTE4(XMBYTE4&&) = default;
+    XMBYTE4& operator=(XMBYTE4&&) = default;
+
+    XM_CONSTEXPR XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XM_CONSTEXPR XMBYTE4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    explicit XMBYTE4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMBYTE4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMBYTE4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 8 bit unsigned normalized integer components
+struct XMUBYTEN4
+{
+    union
+    {
+        struct
+        {
+            uint8_t x;
+            uint8_t y;
+            uint8_t z;
+            uint8_t w;
+        };
+        uint32_t v; // same storage as x/y/z/w, read as one packed 32-bit value
+    };
+
+    XMUBYTEN4() = default;
+
+    XMUBYTEN4(const XMUBYTEN4&) = default;
+    XMUBYTEN4& operator=(const XMUBYTEN4&) = default;
+
+    XMUBYTEN4(XMUBYTEN4&&) = default;
+    XMUBYTEN4& operator=(XMUBYTEN4&&) = default;
+
+    XM_CONSTEXPR XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XM_CONSTEXPR XMUBYTEN4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    explicit XMUBYTEN4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMUBYTEN4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMUBYTEN4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+// 4D Vector; 8 bit unsigned integer components
+struct XMUBYTE4
+{
+    union
+    {
+        struct
+        {
+            uint8_t x;
+            uint8_t y;
+            uint8_t z;
+            uint8_t w;
+        };
+        uint32_t v; // same storage as x/y/z/w, read as one packed 32-bit value
+    };
+
+    XMUBYTE4() = default;
+
+    XMUBYTE4(const XMUBYTE4&) = default;
+    XMUBYTE4& operator=(const XMUBYTE4&) = default;
+
+    XMUBYTE4(XMUBYTE4&&) = default;
+    XMUBYTE4& operator=(XMUBYTE4&&) = default;
+
+    XM_CONSTEXPR XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {}
+    explicit XM_CONSTEXPR XMUBYTE4(uint32_t Packed) : v(Packed) {} // takes an already packed value
+    explicit XMUBYTE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMUBYTE4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMUBYTE4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    XMUBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 4D vector; 4 bit unsigned integer components
+struct XMUNIBBLE4
+{
+    union
+    {
+        struct
+        {
+            uint16_t x : 4; // 0 to 15
+            uint16_t y : 4; // 0 to 15
+            uint16_t z : 4; // 0 to 15
+            uint16_t w : 4; // 0 to 15
+        };
+        uint16_t v; // same storage as the bit-fields, read as one packed 16-bit value
+    };
+
+    XMUNIBBLE4() = default;
+
+    XMUNIBBLE4(const XMUNIBBLE4&) = default;
+    XMUNIBBLE4& operator=(const XMUNIBBLE4&) = default;
+
+    XMUNIBBLE4(XMUNIBBLE4&&) = default;
+    XMUNIBBLE4& operator=(XMUNIBBLE4&&) = default;
+
+    explicit XM_CONSTEXPR XMUNIBBLE4(uint16_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} // 8-bit inputs are stored into 4-bit fields
+    explicit XMUNIBBLE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
+    XMUNIBBLE4(float _x, float _y, float _z, float _w); // declared only; the body is not in this chunk
+    explicit XMUNIBBLE4(_In_reads_(4) const float *pArray); // declared only; the body is not in this chunk
+
+    operator uint16_t () const { return v; } // implicit conversion yields the packed value
+
+    XMUNIBBLE4& operator= (uint16_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+//------------------------------------------------------------------------------
+// 4D vector: 5/5/5/1 unsigned integer components
+struct XMU555
+{
+    union
+    {
+        struct
+        {
+            uint16_t x : 5; // 0 to 31
+            uint16_t y : 5; // 0 to 31
+            uint16_t z : 5; // 0 to 31
+            uint16_t w : 1; // 0 or 1
+        };
+        uint16_t v; // same storage as the bit-fields, read as one packed 16-bit value
+    };
+
+    XMU555() = default;
+
+    XMU555(const XMU555&) = default;
+    XMU555& operator=(const XMU555&) = default;
+
+    XMU555(XMU555&&) = default;
+    XMU555& operator=(XMU555&&) = default;
+
+    explicit XM_CONSTEXPR XMU555(uint16_t Packed) : v(Packed) {} // takes an already packed value
+    XM_CONSTEXPR XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} // the bool selects the single w bit
+    XMU555(_In_reads_(3) const uint8_t *pArray, _In_ bool _w) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 0x1 : 0) {} // only x/y/z come from the array; w is passed separately
+    XMU555(float _x, float _y, float _z, bool _w); // declared only; the body is not in this chunk
+    XMU555(_In_reads_(3) const float *pArray, _In_ bool _w); // declared only; the body is not in this chunk
+
+    operator uint16_t () const { return v; } // implicit conversion yields the packed value
+
+    XMU555& operator= (uint16_t Packed) { v = Packed; return *this; } // replaces the whole packed value
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+
+/****************************************************************************
+ *
+ * Data conversion operations
+ *
+ ****************************************************************************/
+
+float XMConvertHalfToFloat(HALF Value);
+float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream,
+                                  _In_ size_t OutputStride,
+                                  _In_reads_bytes_(sizeof(HALF)+InputStride*(HalfCount-1)) const HALF* pInputStream,
+                                  _In_ size_t InputStride, _In_ size_t HalfCount); // per the annotations, both strides are byte counts
+HALF XMConvertFloatToHalf(float Value);
+HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream,
+                                 _In_ size_t OutputStride,
+                                 _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream,
+                                 _In_ size_t InputStride, _In_ size_t FloatCount); // per the annotations, both strides are byte counts
+
+/****************************************************************************
+ *
+ * Load operations
+ *
+ ****************************************************************************/
+
+XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource); // declarations only from here on; no bodies appear in this chunk
+
+XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource);
+XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource);
+XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource);
+XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource);
+XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource);
+XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource);
+XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource);
+XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource);
+XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource);
+
+XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource);
+XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource);
+XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource);
+
+XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource);
+XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource);
+XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource);
+XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource);
+XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource);
+XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource);
+XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource);
+XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource);
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+#endif
+
+XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource); // loaders for the struct types marked XM_DEPRECATED
+XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource);
+XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ const XMXDEC4* pSource);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+/****************************************************************************
+ *
+ * Store operations
+ *
+ ****************************************************************************/
+
+void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V); // declarations only; no bodies appear in this chunk
+
+void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V);
+
+void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V);
+
+void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V);
+void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V); + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V); +void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V); +void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) +// C4068/4616: ignore unknown pragmas +// C4214/4204: nonstandard extension used +// C4365: Off by default noise +// C6001/6101: False positives + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif +#endif + +#include "DirectXPackedVector.inl" + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +#pragma warning(pop) +#endif + +} // namespace PackedVector + +} // namespace DirectX + diff --git a/WickedEngine/Utility/DirectXPackedVector.inl b/WickedEngine/Utility/DirectXPackedVector.inl new file mode 100644 index 000000000..9c2bc7834 --- /dev/null +++ b/WickedEngine/Utility/DirectXPackedVector.inl @@ -0,0 +1,4621 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline float XMConvertHalfToFloat +( + HALF Value +) +{ +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_) + uint16x4_t vHalf = vdup_n_u16(Value); + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + return vgetq_lane_f32(vFloat, 0); +#else + auto Mantissa = static_cast(Value & 0x03FF); + + uint32_t Exponent = (Value & 0x7C00); + if ( Exponent == 0x7C00 ) // INF/NAN + { + Exponent = 0x8f; + } + else if (Exponent != 0) // The value is normalized + { + Exponent = static_cast((static_cast(Value) >> 10) & 0x1F); + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x0400) == 0); + + Mantissa &= 0x03FF; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + uint32_t Result = + ((static_cast(Value) & 0x8000) << 16) // Sign + | ((Exponent + 112) << 23) // Exponent + | (Mantissa << 13); // Mantissa + + return reinterpret_cast(&Result)[0]; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif +#endif + 
+_Use_decl_annotations_ +inline float* XMConvertHalfToFloatStream +( + float* pOutputStream, + size_t OutputStride, + const HALF* pInputStream, + size_t InputStride, + size_t HalfCount +) +{ + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(HALF)); + _Analysis_assume_(InputStride >= sizeof(HALF)); + + assert(OutputStride >= sizeof(float)); + _Analysis_assume_(OutputStride >= sizeof(float)); + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ( (reinterpret_cast(pFloat) & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + XM_STREAM_PS( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_store_ss( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 2 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if 
( (reinterpret_cast(pFloat) & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + XM_STREAM_PS( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + 
_mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_) + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16x4_t vHalf = vld1_u16(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_f32(reinterpret_cast(pFloat), vFloat); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16x4_t vHalf = vld1_u16(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += 
InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48); + uint16x4_t vHalf = vcreate_u16(iHalf); + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_f32(reinterpret_cast(pFloat), vFloat); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48); + uint16x4_t vHalf = vcreate_u16(iHalf); + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +#else + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < HalfCount; i++) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + 
+//------------------------------------------------------------------------------ + +inline HALF XMConvertFloatToHalf +( + float Value +) +{ +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_) + float32x4_t vFloat = vdupq_n_f32(Value); + float16x4_t vHalf = vcvt_f16_f32(vFloat); + return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0); +#else + uint32_t Result; + + auto IValue = reinterpret_cast(&Value)[0]; + uint32_t Sign = (IValue & 0x80000000U) >> 16U; + IValue = IValue & 0x7FFFFFFFU; // Hack off the sign + + if (IValue > 0x477FE000U) + { + // The number is too large to be represented as a half. Saturate to infinity. + if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF ) != 0)) + { + Result = 0x7FFF; // NAN + } + else + { + Result = 0x7C00U; // INF + } + } + else if (!IValue) + { + Result = 0; + } + else + { + if (IValue < 0x38800000U) + { + // The number is too small to be represented as a normalized half. + // Convert it to a denormalized value. + uint32_t Shift = 113U - (IValue >> 23U); + IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized half. 
+ IValue += 0xC8000000U; + } + + Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; + } + return static_cast(Result|Sign); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline HALF* XMConvertFloatToHalfStream +( + HALF* pOutputStream, + size_t OutputStride, + const float* pInputStream, + size_t InputStride, + size_t FloatCount +) +{ + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(float)); + _Analysis_assume_(InputStride >= sizeof(float)); + + assert(OutputStride >= sizeof(HALF)); + _Analysis_assume_(OutputStride >= sizeof(HALF)); + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ( (reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + else + { + if ( (reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = 
static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); + __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); + FV = _mm_blend_ps( FV, FT, 0xC ); + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; 
+ + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_) + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride*4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride*4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += 
OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#else + auto pFloat = 
reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < FloatCount; i++) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadColor +( + const XMCOLOR* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + // int32_t -> Float conversions are done in one instruction. + // uint32_t -> Float calls a runtime function. Keep in int32_t + auto iColor = static_cast(pSource->c); + XMVECTORF32 vColor = { + static_cast((iColor >> 16) & 0xFF) * (1.0f / 255.0f), + static_cast((iColor >> 8) & 0xFF) * (1.0f / 255.0f), + static_cast(iColor & 0xFF) * (1.0f / 255.0f), + static_cast((iColor >> 24) & 0xFF) * (1.0f / 255.0f) + }; + return vColor.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t bgra = pSource->c; + uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); + uint32x2_t vInt8 = vdup_n_u32(rgba); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32( R, 1.0f/255.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(static_cast(pSource->c)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); + // a is unsigned! 
Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadHalf2 +( + const XMHALF2* pSource +) +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +#else + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShortN2 +( + const XMSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : (static_cast(pSource->x) * (1.0f / 32767.0f)), + (pSource->y == -32768) ? 
-1.f : (static_cast(pSource->y) * (1.0f / 32767.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32( R, 1.0f/32767.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Convert -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShort2 +( + const XMSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = 
_mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp,g_XMFixupY16); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN2 +( + const XMUSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_u32(vInt); + R = vmulq_n_f32( R, 1.0f/65535.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = { 1.0f / 65535.0f, 1.0f / (65535.0f*65536.0f), 0.0f, 0.0f }; + static const XMVECTORF32 FixaddY16 = { 0, 32768.0f*65536.0f, 0, 0 }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y + 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,FixupY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort2 +( + const XMUSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = { 0, 32768.0f, 0, 0 }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,FixaddY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByteN2 +( + const XMBYTEN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -128) ? -1.f : (static_cast(pSource->x) * (1.0f / 127.0f)), + (pSource->y == -128) ? 
-1.f : (static_cast(pSource->y) * (1.0f / 127.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16( vInt16 ) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32( R, 1.0f/127.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.0f / 127.0f, 1.0f / (127.0f*256.0f), 0, 0 }; + static const XMVECTORU32 Mask = { 0xFF, 0xFF00, 0, 0 }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,Scale); + // Clamp result (for case of -128) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByte2 +( + const XMBYTE2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 
Scale = { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f*256.0f) }; + static const XMVECTORU32 Mask = { 0xFF, 0xFF00, 0, 0 }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN2 +( + const XMUBYTEN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x) * (1.0f / 255.0f), + static_cast(pSource->y) * (1.0f / 255.0f), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32( R, 1.0f/255.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.0f / 255.0f, 1.0f / (255.0f*256.0f), 0, 0 }; + static const XMVECTORU32 Mask = { 0xFF, 0xFF00, 0, 0 }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByte2 +( + const XMUBYTE2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.0f, 1.0f / 256.0f, 0, 0 }; + static const XMVECTORU32 Mask = { 0xFF, 0xFF00, 0, 0 }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadU565 +( + const XMU565* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x3F), + float((pSource->v >> 11) & 0x1F), + 0.f, + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U565And = { 0x1F, 0x3F << 5, 0x1F << 11, 0 }; + static const XMVECTORF32 U565Mul = { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 }; + uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vInt16 ); + vInt = vandq_u32(vInt,U565And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R,U565Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U565And = { 0x1F, 0x3F << 5, 0x1F << 11, 0 }; + static const XMVECTORF32 U565Mul = { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 }; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U565Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3PK +( + const XMFLOAT3PK* pSource +) +{ + assert(pSource); + + alignas(16) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent; + + // X Channel (6-bit 
mantissa) + Mantissa = pSource->xm; + + if ( pSource->xe == 0x1f ) // INF or NAN + { + Result[0] = static_cast(0x7f800000 | (static_cast(pSource->xm) << 17)); + } + else + { + if ( pSource->xe != 0 ) // The value is normalized + { + Exponent = pSource->xe; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if ( pSource->ye == 0x1f ) // INF or NAN + { + Result[1] = static_cast(0x7f800000 | (static_cast(pSource->ym) << 17)); + } + else + { + if ( pSource->ye != 0 ) // The value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if ( pSource->ze == 0x1f ) // INF or NAN + { + Result[2] = static_cast(0x7f800000 | (static_cast(pSource->zm) << 17)); + } + else + { + if ( pSource->ze != 0 ) // The value is normalized + { + Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A( reinterpret_cast(&Result) ); +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3SE +( + const XMFLOAT3SE* pSource +) +{ + assert(pSource); + + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (pSource->e << 23); + float Scale = fi.f; + + XMVECTORF32 v = { + Scale * float(pSource->xm), + Scale * float(pSource->ym), + Scale * float(pSource->zm), + 1.0f }; + return v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadHalf4 +( + const XMHALF4* pSource +) +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +#else + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShortN4 +( + const XMSHORTN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : (static_cast(pSource->x) * (1.0f / 32767.0f)), + (pSource->y == -32768) ? -1.f : (static_cast(pSource->y) * (1.0f / 32767.0f)), + (pSource->z == -32768) ? -1.f : (static_cast(pSource->z) * (1.0f / 32767.0f)), + (pSource->w == -32768) ? 
-1.f : (static_cast(pSource->w) * (1.0f / 32767.0f)) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16( vInt ); + V = vcvtq_f32_s32( V ); + V = vmulq_n_f32( V, 1.0f/32767.0f ); + return vmaxq_f32( V, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); + // Convert to -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShort4 +( + const XMSHORT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16( vInt ); + return vcvtq_f32_s32( V ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN4 +( + const XMUSHORTN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + static_cast(pSource->z) / 65535.0f, + static_cast(pSource->w) / 65535.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16( vInt ); + V = vcvtq_f32_u32( V ); + return vmulq_n_f32( V, 1.0f/65535.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16W16 = { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f*65536.0f), 1.0f / (65535.0f*65536.0f) }; + static const XMVECTORF32 FixaddY16W16 = { 0, 0, 32768.0f*65536.0f, 32768.0f*65536.0f }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,FixaddY16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,FixupY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort4 +( + const XMUSHORT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16( vInt ); + return vcvtq_f32_u32( V ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16W16 = { 0, 0, 32768.0f, 32768.0f }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,FixaddY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDecN4 +( + const XMXDECN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + static_cast(pSource->v >> 30) / 3.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); + vInt = vandq_u32(vInt,g_XMMaskA2B10G10R10); + vInt = veorq_u32(vInt,g_XMFlipA2B10G10R10); + float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); + R = vaddq_f32(R,g_XMFixAA2B10G10R10); + R = vmulq_f32(R,g_XMNormalizeA2B10G10R10); + return vmaxq_f32( R, vdupq_n_f32(-1.0f) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10); + // Clamp result (for case of -512) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDec4 +( + const XMXDEC4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(pSource->v >> 30) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 }; + static const XMVECTORF32 XDec4Add = { -512.0f, -512.0f*1024.0f, -512.0f*1024.0f*1024.0f, 32768 * 65536.0f }; + uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); + vInt = vandq_u32(vInt,g_XMMaskDec4); + vInt = veorq_u32(vInt,XDec4Xor); + float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); + R = vaddq_f32(R ,XDec4Add); + return vmulq_f32(R,g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 }; + static const XMVECTORF32 XDec4Add = { -512.0f, -512.0f*1024.0f, -512.0f*1024.0f*1024.0f, 32768 * 65536.0f }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = 
_mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,XDec4Xor); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,XDec4Add); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#endif +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDecN4 +( + const XMUDECN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + static_cast(ElementX) / 1023.0f, + static_cast(ElementY) / 1023.0f, + static_cast(ElementZ) / 1023.0f, + static_cast(pSource->v >> 30) / 3.0f + }; + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 UDecN4Mul = { 1.0f / 1023.0f, 1.0f / (1023.0f*1024.0f), 1.0f / (1023.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) }; + uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); + vInt = vandq_u32(vInt,g_XMMaskDec4); + float32x4_t R = vcvtq_f32_u32( vInt ); + return vmulq_f32(R,UDecN4Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 UDecN4Mul = { 1.0f / 1023.0f, 1.0f / (1023.0f*1024.0f), 1.0f / (1023.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! 
Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,UDecN4Mul); + return vTemp; +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR +( + const XMUDECN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + int32_t ElementX = pSource->v & 0x3FF; + int32_t ElementY = (pSource->v >> 10) & 0x3FF; + int32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + static_cast(ElementX - 0x180) / 510.0f, + static_cast(ElementY - 0x180) / 510.0f, + static_cast(ElementZ - 0x180) / 510.0f, + static_cast(pSource->v >> 30) / 3.0f + }; + + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XRMul = { 1.0f / 510.0f, 1.0f / (510.0f*1024.0f), 1.0f / (510.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) }; + static const XMVECTORI32 XRBias = { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 }; + uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); + vInt = vandq_u32(vInt,g_XMMaskDec4); + int32x4_t vTemp = vsubq_s32( vreinterpretq_s32_u32(vInt), XRBias ); + vTemp = veorq_u32( vTemp, g_XMFlipW ); + float32x4_t R = vcvtq_f32_s32( vTemp ); + R = vaddq_f32(R,g_XMAddUDec4); + return vmulq_f32(R,XRMul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XRMul = { 1.0f / 510.0f, 1.0f / (510.0f*1024.0f), 1.0f / (510.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) }; + static const XMVECTORI32 XRBias = { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask channels + vTemp = 
_mm_and_ps(vTemp,g_XMMaskDec4); + // Subtract bias + vTemp = _mm_castsi128_ps( _mm_sub_epi32( _mm_castps_si128(vTemp), XRBias ) ); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Convert to 0.0f-1.0f + return _mm_mul_ps(vTemp,XRMul); +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDec4 +( + const XMUDEC4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + static_cast(ElementX), + static_cast(ElementY), + static_cast(ElementZ), + static_cast(pSource->v >> 30) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); + vInt = vandq_u32(vInt,g_XMMaskDec4); + float32x4_t R = vcvtq_f32_u32( vInt ); + return vmulq_f32(R,g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDecN4 +( + const XMDECN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + (ElementW == 0x2) ? 
-1.f : static_cast(static_cast(ElementW | SignExtendW[(ElementW >> 1) & 1])) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { 1.0f / 511.0f, 1.0f / (511.0f*1024.0f), 1.0f / (511.0f*1024.0f*1024.0f), 1.0f / (1024.0f*1024.0f*1024.0f) }; + uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); + vInt = vandq_u32(vInt,g_XMMaskDec4); + vInt = veorq_u32(vInt,g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); + R = vaddq_f32(R,g_XMAddDec4); + R = vmulq_f32(R,DecN4Mul); + return vmaxq_f32( R, vdupq_n_f32(-1.0f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { 1.0f / 511.0f, 1.0f / (511.0f*1024.0f), 1.0f / (511.0f*1024.0f*1024.0f), 1.0f / (1024.0f*1024.0f*1024.0f) }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,DecN4Mul); + // Clamp result (for case of -512/-1) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDec4 +( + const XMDEC4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(static_cast(ElementW | SignExtendW[ElementW >> 1])) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); + vInt = vandq_u32(vInt,g_XMMaskDec4); + vInt = veorq_u32(vInt,g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); + R = vaddq_f32(R,g_XMAddDec4); + return vmulq_f32(R,g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#endif +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN4 +( + const XMUBYTEN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x) / 255.0f, + static_cast(pSource->y) / 255.0f, + static_cast(pSource->z) / 255.0f, + static_cast(pSource->w) / 255.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32( R, 1.0f/255.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByteN4Mul = { 1.0f / 255.0f, 1.0f / (255.0f*256.0f), 1.0f / (255.0f*65536.0f), 1.0f / (255.0f*65536.0f*256.0f) }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByte4 +( + const XMUBYTE4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByte4Mul = { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f*256.0f) }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByteN4 +( + const XMBYTEN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -128) ? -1.f : (static_cast(pSource->x) / 127.0f), + (pSource->y == -128) ? -1.f : (static_cast(pSource->y) / 127.0f), + (pSource->z == -128) ? -1.f : (static_cast(pSource->z) / 127.0f), + (pSource->w == -128) ? -1.f : (static_cast(pSource->w) / 127.0f) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) ); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32( R, 1.0f/127.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByteN4Mul = { 1.0f / 127.0f, 1.0f / (127.0f*256.0f), 1.0f / (127.0f*65536.0f), 1.0f / (127.0f*65536.0f*256.0f) }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul); + // Clamp result (for case of -128) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByte4 +( + const XMBYTE4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) ); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByte4Mul = { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f*256.0f) }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadByte4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUNibble4 +( + const XMUNIBBLE4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + float(pSource->v & 0xF), + float((pSource->v >> 4) & 0xF), + float((pSource->v >> 8) & 0xF), + float((pSource->v >> 12) & 0xF) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 UNibble4And = { 0xF, 0xF0, 0xF00, 0xF000 }; + static const XMVECTORF32 UNibble4Mul = { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f }; + uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vInt16 ); + vInt = vandq_u32(vInt,UNibble4And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R,UNibble4Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 UNibble4And = { 0xF, 0xF0, 0xF00, 0xF000 }; + static const XMVECTORF32 UNibble4Mul = { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f }; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,UNibble4And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,UNibble4Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadU555 +( + const XMU555* pSource +) +{ + assert(pSource); +#if 
defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x1F), + float((pSource->v >> 10) & 0x1F), + float((pSource->v >> 15) & 0x1) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U555And = { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 }; + static const XMVECTORF32 U555Mul = { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f }; + uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vInt16 ); + vInt = vandq_u32(vInt,U555And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R,U555Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U555And = { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 }; + static const XMVECTORF32 U555Mul = { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f }; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,U555And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U555Mul); + return vResult; +#endif +} + +#ifdef _MSC_VER +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#endif + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreColor +( + XMCOLOR* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->c = (static_cast(tmp.w) << 24) | + (static_cast(tmp.x) << 16) | + (static_cast(tmp.y) << 8) | + static_cast(tmp.z); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, 
vdupq_n_f32(0) ); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32( R, 255.0f ); + R = XMVectorRound(R); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32( vInt32 ); + uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); + uint32_t rgba = vget_lane_u32( vreinterpret_u32_u8(vInt8), 0 ); + pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Set>1 to 1 + vResult = _mm_min_ps(vResult,g_XMOne); + // Convert to 0-255 + vResult = _mm_mul_ps(vResult,g_UByteMax); + // Shuffle RGBA to ARGB + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + // Convert to int + __m128i vInt = _mm_cvtps_epi32(vResult); + // Mash to shorts + vInt = _mm_packs_epi32(vInt,vInt); + // Mash to bytes + vInt = _mm_packus_epi16(vInt,vInt); + // Store the color + _mm_store_ss(reinterpret_cast(&pDestination->c),_mm_castsi128_ps(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreHalf2 +( + XMHALF2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); +#else + pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V)); + pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V)); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShortN2 +( + XMSHORTN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + 
pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) ); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32( R, 32767.0f ); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32( vInt32 ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,g_ShortMax); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti,vResulti); + _mm_store_ss(reinterpret_cast(&pDestination->x),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShort2 +( + XMSHORT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f) ); + R = vminq_f32(R, vdupq_n_f32(32767.0f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32( vInt32 ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_ShortMin); + vResult = _mm_min_ps(vResult,g_ShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt,vInt); + _mm_store_ss(reinterpret_cast(&pDestination->x),_mm_castsi128_ps(vInt)); +#endif +} + +//------------------------------------------------------------------------------ 
+_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShortN2 +( + XMUSHORTN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) ); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32( R, 65535.0f ); + R = vaddq_f32( R, g_XMOneHalf ); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32( vInt32 ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,g_UShortMax); + vResult = _mm_add_ps(vResult,g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShort2 +( + XMUSHORT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) ); + R = vminq_f32(R, vdupq_n_f32(65535.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = 
vqmovn_u32( vInt32 ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_UShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByteN2 +( + XMBYTEN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) ); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32( R, 127.0f ); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32( vInt32 ); + int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); + vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_s8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,g_ByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt,0)); + auto y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->v = 
static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByte2 +( + XMBYTE2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) ); + R = vminq_f32(R, vdupq_n_f32(127.0f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32( vInt32 ); + int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); + vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_s8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_ByteMin); + vResult = _mm_min_ps(vResult,g_ByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt,0)); + auto y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByteN2 +( + XMUBYTEN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = 
vmaxq_f32(V, vdupq_n_f32(0.f) ); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32( R, 255.0f ); + R = vaddq_f32( R, g_XMOneHalf ); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32( vInt32 ); + uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); + vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_u8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,g_UByteMax); + vResult = _mm_add_ps(vResult,g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt,0)); + auto y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByte2 +( + XMUBYTE2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) ); + R = vminq_f32(R, vdupq_n_f32(255.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32( vInt32 ); + uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); + vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_u8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_UByteMax); + // Convert to int by 
rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt,0)); + auto y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreU565 +( + XMU565* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 Max = { 31.0f, 63.0f, 31.0f, 0.0f }; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->v = static_cast( + ((static_cast(tmp.z) & 0x1F) << 11) + | ((static_cast(tmp.y) & 0x3F) << 5) + | ((static_cast(tmp.x) & 0x1F))); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.0f, 32.f, 32.f*64.f, 0.f }; + static const XMVECTORU32 Mask = { 0x1F, 0x3F << 5, 0x1F << 11, 0 }; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); + vResult = vminq_f32(vResult,Max); + vResult = vmulq_f32(vResult,Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vhi = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vhi ); + vTemp = vpadd_u32( vTemp, vTemp ); + vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt,0)); + auto y = static_cast(_mm_extract_epi16(vInt,2)); + 
auto z = static_cast(_mm_extract_epi16(vInt,4)); + pDestination->v = static_cast( + ((static_cast(z) & 0x1F) << 11) + | ((static_cast(y) & 0x3F) << 5) + | ((static_cast(x) & 0x1F))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3PK +( + XMFLOAT3PK* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + alignas(16) uint32_t IValue[4]; + XMStoreFloat3A( reinterpret_cast(&IValue), V ); + + uint32_t Result[3]; + + // X & Y Channels (5-bit exponent, 6-bit mantissa) + for(uint32_t j=0; j < 2; ++j) + { + uint32_t Sign = IValue[j] & 0x80000000; + uint32_t I = IValue[j] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[j] = 0x7c0; + if (( I & 0x7FFFFF ) != 0) + { + Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f); + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3PK is positive only + Result[j] = 0; + } + } + else if ( Sign ) + { + // 3PK is positive only, so clamp to zero + Result[j] = 0; + } + else if (I > 0x477E0000U) + { + // The number is too large to be represented as a float11, set to max + Result[j] = 0x7BF; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float11 + // Convert it to a denormalized value. 
+ uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float11 + I += 0xC8000000U; + } + + Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU; + } + } + + // Z Channel (5-bit exponent, 5-bit mantissa) + uint32_t Sign = IValue[2] & 0x80000000; + uint32_t I = IValue[2] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[2] = 0x3e0; + if ( I & 0x7FFFFF ) + { + Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f); + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3PK is positive only + Result[2] = 0; + } + } + else if ( Sign ) + { + // 3PK is positive only, so clamp to zero + Result[2] = 0; + } + else if (I > 0x477C0000U) + { + // The number is too large to be represented as a float10, set to max + Result[2] = 0x3df; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float10 + // Convert it to a denormalized value. + uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float10 + I += 0xC8000000U; + } + + Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU; + } + + // Pack Result into memory + pDestination->v = (Result[0] & 0x7ff) + | ( (Result[1] & 0x7ff) << 11 ) + | ( (Result[2] & 0x3ff) << 22 ); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3SE +( + XMFLOAT3SE* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + XMFLOAT3A tmp; + XMStoreFloat3A( &tmp, V ); + + static const float maxf9 = float(0x1FF << 7); + static const float minf9 = float(1.f / (1 << 16)); + + float x = (tmp.x >= 0.f) ? ( (tmp.x > maxf9) ? maxf9 : tmp.x ) : 0.f; + float y = (tmp.y >= 0.f) ? ( (tmp.y > maxf9) ? maxf9 : tmp.y ) : 0.f; + float z = (tmp.z >= 0.f) ? 
( (tmp.z > maxf9) ? maxf9 : tmp.z ) : 0.f; + + const float max_xy = (x > y) ? x : y; + const float max_xyz = (max_xy > z) ? max_xy : z; + + const float maxColor = (max_xyz > minf9) ? max_xyz : minf9; + + union { float f; int32_t i; } fi; + fi.f = maxColor; + fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1) + + auto exp = static_cast(fi.i) >> 23; + pDestination->e = exp - 0x6f; + + fi.i = static_cast(0x83000000 - (exp << 23)); + float ScaleR = fi.f; + + pDestination->xm = static_cast( Internal::round_to_nearest(x * ScaleR) ); + pDestination->ym = static_cast( Internal::round_to_nearest(y * ScaleR) ); + pDestination->zm = static_cast( Internal::round_to_nearest(z * ScaleR) ); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreHalf4 +( + XMHALF4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); +#else + XMFLOAT4A t; + XMStoreFloat4A(&t, V ); + + pDestination->x = XMConvertFloatToHalf(t.x); + pDestination->y = XMConvertFloatToHalf(t.y); + pDestination->z = XMConvertFloatToHalf(t.z); + pDestination->w = XMConvertFloatToHalf(t.w); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShortN4 +( + XMSHORTN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(-1.f) ); + vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) ); + vResult = vmulq_n_f32( vResult, 32767.0f ); + vResult = vcvtq_s32_f32( vResult ); + int16x4_t vInt = vmovn_s32( vResult ); + vst1_s16( reinterpret_cast(pDestination), vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,g_ShortMax); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti,vResulti); + _mm_store_sd(reinterpret_cast(&pDestination->x),_mm_castsi128_pd(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShort4 +( + XMSHORT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32( V, g_ShortMin ); + vResult = vminq_f32( vResult, g_ShortMax ); + vResult = vcvtq_s32_f32( vResult ); + int16x4_t vInt = vmovn_s32( vResult ); + vst1_s16( reinterpret_cast(pDestination), vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_ShortMin); + vResult = _mm_min_ps(vResult,g_ShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt,vInt); + _mm_store_sd(reinterpret_cast(&pDestination->x),_mm_castsi128_pd(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void 
XM_CALLCONV XMStoreUShortN4 +( + XMUSHORTN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) ); + vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) ); + vResult = vmulq_n_f32( vResult, 65535.0f ); + vResult = vaddq_f32( vResult, g_XMOneHalf ); + vResult = vcvtq_u32_f32( vResult ); + uint16x4_t vInt = vmovn_u32( vResult ); + vst1_u16( reinterpret_cast(pDestination), vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,g_UShortMax); + vResult = _mm_add_ps(vResult,g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->z = static_cast(_mm_extract_epi16(vInt,4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt,6)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShort4 +( + XMUSHORT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z 
= static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) ); + vResult = vminq_f32( vResult, g_UShortMax ); + vResult = vcvtq_u32_f32( vResult ); + uint16x4_t vInt = vmovn_u32( vResult ); + vst1_u16( reinterpret_cast(pDestination), vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_UShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->z = static_cast(_mm_extract_epi16(vInt,4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt,6)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreXDecN4 +( + XMXDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 Min = { -1.0f, -1.0f, -1.0f, 0.0f }; + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { 511.0f, 511.0f, 511.0f, 3.0f }; + + XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | (static_cast(tmp.x) & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { 511.0f, 511.0f*1024.0f, 511.0f*1048576.0f, 3.0f*536870912.0f }; + static const XMVECTORI32 ScaleMask = { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 }; + float32x4_t vResult = vmaxq_f32(V,Min); + vResult = vminq_f32(vResult,vdupq_n_f32(1.0f)); + vResult = 
vmulq_f32(vResult,Scale); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,ScaleMask); + int32x4_t vResultw = vandq_s32(vResulti,g_XMMaskW); + vResulti = vaddq_s32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vhi ); + vTemp = vpadd_u32( vTemp, vTemp ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { 511.0f, 511.0f*1024.0f, 511.0f*1048576.0f, 3.0f*536870912.0f }; + static const XMVECTORI32 ScaleMask = { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 }; + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int (W is unsigned) + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,ScaleMask); + // To fix W, add itself to shift it up to <<30 instead of <<29 + __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); + vResulti = _mm_add_epi32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreXDec4 +( + XMXDEC4* 
pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 MinXDec4 = { -511.0f, -511.0f, -511.0f, 0.0f }; + static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, MinXDec4, MaxXDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleXDec4 = { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f }; + static const XMVECTORI32 MaskXDec4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + float32x4_t vResult = vmaxq_f32(V,MinXDec4); + vResult = vminq_f32(vResult,MaxXDec4); + vResult = vmulq_f32(vResult,ScaleXDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,MaskXDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vTemp2 = vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_s32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleXDec4 = { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f }; + static const XMVECTORI32 MaskXDec4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinXDec4); + vResult = _mm_min_ps(vResult,MaxXDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleXDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskXDec4); + // 
Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDecN4 +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { 1023.0f, 1023.0f, 1023.0f, 3.0f }; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = { 1023.0f, 1023.0f*1024.0f*0.5f, 1023.0f*1024.0f*1024.0f, 3.0f*1024.0f*1024.0f*1024.0f*0.5f }; + static const XMVECTORI32 MaskUDecN4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult,ScaleUDecN4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_u32( vTemp2, vTemp2 
); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = { 1023.0f, 1023.0f*1024.0f*0.5f, 1023.0f*1024.0f*1024.0f, 3.0f*1024.0f*1024.0f*1024.0f*0.5f }; + static const XMVECTORI32 MaskUDecN4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDecN4_XR +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; + static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; + static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorMultiplyAdd( V, Scale, Bias ); + N = XMVectorClamp( N, g_XMZero, C ); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Shift = { 1.0f, 1024.0f*0.5f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f*0.5f }; + static const XMVECTORU32 MaskUDecN4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + float32x4_t vResult = vmlaq_f32( Bias, V, Scale ); + vResult = vmaxq_f32(vResult,vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult,C); + vResult = vmulq_f32(vResult,Shift); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_u32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Shift = { 1.0f, 1024.0f*0.5f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f*0.5f }; + static const XMVECTORU32 MaskUDecN4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + // Scale & bias + XMVECTOR vResult = _mm_mul_ps( V, Scale ); + vResult = _mm_add_ps( vResult, Bias ); + // Clamp to bounds + vResult = _mm_max_ps(vResult,g_XMZero); + vResult = _mm_min_ps(vResult,C); + // Scale by shift values + vResult = _mm_mul_ps(vResult,Shift); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + 
_mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDec4 +( + XMUDEC4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), MaxUDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDec4 = { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f }; + static const XMVECTORI32 MaskUDec4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult,MaxUDec4); + vResult = vmulq_f32(vResult,ScaleUDec4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,MaskUDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_u32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDec4 = { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f }; + static const XMVECTORI32 MaskUDec4 = { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDec4); + 
// Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreDecN4 +( + XMDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { 511.0f, 511.0f, 511.0f, 1.0f }; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleDecN4 = { 511.0f, 511.0f*1024.0f, 511.0f*1024.0f*1024.0f, 1.0f*1024.0f*1024.0f*1024.0f }; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(-1.f)); + vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult,ScaleDecN4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,g_XMMaskDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vhi = 
vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vhi ); + vTemp = vpadd_u32( vTemp, vTemp ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleDecN4 = { 511.0f, 511.0f*1024.0f, 511.0f*1024.0f*1024.0f, 1.0f*1024.0f*1024.0f*1024.0f }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,g_XMMaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreDec4 +( + XMDEC4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 MinDec4 = { -511.0f, -511.0f, -511.0f, -1.0f }; + static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, MinDec4, MaxDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleDec4 = { 1.0f, 1024.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f }; + float32x4_t vResult = vmaxq_f32(V,MinDec4); + vResult = vminq_f32(vResult,MaxDec4); + 
vResult = vmulq_f32(vResult,ScaleDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,g_XMMaskDec4); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vhi ); + vTemp = vpadd_u32( vTemp, vTemp ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleDec4 = { 1.0f, 1024.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinDec4); + vResult = _mm_min_ps(vResult,MaxDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,g_XMMaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByteN4 +( + XMUBYTEN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, g_UByteMax); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = 
vmaxq_f32(V, vdupq_n_f32(0) ); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32( R, 255.0f ); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32( vInt32 ); + uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUByteN4 = { 255.0f, 255.0f*256.0f*0.5f, 255.0f*256.0f*256.0f, 255.0f*256.0f*256.0f*256.0f*0.5f }; + static const XMVECTORI32 MaskUByteN4 = { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByte4 +( + XMUBYTE4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) ); + R = vminq_f32(R, vdupq_n_f32(255.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32( vInt32 ); + uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUByte4 = { 1.0f, 256.0f*0.5f, 256.0f*256.0f, 256.0f*256.0f*256.0f*0.5f }; + static const XMVECTORI32 MaskUByte4 = { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_UByteMax); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUByte4); + // Convert to int by rounding + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByteN4 +( + XMBYTEN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(V, g_ByteMax); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + 
pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) ); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32( R, 127.0f ); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32( vInt32 ); + int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleByteN4 = { 127.0f, 127.0f*256.0f, 127.0f*256.0f*256.0f, 127.0f*256.0f*256.0f*256.0f }; + static const XMVECTORI32 MaskByteN4 = { 0xFF, 0xFF << 8, 0xFF << 16, static_cast(0xFF000000) }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByte4 +( + XMBYTE4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) ); + R = vminq_f32(R, vdupq_n_f32(127.f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32( vInt32 ); + int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); + vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleByte4 = { 1.0f, 256.0f, 256.0f*256.0f, 256.0f*256.0f*256.0f }; + static const XMVECTORI32 MaskByte4 = { 0xFF, 0xFF << 8, 0xFF << 16, static_cast(0xFF000000) }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_ByteMin); + vResult = _mm_min_ps(vResult,g_ByteMax); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleByte4); + // Convert to int by rounding + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUNibble4 +( + XMUNIBBLE4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 Max = { 15.0f, 15.0f, 15.0f, 15.0f }; +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + ((static_cast(tmp.w) & 0xF) << 12) + | ((static_cast(tmp.z) & 0xF) << 8) + | ((static_cast(tmp.y) & 0xF) << 4) + | (static_cast(tmp.x) & 0xF)); +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.0f, 16.f, 16.f*16.f, 16.f*16.f*16.f }; + static const XMVECTORU32 Mask = { 0xF, 0xF << 4, 0xF << 8, 0xF << 12 }; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); + vResult = vminq_f32(vResult,Max); + vResult = vmulq_f32(vResult,Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vhi = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vhi ); + vTemp = vpadd_u32( vTemp, vTemp ); + vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt,0)); + auto y = static_cast(_mm_extract_epi16(vInt,2)); + auto z = static_cast(_mm_extract_epi16(vInt,4)); + auto w = static_cast(_mm_extract_epi16(vInt,6)); + pDestination->v = static_cast( + ((static_cast(w) & 0xF) << 12) + | ((static_cast(z) & 0xF) << 8) + | ((static_cast(y) & 0xF) << 4) + | ((static_cast(x) & 0xF))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreU555 +( + XMU555* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + static const XMVECTORF32 Max = { 31.0f, 31.0f, 31.0f, 1.0f }; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = static_cast( + ((tmp.w > 0.f) ? 
0x8000 : 0) + | ((static_cast(tmp.z) & 0x1F) << 10) + | ((static_cast(tmp.y) & 0x1F) << 5) + | (static_cast(tmp.x) & 0x1F)); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { 1.0f, 32.f / 2.f, 32.f*32.f, 32.f*32.f*32.f / 2.f }; + static const XMVECTORU32 Mask = { 0x1F, 0x1F << (5 - 1), 0x1F << 10, 0x1 << (15 - 1) }; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); + vResult = vminq_f32(vResult,Max); + vResult = vmulq_f32(vResult,Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_s32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt,0)); + auto y = static_cast(_mm_extract_epi16(vInt,2)); + auto z = static_cast(_mm_extract_epi16(vInt,4)); + auto w = static_cast(_mm_extract_epi16(vInt,6)); + pDestination->v = static_cast( + (static_cast(w) ? 
0x8000 : 0) + | ((static_cast(z) & 0x1F) << 10) + | ((static_cast(y) & 0x1F) << 5) + | ((static_cast(x) & 0x1F))); +#endif +} + + +/**************************************************************************** + * + * XMCOLOR operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMCOLOR::XMCOLOR +( + float _r, + float _g, + float _b, + float _a +) +{ + XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMCOLOR::XMCOLOR +( + const float* pArray +) +{ + XMStoreColor(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMHALF2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMHALF2::XMHALF2 +( + float _x, + float _y +) +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMHALF2::XMHALF2 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + x = XMConvertFloatToHalf(pArray[0]); + y = XMConvertFloatToHalf(pArray[1]); +} + +/**************************************************************************** + * + * XMSHORTN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMSHORTN2::XMSHORTN2 +( + float _x, + float _y +) +{ + XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORTN2::XMSHORTN2 +( + const 
float* pArray +) +{ + XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMSHORT2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMSHORT2::XMSHORT2 +( + float _x, + float _y +) +{ + XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORT2::XMSHORT2 +( + const float* pArray +) +{ + XMStoreShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUSHORTN2::XMUSHORTN2 +( + float _x, + float _y +) +{ + XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORTN2::XMUSHORTN2 +( + const float* pArray +) +{ + XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUSHORT2::XMUSHORT2 +( + float _x, + float _y +) +{ + XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORT2::XMUSHORT2 +( + const float* pArray +) +{ + XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + 
+/**************************************************************************** + * + * XMBYTEN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMBYTEN2::XMBYTEN2 +( + float _x, + float _y +) +{ + XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTEN2::XMBYTEN2 +( + const float* pArray +) +{ + XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTE2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMBYTE2::XMBYTE2 +( + float _x, + float _y +) +{ + XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTE2::XMBYTE2 +( + const float* pArray +) +{ + XMStoreByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUBYTEN2::XMUBYTEN2 +( + float _x, + float _y +) +{ + XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTEN2::XMUBYTEN2 +( + const float* pArray +) +{ + XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE2 operators + * + 
****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUBYTE2::XMUBYTE2 +( + float _x, + float _y +) +{ + XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTE2::XMUBYTE2 +( + const float* pArray +) +{ + XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMU565 operators + * + ****************************************************************************/ + +inline XMU565::XMU565 +( + float _x, + float _y, + float _z +) +{ + XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +_Use_decl_annotations_ +inline XMU565::XMU565 +( + const float *pArray +) +{ + XMStoreU565(this, XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3PK operators + * + ****************************************************************************/ + +inline XMFLOAT3PK::XMFLOAT3PK +( + float _x, + float _y, + float _z +) +{ + XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +_Use_decl_annotations_ +inline XMFLOAT3PK::XMFLOAT3PK +( + const float *pArray +) +{ + XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3SE operators + * + ****************************************************************************/ + +inline XMFLOAT3SE::XMFLOAT3SE +( + float _x, + float _y, + float _z +) +{ + XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +_Use_decl_annotations_ +inline XMFLOAT3SE::XMFLOAT3SE +( + const float *pArray +) +{ + XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast(pArray))); +} + 
+/**************************************************************************** + * + * XMHALF4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMHALF4::XMHALF4 +( + float _x, + float _y, + float _z, + float _w +) +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); + z = XMConvertFloatToHalf(_z); + w = XMConvertFloatToHalf(_w); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMHALF4::XMHALF4 +( + const float* pArray +) +{ + XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4); +} + +/**************************************************************************** + * + * XMSHORTN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMSHORTN4::XMSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORTN4::XMSHORTN4 +( + const float* pArray +) +{ + XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMSHORT4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMSHORT4::XMSHORT4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORT4::XMSHORT4 +( + const float* pArray +) +{ + XMStoreShort4(this, 
XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUSHORTN4::XMUSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORTN4::XMUSHORTN4 +( + const float* pArray +) +{ + XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUSHORT4::XMUSHORT4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORT4::XMUSHORT4 +( + const float* pArray +) +{ + XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMXDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMXDECN4::XMXDECN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMXDECN4::XMXDECN4 +( + const float* pArray +) +{ + XMStoreXDecN4(this, 
XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMXDEC4 operators + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +//------------------------------------------------------------------------------ + +inline XMXDEC4::XMXDEC4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMXDEC4::XMXDEC4 +( + const float* pArray +) +{ + XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMDECN4::XMDECN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMDECN4::XMDECN4 +( + const float* pArray +) +{ + XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMDEC4::XMDEC4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMDEC4::XMDEC4 +( + const 
float* pArray +) +{ + XMStoreDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * XMUDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUDECN4::XMUDECN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUDECN4::XMUDECN4 +( + const float* pArray +) +{ + XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUDEC4::XMUDEC4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUDEC4::XMUDEC4 +( + const float* pArray +) +{ + XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTEN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMBYTEN4::XMBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTEN4::XMBYTEN4 +( + const float* pArray 
+) +{ + XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTE4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMBYTE4::XMBYTE4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTE4::XMBYTE4 +( + const float* pArray +) +{ + XMStoreByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUBYTEN4::XMUBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTEN4::XMUBYTEN4 +( + const float* pArray +) +{ + XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUBYTE4::XMUBYTE4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTE4::XMUBYTE4 +( + const float* pArray +) +{ + XMStoreUByte4(this, 
XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUNIBBLE4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMUNIBBLE4::XMUNIBBLE4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w )); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUNIBBLE4::XMUNIBBLE4 +( + const float *pArray +) +{ + XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMU555 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMU555::XMU555 +( + float _x, + float _y, + float _z, + bool _w +) +{ + XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) )); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMU555::XMU555 +( + const float *pArray, + bool _w +) +{ + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pArray)); + XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) )); +} + diff --git a/WickedEngine/Utility/sal.h b/WickedEngine/Utility/sal.h new file mode 100644 index 000000000..29dc49bf5 --- /dev/null +++ b/WickedEngine/Utility/sal.h @@ -0,0 +1,2961 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +/*** +*sal.h - markers for documenting the semantics of APIs +* + +* +*Purpose: +* sal.h provides a set of annotations to describe how a function uses its +* parameters - the assumptions it makes about them, and the guarantees it makes +* upon finishing. +****/ +#pragma once +#ifndef __ATTR_SAL + +/*========================================================================== + + The comments in this file are intended to give basic understanding of + the usage of SAL, the Microsoft Source Code Annotation Language. + For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134 + + The macros are defined in 3 layers, plus the structural set: + + _In_/_Out_/_Ret_ Layer: + ---------------------- + This layer provides the highest abstraction and its macros should be used + in most cases. These macros typically start with: + _In_ : input parameter to a function, unmodified by called function + _Out_ : output parameter, written to by called function, pointed-to + location not expected to be initialized prior to call + _Outptr_ : like _Out_ when returned variable is a pointer type + (so param is pointer-to-pointer type). Called function + provides/allocated space. + _Outref_ : like _Outptr_, except param is reference-to-pointer type. + _Inout_ : inout parameter, read from and potentially modified by + called function. + _Ret_ : for return values + _Field_ : class/struct field invariants + For common usage, this class of SAL provides the most concise annotations. + Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used + with a parameter target. Using them with _At_ to specify non-parameter + targets may yield unexpected results. 
+ + This layer also includes a number of other properties that can be specified + to extend the ability of code analysis, most notably: + -- Designating parameters as format strings for printf/scanf/scanf_s + -- Requesting stricter type checking for C enum parameters + + _Pre_/_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_/_Out_ layer. Its macros start with _Pre_ or _Post_. + This layer provides the most flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + Structural Layer: + ---------------- + These annotations, like _At_ and _When_, are used with annotations from + any of the other layers as modifiers, indicating exactly when and where + the annotations apply. + + + Common syntactic conventions: + ---------------------------- + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the parameter can be NULL as a precondition to the function, the + annotation contains _opt. If the macro does not contain '_opt' the + parameter cannot be NULL. + + If an out/inout parameter returns a null pointer as a postcondition, this is + indicated by _Ret_maybenull_ or _result_maybenull_. If the macro is not + of this form, then the result will not be NULL as a postcondition. 
+ _Outptr_ - output value is not NULL + _Outptr_result_maybenull_ - output value might be NULL + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + ------------- + Buffer sizes are expressed as element counts, unless the macro explicitly + contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in + which case the second is used to indicate how much of the buffer is valid + as a postcondition. This table outlines the precondition buffer allocation + size, precondition number of valid elements, postcondition allocation size, + and postcondition number of valid elements for representative buffer size + annotations: + Pre | Pre | Post | Post + alloc | valid | alloc | valid + Annotation elems | elems | elems | elems + ---------- ------------------------------------ + _In_reads_(s) s | s | s | s + _Inout_updates_(s) s | s | s | s + _Inout_updates_to_(s,c) s | s | s | c + _Out_writes_(s) s | 0 | s | s + _Out_writes_to_(s,c) s | 0 | s | c + _Outptr_result_buffer_(s) ? | ? | s | s + _Outptr_result_buffer_to_(s,c) ? | ? | s | c + + For the _Outptr_ annotations, the buffer in question is at one level of + dereference. The called function is responsible for supplying the buffer. + + Success and failure: + ------------------- + The SAL concept of success allows functions to define expressions that can + be tested by the caller, which if it evaluates to non-zero, indicates the + function succeeded, which means that its postconditions are guaranteed to + hold. Otherwise, if the expression evaluates to zero, the function is + considered to have failed, and the postconditions are not guaranteed. 
+ + The success criteria can be specified with the _Success_(expr) annotation: + _Success_(return != FALSE) BOOL + PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned, + and FALSE indicates failure. In common practice, callers check for zero + vs. non-zero returns, so it is preferable to express the success + criteria in terms of zero/non-zero, not checked for exactly TRUE. + + Functions can specify that some postconditions will still hold, even when + the function fails, using _On_failure_(anno-list), or postconditions that + hold regardless of success or failure using _Always_(anno-list). + + The annotation _Return_type_success_(expr) may be used with a typedef to + give a default _Success_ criteria to all functions returning that type. + This is the case for common Windows API status types, including + HRESULT and NTSTATUS. This may be overridden on a per-function basis by + specifying a _Success_ annotation locally.
+ +============================================================================*/ + +#define __ATTR_SAL + +#ifndef _SAL_VERSION /*IFSTRIP=IGN*/ +#define _SAL_VERSION 20 +#endif + +#ifdef _PREFAST_ // [ + +// choose attribute or __declspec implementation +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#elif !defined(_USE_ATTRIBUTES_FOR_SAL) // ][ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] +#endif // ] + + +#if !_USE_DECLSPECS_FOR_SAL // [ +#if !_USE_ATTRIBUTES_FOR_SAL // [ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] +#endif // ] +#endif // ] + +#else + +// Disable expansion of SAL macros in non-Prefast mode to +// improve compiler throughput. 
+#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#ifndef _USE_ATTRIBUTES_FOR_SAL // [ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#endif // ] + +// safeguard for MIDL and RC builds +#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL + +// Special enum type for Y/N/M +enum __SAL_YesNo {_SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default}; + +#endif + +#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/ +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_) +#else +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_) +#endif + +//============================================================================ +// Structural SAL: +// These 
annotations modify the use of other annotations. They may +// express the annotation target (i.e. what parameter/field the annotation +// applies to) or the condition under which the annotation is applicable. +//============================================================================ + +// _At_(target, annos) specifies that the annotations listed in 'annos' is to +// be applied to 'target' rather than to the identifier which is the current +// lexical target. +#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_) + +// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that +// target names a buffer, and each annotation in annos is applied to each +// element of target up to bound, with the variable named in iter usable +// by the annotations to refer to relevant offsets within target. +#define _At_buffer_(target, iter, bound, annos) _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_) + +// _When_(expr, annos) specifies that the annotations listed in 'annos' only +// apply when 'expr' evaluates to non-zero. +#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_) +#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_) +#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_) + +// indicates whether normal post conditions apply to a function +#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr)) + +// indicates whether post conditions apply to a function returning +// the type that this annotation is applied to +#define _Return_type_success_(expr) _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr)) + +// Establish postconditions that apply only if the function does not succeed +#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_) + +// Establish postconditions that apply in both success and failure cases. +// Only applicable with functions that have _Success_ or _Return_type_succss_. 
+#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_) + +// Usable on a function defintion. Asserts that a function declaration is +// in scope, and its annotations are to be used. There are no other annotations +// allowed on the function definition. +#define _Use_decl_annotations_ _Use_decl_anno_impl_ + +// _Notref_ may precede a _Deref_ or "real" annotation, and removes one +// level of dereference if the parameter is a C++ reference (&). If the +// net deref on a "real" annotation is negative, it is simply discarded. +#define _Notref_ _Notref_impl_ + +// Annotations for defensive programming styles. +#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive) +#define _Post_defensive_ _SA_annotes0(SAL_post_defensive) + +#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes) +#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes) +#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes) + +//============================================================================ +// _In_\_Out_ Layer: +//============================================================================ + +// Reserved pointer parameters, must always be NULL. +#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl)) + +// _Const_ allows specification that any namable memory location is considered +// readonly for a given call. +#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref)) + + +// Input parameters -------------------------- + +// _In_ - Annotations for parameters where data is passed into the function, but not modified. +// _In_ by itself can be used with non-pointer types (although it is redundant). + +// e.g. 
void SetPoint( _In_ const POINT* pPT ); +#define _In_ _SAL2_Source_(_In_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref)) +#define _In_opt_ _SAL2_Source_(_In_opt_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_) + +// nullterminated 'in' parameters. +// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl)) +#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl)) + + +// 'input' buffers with given size + +#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_) +#define _In_reads_opt_(size) _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_(size) _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_opt_(size) _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_) +#define _In_reads_opt_z_(size) _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_) +#define _In_reads_or_z_(size) _SAL2_Source_(_In_reads_or_z_, (size), _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) +#define _In_reads_or_z_opt_(size) _SAL2_Source_(_In_reads_or_z_opt_, (size), _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) + + +// 'input' buffers valid to the given end pointer + +#define _In_reads_to_ptr_(ptr) _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_opt_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), 
_Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_) +#define _In_reads_to_ptr_opt_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_) + + + +// Output parameters -------------------------- + +// _Out_ - Annotations for pointer or reference parameters where data is passed back to the caller. +// These are mostly used where the pointer/reference is to a non-pointer type. +// _Outptr_/_Outref_ (see below) are typically used to return pointers via parameters. + +// e.g. void GetPoint( _Out_ POINT* pPT ); +#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_) +#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_) + +#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_) +#define _Out_writes_opt_(size) _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_(size) _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_opt_(size) _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_z_(size) _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_writes_opt_z_(size) _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) + +#define _Out_writes_to_(size,count) _SAL2_Source_(_Out_writes_to_, (size,count), _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_to_opt_(size,count) _SAL2_Source_(_Out_writes_to_opt_, (size,count), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_all_(size) _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_all_opt_(size) _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size),
_Old_(size))) + +#define _Out_writes_bytes_to_(size,count) _SAL2_Source_(_Out_writes_bytes_to_, (size,count), _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_to_opt_(size,count) _SAL2_Source_(_Out_writes_bytes_to_opt_, (size,count), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_all_(size) _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_bytes_all_opt_(size) _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size))) + +#define _Out_writes_to_ptr_(ptr) _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_opt_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ _Post_z_) +#define _Out_writes_to_ptr_opt_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ _Post_z_) + + +// Inout parameters ---------------------------- + +// _Inout_ - Annotations for pointer or reference parameters where data is passed in and +// potentially modified.
+// void ModifyPoint( _Inout_ POINT* pPT ); +// void ModifyPointByRef( _Inout_ POINT& pPT ); + +#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_) +#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_) + +// For modifying string buffers +// void toupper( _Inout_z_ char* sz ); +#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_) +#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_) + +// For modifying buffers with explicit element size +#define _Inout_updates_(size) _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_opt_(size) _SAL2_Source_(_Inout_updates_opt_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_z_(size) _SAL2_Source_(_Inout_updates_z_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) +#define _Inout_updates_opt_z_(size) _SAL2_Source_(_Inout_updates_opt_z_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) + +#define _Inout_updates_to_(size,count) _SAL2_Source_(_Inout_updates_to_, (size,count), _Out_writes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) +#define _Inout_updates_to_opt_(size,count) _SAL2_Source_(_Inout_updates_to_opt_, (size,count), _Out_writes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) + +#define _Inout_updates_all_(size) _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_all_opt_(size) _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size))) + +// For modifying buffers with explicit byte size +#define _Inout_updates_bytes_(size) _SAL2_Source_(_Inout_updates_bytes_, (size), _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_bytes_opt_(size) 
_SAL2_Source_(_Inout_updates_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) + +#define _Inout_updates_bytes_to_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_, (size,count), _Out_writes_bytes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) +#define _Inout_updates_bytes_to_opt_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size,count), _Out_writes_bytes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) + +#define _Inout_updates_bytes_all_(size) _SAL2_Source_(_Inout_updates_bytes_all_, (size), _Inout_updates_bytes_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_bytes_all_opt_(size) _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size))) + + +// Pointer to pointer parameters ------------------------- + +// _Outptr_ - Annotations for output params returning pointers +// These describe parameters where the called function provides the buffer: +// HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz); +// The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates +// and initializes memory and returns the pointer to the new LPWSTR in *ppwsz. +// +// _Outptr_opt_ - describes parameters that are allowed to be NULL. +// _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL to the caller. +// +// Example: +// void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2); +// Callers: +// MyFunc(NULL, NULL); // error: parameter 2, ppData2, should not be NULL +// MyFunc(&pData1, &pData2); // ok: both non-NULL +// if (*pData1 == *pData2) ... 
// error: pData2 might be NULL after call + +#define _Outptr_ _SAL2_Source_(_Outptr_, (), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_result_maybenull_ _SAL2_Source_(_Outptr_result_maybenull_, (), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) +#define _Outptr_opt_ _SAL2_Source_(_Outptr_opt_, (), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_opt_result_maybenull_ _SAL2_Source_(_Outptr_opt_result_maybenull_, (), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) + +// Annotations for _Outptr_ parameters returning pointers to null terminated strings. + +#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_) +#define _Outptr_opt_result_z_ _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_) +#define _Outptr_result_maybenull_z_ _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_) +#define _Outptr_opt_result_maybenull_z_ _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_) + +// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function fails. + +#define _Outptr_result_nullonfailure_ _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _Outptr_opt_result_nullonfailure_ _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object, +// following the COM convention of setting the output to NULL on failure. +// The current implementation is identical to _Outptr_result_nullonfailure_. +// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred. 
+ +#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_result_maybenull_ _SAL2_Source_(_COM_Outptr_result_maybenull_, (), _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_ _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_result_maybenull_ _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of elements/bytes + +#define _Outptr_result_buffer_(size) _SAL2_Source_(_Outptr_result_buffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_(size) _SAL2_Source_(_Outptr_opt_result_buffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_(size) _SAL2_Source_(_Outptr_result_buffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) + +#define _Outptr_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), 
_Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) + +#define _Outptr_result_bytebuffer_(size) _SAL2_Source_(_Outptr_result_bytebuffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_, (size), 
_Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) + +#define _Outptr_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) + +// Annotations for output reference to pointer parameters. 
+ +#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_) +#define _Outref_result_maybenull_ _SAL2_Source_(_Outref_result_maybenull_, (), _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) _Post_maybenull_ _Post_valid_impl_) + +#define _Outref_result_buffer_(size) _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_(size) _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_(size, count) _SAL2_Source_(_Outref_result_buffer_to_, (size, count), _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_(size) _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), _Outref_result_bytebuffer_to_(size, _Old_(size))) + +#define _Outref_result_buffer_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), _Outref_result_bytebuffer_maybenull_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_maybenull_(size) 
_SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), _Outref_result_buffer_to_maybenull_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size))) + +// Annotations for output reference to pointer parameters that guarantee +// that the pointer is set to NULL on failure. +#define _Outref_result_nullonfailure_ _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_)) + +// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on failure. +#define _Result_nullonfailure_ _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_)) +#define _Result_zeroonfailure_ _SAL2_Source_(_Result_zeroonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0))) + + +// return values ------------------------------- + +// +// _Ret_ annotations +// +// describing conditions that hold for return values after the call + +// e.g. 
_Ret_z_ CString::operator const WCHAR*() const throw(); +#define _Ret_z_ _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_) +#define _Ret_maybenull_z_ _SAL2_Source_(_Ret_maybenull_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// used with allocated but not yet initialized objects +#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl)) +#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl)) +#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl)) + +// used with allocated and initialized objects +// returns single valid object +#define _Ret_valid_ _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_) + +// returns pointer to initialized buffer of specified size +#define _Ret_writes_(size) _SAL2_Source_(_Ret_writes_, (size), _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_z_(size) _SAL2_Source_(_Ret_writes_z_, (size), _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_(size) _SAL2_Source_(_Ret_writes_bytes_, (size), _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_(size) _SAL2_Source_(_Ret_writes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_z_(size) _SAL2_Source_(_Ret_writes_maybenull_z_, (size), _Ret3_impl_(__maybenull_impl,__count_impl(size),__zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_maybenull_(size) _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__bytecount_impl(size)) _Ret_valid_impl_) + +// returns pointer to partially initialized buffer, with total size 'size' and initialized size 'count' +#define _Ret_writes_to_(size,count) _SAL2_Source_(_Ret_writes_to_, (size,count), _Ret3_impl_(__notnull_impl, __cap_impl(size), 
__count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_, (size,count), _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) + + +// Annotations for strict type checking +#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_) +#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_) +#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_) + +// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); +#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_) +#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_) + +// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... 
); +#define _Printf_format_string_ _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_) +#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_) +#define _Scanf_s_format_string_ _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_) + +#define _Format_string_impl_(kind,where) _SA_annotes2(SAL_IsFormatString2, kind, where) +#define _Printf_format_string_params_(x) _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x)) +#define _Scanf_format_string_params_(x) _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x)) +#define _Scanf_s_format_string_params_(x) _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x)) + +// annotations to express value of integral or pointer parameter +#define _In_range_(lb,ub) _SAL2_Source_(_In_range_, (lb,ub), _In_range_impl_(lb,ub)) +#define _Out_range_(lb,ub) _SAL2_Source_(_Out_range_, (lb,ub), _Out_range_impl_(lb,ub)) +#define _Ret_range_(lb,ub) _SAL2_Source_(_Ret_range_, (lb,ub), _Ret_range_impl_(lb,ub)) +#define _Deref_in_range_(lb,ub) _SAL2_Source_(_Deref_in_range_, (lb,ub), _Deref_in_range_impl_(lb,ub)) +#define _Deref_out_range_(lb,ub) _SAL2_Source_(_Deref_out_range_, (lb,ub), _Deref_out_range_impl_(lb,ub)) +#define _Deref_ret_range_(lb,ub) _SAL2_Source_(_Deref_ret_range_, (lb,ub), _Deref_ret_range_impl_(lb,ub)) +#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr)) +#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr)) + +// annotation to express that a value (usually a field of a mutable class) +// is not changed by a function call +#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_)) + +// Annotations to allow expressing generalized pre and post conditions. 
+// 'cond' may be any valid SAL expression that is considered to be true as a precondition +// or postcondition (respectively). +#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond)) +#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond)) + +// Annotations to express struct, class and field invariants +#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size)) + +#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size)) +#define _Field_size_opt_(size) _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size)) +#define _Field_size_part_(size, count) _SAL2_Source_(_Field_size_part_, (size, count), _Notnull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_part_opt_(size, count) _SAL2_Source_(_Field_size_part_opt_, (size, count), _Maybenull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_full_(size) _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size)) +#define _Field_size_full_opt_(size) _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size)) + +#define _Field_size_bytes_(size) _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size)) +#define _Field_size_bytes_opt_(size) _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size)) +#define _Field_size_bytes_part_(size, count) _SAL2_Source_(_Field_size_bytes_part_, (size, count), _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_part_opt_(size, count) _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_full_(size) _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size)) +#define _Field_size_bytes_full_opt_(size)
_SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size)) + +#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_) + +#define _Field_range_(min,max) _SAL2_Source_(_Field_range_, (min,max), _Field_range_impl_(min,max)) + +//============================================================================ +// _Pre_\_Post_ Layer: +//============================================================================ + +// +// Raw Pre/Post for declaring custom pre/post conditions +// + +#define _Pre_ _Pre_impl_ +#define _Post_ _Post_impl_ + +// +// Validity property +// + +#define _Valid_ _Valid_impl_ +#define _Notvalid_ _Notvalid_impl_ +#define _Maybevalid_ _Maybevalid_impl_ + +// +// Buffer size properties +// + +// Expressing buffer sizes without specifying pre or post condition +#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size)) +#define _Readable_elements_(size) _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size)) +#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size)) +#define _Writable_elements_(size) _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size)) + +#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_) +#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_) + +// Expressing buffer size as pre or post condition +#define _Pre_readable_size_(size) _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_writable_size_(size) _SAL2_Source_(_Pre_writable_size_, (size), _Pre1_impl_(__cap_impl(size))) +#define _Pre_readable_byte_size_(size) _SAL2_Source_(_Pre_readable_byte_size_, (size), _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Pre_writable_byte_size_(size) _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size))) + 
+#define _Post_readable_size_(size) _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Post_writable_size_(size) _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size))) +#define _Post_readable_byte_size_(size) _SAL2_Source_(_Post_readable_byte_size_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_writable_byte_size_(size) _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size))) + +// +// Pointer null-ness properties +// +#define _Null_ _Null_impl_ +#define _Notnull_ _Notnull_impl_ +#define _Maybenull_ _Maybenull_impl_ + +// +// _Pre_ annotations --- +// +// describing conditions that must be met before the call of the function + +// e.g. int strlen( _Pre_z_ const char* sz ); +// buffer is a zero terminated string +#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// valid size unknown or indicated by type (e.g.:LPSTR) +#define _Pre_valid_ _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) +#define _Pre_opt_valid_ _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) + +#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) + +// Overrides recursive valid when some field is not yet initialized when using _Inout_ +#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl)) + +// used with allocated but not yet initialized objects +#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref)) +#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref)) +#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref)) + +// +// _Post_ annotations --- +// +// describing conditions that hold after the function call + +// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_ 
char* szTo, size_t cch ); +// buffer will be a zero-terminated string after the call +#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_) + +// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj ); +#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_) +#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl)) + +// e.g. void free( _Post_ptr_invalid_ void* pv ); +#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl)) + +// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv ); +#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl)) + +// e.g. HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p); +#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl)) + +#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl)) + +#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_) + + +// #pragma region Input Buffer SAL 1 compatibility macros + +/*========================================================================== + + This section contains definitions for macros defined for VS2010 and earlier. + Usage of these macros is still supported, but the SAL 2 macros defined above + are recommended instead. This comment block is retained to assist in + understanding SAL that still uses the older syntax. + + The macros are defined in 3 layers: + + _In_\_Out_ Layer: + ---------------- + This layer provides the highest abstraction and its macros should be used + in most cases. Its macros start with _In_, _Out_ or _Inout_. For the + typical case they provide the most concise annotations. + + _Pre_\_Post_ Layer: + ------------------ + The macros of this layer should only be used when there is no suitable macro + in the _In_\_Out_ layer.
Its macros start with _Pre_, _Post_, _Ret_, + _Deref_pre_, _Deref_post_ and _Deref_ret_. This layer provides the most + flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + + Annotation Syntax: + |--------------|----------|----------------|-----------------------------| + | Usage | Nullness | ZeroTerminated | Extent | + |--------------|----------|----------------|-----------------------------| + | _In_ | <> | <> | <> | + | _Out_ | opt_ | z_ | [byte]cap_[c_|x_]( size ) | + | _Inout_ | | | [byte]count_[c_|x_]( size ) | + | _Deref_out_ | | | ptrdiff_cap_( ptr ) | + |--------------| | | ptrdiff_count_( ptr ) | + | _Ret_ | | | | + | _Deref_ret_ | | | | + |--------------| | | | + | _Pre_ | | | | + | _Post_ | | | | + | _Deref_pre_ | | | | + | _Deref_post_ | | | | + |--------------|----------|----------------|-----------------------------| + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for + formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the pointer can be NULL the annotation contains _opt. If the macro + does not contain '_opt' the pointer must not be NULL. + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + |------|---------------|---------------| + | Unit | Writ\Readable | Argument Type | + |------|---------------|---------------| + | <> | cap_ | <> | + | byte | count_ | c_ | + | | | x_ | + |------|---------------|---------------| + + 'cap' (capacity) describes the writable size of the buffer and is typically used + with _Out_. The default unit is elements.
Use 'bytecap' if the size is given in bytes. + 'count' describes the readable size of the buffer and is typically used with _In_. + The default unit is elements. Use 'bytecount' if the size is given in bytes. + + Argument syntax for cap_, bytecap_, count_, bytecount_: + (<parameter>|return)[+n] e.g. cch, return, cb+2 + + If the buffer size is a constant expression use the c_ postfix. + E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16) + + If the buffer size is given by a limiting pointer use the ptrdiff_ versions + of the macros. + + If the buffer size is neither a parameter nor a constant expression use the x_ + postfix. e.g. bytecount_x_(num*size) x_ annotations accept any arbitrary string. + No analysis can be done for x_ annotations but they at least tell the tool that + the buffer has some sort of extent description. x_ annotations might be supported + by future compiler versions. + +============================================================================*/ + +// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) +// valid buffer extent described by another parameter +#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_) +#define _In_opt_count_(size) _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_bytecount_(size) _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_(size) _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) + +// valid buffer extent described by a constant expression +#define _In_count_c_(size) _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_) +#define _In_opt_count_c_(size) _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_) +#define _In_bytecount_c_(size) _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_) +#define
_In_opt_bytecount_c_(size) _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) + +// nullterminated 'input' buffers with given size + +// e.g. void SetCharRange( _In_z_count_(cch) const char* rgch, size_t cch ) +// nullterminated valid buffer extent described by another parameter +#define _In_z_count_(size) _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_) +#define _In_opt_z_count_(size) _SAL1_1_Source_(_In_opt_z_count_, (size), _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_z_bytecount_(size) _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_opt_z_bytecount_(size) _SAL1_1_Source_(_In_opt_z_bytecount_, (size), _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_) + +// nullterminated valid buffer extent described by a constant expression +#define _In_z_count_c_(size) _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_) +#define _In_opt_z_count_c_(size) _SAL1_1_Source_(_In_opt_z_count_c_, (size), _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_) +#define _In_z_bytecount_c_(size) _SAL1_1_Source_(_In_z_bytecount_c_, (size), _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_) +#define _In_opt_z_bytecount_c_(size) _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) + +// buffer capacity is described by another pointer +// e.g. void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while ( pch < pchMax ) pch++; } +#define _In_ptrdiff_count_(size) _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_) +#define _In_opt_ptrdiff_count_(size) _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_) + +// 'x' version for complex expressions that are not supported by the current compiler version +// e.g.
void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows ); +#define _In_count_x_(size) _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_) +#define _In_opt_count_x_(size) _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_) +#define _In_bytecount_x_(size) _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_x_(size) _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_) + + +// 'out' with buffer size +// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices ); +// buffer capacity is described by another parameter +#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_) +#define _Out_opt_cap_(size) _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) +#define _Out_bytecap_(size) _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_(size) _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) + +// buffer capacity is described by a constant expression +#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_) +#define _Out_opt_cap_c_(size) _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_) +#define _Out_bytecap_c_(size) _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Out_cap_m_(mult,size) _SAL1_1_Source_(_Out_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_) +#define _Out_opt_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_)
+#define _Out_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_z_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by another pointer +// e.g. void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while ( pch < pchMax ) pch++; } +#define _Out_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_) +#define _Out_opt_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_) + +// buffer capacity is described by a complex expression +#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_) +#define _Out_opt_cap_x_(size) _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_) +#define _Out_bytecap_x_(size) _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_) + +// a zero terminated string is filled into a buffer of given capacity +// e.g.
void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// buffer capacity is described by another parameter +#define _Out_z_cap_(size) _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_(size) _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_(size) _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by a constant expression +#define _Out_z_cap_c_(size) _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_c_(size) _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_c_(size) _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by a complex expression +#define _Out_z_cap_x_(size) _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_x_(size) _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_x_(size) _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. 
size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return) char* rgchTo, size_t cchTo ); +#define _Out_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count)) +#define _Out_opt_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count)) +#define _Out_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_opt_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo ); +#define _Out_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_z_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) +#define _Out_opt_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) +#define _Out_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) +#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) + +// only use with dereferenced arguments e.g.
'*pcch' +#define _Out_capcount_(capcount) _SAL1_1_Source_(_Out_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) +#define _Out_opt_capcount_(capcount) _SAL1_1_Source_(_Out_opt_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) +#define _Out_bytecapcount_(capcount) _SAL1_1_Source_(_Out_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) +#define _Out_opt_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) + +#define _Out_capcount_x_(capcount) _SAL1_1_Source_(_Out_capcount_x_, (capcount), _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) +#define _Out_opt_capcount_x_(capcount) _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) +#define _Out_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) +#define _Out_opt_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) + +// e.g. 
GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen ); +#define _Out_z_capcount_(capcount) _SAL1_1_Source_(_Out_z_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_opt_z_capcount_(capcount) _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) +#define _Out_opt_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) + + +// 'inout' buffers with initialized elements before and after the call +// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); +#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size)) +#define _Inout_opt_count_(size) _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size)) +#define _Inout_bytecount_(size) _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size)) +#define _Inout_opt_bytecount_(size) _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size)) + +#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size)) +#define _Inout_opt_count_c_(size) _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size)) +#define _Inout_bytecount_c_(size) _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size)) +#define _Inout_opt_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size)) + +// nullterminated 'inout' buffers with initialized elements before and after the call +// e.g.
void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); +#define _Inout_z_count_(size) _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size)) +#define _Inout_opt_z_count_(size) _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size)) +#define _Inout_z_bytecount_(size) _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size)) +#define _Inout_opt_z_bytecount_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size)) + +#define _Inout_z_count_c_(size) _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size)) +#define _Inout_opt_z_count_c_(size) _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size)) +#define _Inout_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size)) +#define _Inout_opt_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size)) + +#define _Inout_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size)) +#define _Inout_opt_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size)) + +#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size)) +#define _Inout_opt_count_x_(size) _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size)) +#define _Inout_bytecount_x_(size) _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size)) +#define _Inout_opt_bytecount_x_(size) _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size)) + +// e.g.
void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR* szTo, size_t cchTo ); +#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_) +#define _Inout_opt_cap_(size) _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_) +#define _Inout_bytecap_(size) _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_) +#define _Inout_opt_bytecap_(size) _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_) + +#define _Inout_cap_c_(size) _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_) +#define _Inout_opt_cap_c_(size) _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_) +#define _Inout_bytecap_c_(size) _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_) +#define _Inout_opt_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_) + +#define _Inout_cap_x_(size) _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_) +#define _Inout_opt_cap_x_(size) _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_) +#define _Inout_bytecap_x_(size) _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_) +#define _Inout_opt_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_) + +// inout string buffers with writable size +// e.g. 
void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_) +#define _Inout_opt_z_cap_(size) _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_) +#define _Inout_z_bytecap_(size) _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_) +#define _Inout_opt_z_bytecap_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_) + +#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_) +#define _Inout_opt_z_cap_c_(size) _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_) +#define _Inout_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_) +#define _Inout_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_) + +#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_) +#define _Inout_opt_z_cap_x_(size) _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_) +#define _Inout_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_) +#define _Inout_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_) + + +// returning pointers to valid objects +#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_) +#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_) + +// annotations to express 'boundedness' of integral value parameter +#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_) +#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_) +#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_) +#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_) +#define 
_Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_) +#define _Deref_inout_bound_ _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_) +#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_) + +// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT ); +#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_) +#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_) +#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_) +#define _Deref_opt_out_opt_ _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_) + +// e.g. void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo ); +#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_) +#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_) +#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_) +#define _Deref_opt_out_opt_z_ _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_) + +// +// _Deref_pre_ --- +// +// describing conditions for array elements of dereferenced pointer parameters that must be met before the call + +// e.g. void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[] ); +#define _Deref_pre_z_ _SAL1_1_Source_(_Deref_pre_z_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) +#define _Deref_pre_opt_z_ _SAL1_1_Source_(_Deref_pre_opt_z_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// e.g. 
void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR* const rgpwch[] ); +// buffer capacity is described by another parameter +#define _Deref_pre_cap_(size) _SAL1_1_Source_(_Deref_pre_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) +#define _Deref_pre_opt_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) +#define _Deref_pre_bytecap_(size) _SAL1_1_Source_(_Deref_pre_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) +#define _Deref_pre_opt_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) + +// buffer capacity is described by a constant expression +#define _Deref_pre_cap_c_(size) _SAL1_1_Source_(_Deref_pre_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) +#define _Deref_pre_opt_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) +#define _Deref_pre_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) +#define _Deref_pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) + +// buffer capacity is described by a complex condition +#define _Deref_pre_cap_x_(size) _SAL1_1_Source_(_Deref_pre_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) +#define _Deref_pre_opt_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) +#define _Deref_pre_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size), 
_Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) +#define _Deref_pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_pre_z_cap_(size) _SAL1_1_Source_(_Deref_pre_z_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_z_cap_x_, 
(size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_pre_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_c_(size) 
_SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n ); +// valid buffer extent is described by another parameter +#define _Deref_pre_count_(size) _SAL1_1_Source_(_Deref_pre_count_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_(size) _SAL1_1_Source_(_Deref_pre_opt_count_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_(size) _SAL1_1_Source_(_Deref_pre_bytecount_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a constant expression +#define _Deref_pre_count_c_(size) _SAL1_1_Source_(_Deref_pre_count_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_c_(size) _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a complex expression +#define _Deref_pre_count_x_(size) _SAL1_1_Source_(_Deref_pre_count_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_x_(size) 
_SAL1_1_Source_(_Deref_pre_opt_count_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// e.g. void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems ); +#define _Deref_pre_valid_ _SAL1_1_Source_(_Deref_pre_valid_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_ _SAL1_1_Source_(_Deref_pre_opt_valid_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) +#define _Deref_pre_invalid_ _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) + +#define _Deref_pre_notnull_ _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref)) +#define _Deref_pre_maybenull_ _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref)) +#define _Deref_pre_null_ _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref)) + +// restrict access rights +#define _Deref_pre_readonly_ _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref)) +#define _Deref_pre_writeonly_ _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref)) + +// +// _Deref_post_ --- +// +// describing conditions for array elements or dereferenced pointer parameters that hold after the call + +// e.g. 
void CloneString( _In_z_ const Wchar_t* wzIn _Out_ _Deref_post_z_ WCHAR** pWzOut ); +#define _Deref_post_z_ _SAL1_1_Source_(_Deref_post_z_, (), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) +#define _Deref_post_opt_z_ _SAL1_1_Source_(_Deref_post_opt_z_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) + +// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv ); +// buffer capacity is described by another parameter +#define _Deref_post_cap_(size) _SAL1_1_Source_(_Deref_post_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) +#define _Deref_post_opt_cap_(size) _SAL1_1_Source_(_Deref_post_opt_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) +#define _Deref_post_bytecap_(size) _SAL1_1_Source_(_Deref_post_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) +#define _Deref_post_opt_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) + +// buffer capacity is described by a constant expression +#define _Deref_post_cap_c_(size) _SAL1_1_Source_(_Deref_post_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) +#define _Deref_post_opt_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) +#define _Deref_post_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) +#define _Deref_post_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) + +// buffer capacity is described by a 
complex expression +#define _Deref_post_cap_x_(size) _SAL1_1_Source_(_Deref_post_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) +#define _Deref_post_opt_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) +#define _Deref_post_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) +#define _Deref_post_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_post_z_cap_(size) _SAL1_1_Source_(_Deref_post_z_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_z_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define 
_Deref_post_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_z_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_post_valid_cap_(size) _SAL1_1_Source_(_Deref_post_valid_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size), 
_Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) + +// e.g. 
HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv ); +// valid buffer extent is described by another parameter +#define _Deref_post_count_(size) _SAL1_1_Source_(_Deref_post_count_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_(size) _SAL1_1_Source_(_Deref_post_opt_count_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_(size) _SAL1_1_Source_(_Deref_post_bytecount_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a constant expression +#define _Deref_post_count_c_(size) _SAL1_1_Source_(_Deref_post_count_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_c_(size) _SAL1_1_Source_(_Deref_post_opt_count_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_bytecount_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a complex expression +#define _Deref_post_count_x_(size) _SAL1_1_Source_(_Deref_post_count_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) 
+#define _Deref_post_opt_count_x_(size) _SAL1_1_Source_(_Deref_post_opt_count_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_bytecount_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) + +// e.g. void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems ); +#define _Deref_post_valid_ _SAL1_1_Source_(_Deref_post_valid_, (), _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_) +#define _Deref_post_opt_valid_ _SAL1_1_Source_(_Deref_post_opt_valid_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_) + +#define _Deref_post_notnull_ _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref)) +#define _Deref_post_maybenull_ _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref)) +#define _Deref_post_null_ _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref)) + +// +// _Deref_ret_ --- +// + +#define _Deref_ret_z_ _SAL1_1_Source_(_Deref_ret_z_, (), _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl)) +#define _Deref_ret_opt_z_ _SAL1_1_Source_(_Deref_ret_opt_z_, (), _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl)) + +// +// special _Deref_ --- +// +#define _Deref2_pre_readonly_ _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref)) + +// +// _Ret_ --- +// + +// e.g. 
_Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src ); +#define _Ret_opt_valid_ _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_) +#define _Ret_opt_z_ _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb ); +// Buffer capacity is described by another parameter +#define _Ret_cap_(size) _SAL1_1_Source_(_Ret_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_opt_cap_(size) _SAL1_1_Source_(_Ret_opt_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_bytecap_(size) _SAL1_1_Source_(_Ret_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) +#define _Ret_opt_bytecap_(size) _SAL1_1_Source_(_Ret_opt_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) + +// Buffer capacity is described by a constant expression +#define _Ret_cap_c_(size) _SAL1_1_Source_(_Ret_cap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_opt_cap_c_(size) _SAL1_1_Source_(_Ret_opt_cap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_bytecap_c_(size) _SAL1_1_Source_(_Ret_bytecap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) +#define _Ret_opt_bytecap_c_(size) _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) + +// Buffer capacity is described by a complex condition +#define _Ret_cap_x_(size) _SAL1_1_Source_(_Ret_cap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_opt_cap_x_(size) _SAL1_1_Source_(_Ret_opt_cap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_bytecap_x_(size) _SAL1_1_Source_(_Ret_bytecap_x_, 
(size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) +#define _Ret_opt_bytecap_x_(size) _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) + +// return value is nullterminated and capacity is given by another parameter +#define _Ret_z_cap_(size) _SAL1_1_Source_(_Ret_z_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_cap_(size) _SAL1_1_Source_(_Ret_opt_z_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecap_(size) _SAL1_1_Source_(_Ret_z_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecap_(size) _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) + +// e.g. 
_Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); +// Valid Buffer extent is described by another parameter +#define _Ret_count_(size) _SAL1_1_Source_(_Ret_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_(size) _SAL1_1_Source_(_Ret_opt_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_(size) _SAL1_1_Source_(_Ret_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_(size) _SAL1_1_Source_(_Ret_opt_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a constant expression +#define _Ret_count_c_(size) _SAL1_1_Source_(_Ret_count_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_c_(size) _SAL1_1_Source_(_Ret_opt_count_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_c_(size) _SAL1_1_Source_(_Ret_bytecount_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_c_(size) _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a complex expression +#define _Ret_count_x_(size) _SAL1_1_Source_(_Ret_count_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_x_(size) _SAL1_1_Source_(_Ret_opt_count_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_x_(size) _SAL1_1_Source_(_Ret_bytecount_x_, (size), _Ret1_impl_(__notnull_impl_notref) 
_Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_x_(size) _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) + +// return value is nullterminated and length is given by another parameter +#define _Ret_z_count_(size) _SAL1_1_Source_(_Ret_z_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_count_(size) _SAL1_1_Source_(_Ret_opt_z_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecount_(size) _SAL1_1_Source_(_Ret_z_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecount_(size) _SAL1_1_Source_(_Ret_opt_z_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) + + +// _Pre_ annotations --- +#define _Pre_opt_z_ _SAL1_1_Source_(_Pre_opt_z_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// restrict access rights +#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref)) +#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref)) + +// e.g. 
void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); +// buffer capacity described by another parameter +#define _Pre_cap_(size) _SAL1_1_Source_(_Pre_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_opt_cap_(size) _SAL1_1_Source_(_Pre_opt_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_bytecap_(size) _SAL1_1_Source_(_Pre_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) +#define _Pre_opt_bytecap_(size) _SAL1_1_Source_(_Pre_opt_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) + +// buffer capacity described by a constant expression +#define _Pre_cap_c_(size) _SAL1_1_Source_(_Pre_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_opt_cap_c_(size) _SAL1_1_Source_(_Pre_opt_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_bytecap_c_(size) _SAL1_1_Source_(_Pre_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_cap_c_one_ _SAL1_1_Source_(_Pre_cap_c_one_, (), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) +#define _Pre_opt_cap_c_one_ _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Pre_cap_m_(mult,size) _SAL1_1_Source_(_Pre_cap_m_, (mult,size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) +#define _Pre_opt_cap_m_(mult,size) _SAL1_1_Source_(_Pre_opt_cap_m_, (mult,size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) + +// buffer 
capacity described by size of other buffer, only used by dangerous legacy APIs +// e.g. int strcpy(_Pre_cap_for_(src) char* dst, const char* src); +#define _Pre_cap_for_(param) _SAL1_1_Source_(_Pre_cap_for_, (param), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) +#define _Pre_opt_cap_for_(param) _SAL1_1_Source_(_Pre_opt_cap_for_, (param), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) + +// buffer capacity described by a complex condition +#define _Pre_cap_x_(size) _SAL1_1_Source_(_Pre_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_opt_cap_x_(size) _SAL1_1_Source_(_Pre_opt_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_bytecap_x_(size) _SAL1_1_Source_(_Pre_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) +#define _Pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) + +// buffer capacity described by the difference to another pointer parameter +#define _Pre_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) +#define _Pre_opt_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) + +// e.g. 
void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo ); +#define _Pre_z_cap_(size) _SAL1_1_Source_(_Pre_z_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_(size) _SAL1_1_Source_(_Pre_opt_z_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_(size) _SAL1_1_Source_(_Pre_z_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_c_(size) _SAL1_1_Source_(_Pre_z_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_z_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_x_(size) _SAL1_1_Source_(_Pre_z_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_z_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) 
_Pre_valid_impl_) +#define _Pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Pre_valid_cap_(size) _SAL1_1_Source_(_Pre_valid_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_(size) _SAL1_1_Source_(_Pre_valid_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_c_(size) _SAL1_1_Source_(_Pre_valid_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_x_(size) _SAL1_1_Source_(_Pre_valid_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_x_(size) 
_SAL1_1_Source_(_Pre_valid_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// Valid buffer extent described by another parameter +#define _Pre_count_(size) _SAL1_1_Source_(_Pre_count_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_(size) _SAL1_1_Source_(_Pre_opt_count_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_(size) _SAL1_1_Source_(_Pre_bytecount_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_(size) _SAL1_1_Source_(_Pre_opt_bytecount_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a constant expression +#define _Pre_count_c_(size) _SAL1_1_Source_(_Pre_count_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_c_(size) _SAL1_1_Source_(_Pre_opt_count_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_c_(size) _SAL1_1_Source_(_Pre_bytecount_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a complex expression +#define _Pre_count_x_(size) _SAL1_1_Source_(_Pre_count_x_, 
(size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_x_(size) _SAL1_1_Source_(_Pre_opt_count_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_x_(size) _SAL1_1_Source_(_Pre_bytecount_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by the difference to another pointer parameter +#define _Pre_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) +#define _Pre_opt_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) + + +// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count) +// buffer maybe zero-terminated after the call +#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl)) + +// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem ); +#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size))) +#define _Post_bytecap_(size) _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size))) + +// e.g. 
int strlen( _In_z_ _Post_count_(return+1) const char* sz ); +#define _Post_count_(size) _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_(size) _SAL1_1_Source_(_Post_bytecount_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_count_c_(size) _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_c_(size) _SAL1_1_Source_(_Post_bytecount_c_, (size), _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Post_count_x_(size) _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_x_(size) _SAL1_1_Source_(_Post_bytecount_x_, (size), _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) + +// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cchTo) _Post_z_count_(return+1) char* szTo, size_t cchTo ); +#define _Post_z_count_(size) _SAL1_1_Source_(_Post_z_count_, (size), _Post2_impl_(__zterm_impl,__count_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_(size) _SAL1_1_Source_(_Post_z_bytecount_, (size), _Post2_impl_(__zterm_impl,__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_z_count_c_(size) _SAL1_1_Source_(_Post_z_count_c_, (size), _Post2_impl_(__zterm_impl,__count_c_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_c_(size) _SAL1_1_Source_(_Post_z_bytecount_c_, (size), _Post2_impl_(__zterm_impl,__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Post_z_count_x_(size) _SAL1_1_Source_(_Post_z_count_x_, (size), _Post2_impl_(__zterm_impl,__count_x_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_x_(size) _SAL1_1_Source_(_Post_z_bytecount_x_, (size), _Post2_impl_(__zterm_impl,__bytecount_x_impl(size)) _Post_valid_impl_) + +// +// _Prepost_ --- +// +// describing conditions that hold before and after the function call + +#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_
_Post_z_) + +#define _Prepost_count_(size) _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size)) +#define _Prepost_opt_count_(size) _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size)) +#define _Prepost_bytecount_(size) _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size)) +#define _Prepost_opt_bytecount_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Post_bytecount_(size)) +#define _Prepost_count_c_(size) _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size)) +#define _Prepost_opt_count_c_(size) _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size)) +#define _Prepost_bytecount_c_(size) _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size)) +#define _Prepost_opt_bytecount_c_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size)) +#define _Prepost_count_x_(size) _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size)) +#define _Prepost_opt_count_x_(size) _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size)) +#define _Prepost_bytecount_x_(size) _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size)) +#define _Prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Post_bytecount_x_(size)) + +#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_) +#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_) + +// +// _Deref_ --- +// +// short version for _Deref_pre_ _Deref_post_ +// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call + +#define _Deref_prepost_z_ 
_SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_) +#define _Deref_prepost_opt_z_ _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_) + +#define _Deref_prepost_cap_(size) _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size)) +#define _Deref_prepost_opt_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size)) +#define _Deref_prepost_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size)) +#define _Deref_prepost_opt_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size)) + +#define _Deref_prepost_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size)) +#define _Deref_prepost_opt_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size)) +#define _Deref_prepost_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size)) +#define _Deref_prepost_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size)) + +#define _Deref_prepost_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size)) +#define _Deref_prepost_opt_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size)) +#define _Deref_prepost_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size)) +#define _Deref_prepost_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size)) + +#define _Deref_prepost_valid_cap_(size) 
_SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size)) +#define _Deref_prepost_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size)) +#define _Deref_prepost_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size)) +#define _Deref_prepost_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size)) + +#define _Deref_prepost_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size)) +#define _Deref_prepost_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size)) +#define _Deref_prepost_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), _Deref_pre_valid_bytecap_x_(size) _Deref_post_valid_bytecap_x_(size)) +#define _Deref_prepost_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size)) + +#define _Deref_prepost_count_(size) _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size)) +#define _Deref_prepost_opt_count_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size)) +#define _Deref_prepost_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size)) +#define _Deref_prepost_opt_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size)) + +#define _Deref_prepost_count_x_(size) 
_SAL1_1_Source_(_Deref_prepost_count_x_, (size), _Deref_pre_count_x_(size) _Deref_post_count_x_(size)) +#define _Deref_prepost_opt_count_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size)) +#define _Deref_prepost_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size)) +#define _Deref_prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size)) + +#define _Deref_prepost_valid_ _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_) +#define _Deref_prepost_opt_valid_ _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_) + +// +// _Deref_ +// +// used with references to arrays + +#define _Deref_out_z_cap_c_(size) _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_cap_c_(size) _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_) +#define _Deref_out_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_) + +// #pragma endregion Input Buffer SAL 1 compatibility macros + + +//============================================================================ +// Implementation Layer: +//============================================================================ + + +// Naming conventions: +// A symbol that begins with _SA_ is for the machinery of creating any +// annotations; many of those come from sourceannotations.h in the case +// of attributes.
+ +// A symbol that ends with _impl is the very lowest level macro. It is +// not required to be a legal standalone annotation, and in the case +// of attribute annotations, usually is not. (In the case of some declspec +// annotations, it might be, but it should not be assumed so.) Those +// symbols will be used in the _PreN..., _PostN... and _RetN... annotations +// to build up more complete annotations. + +// A symbol ending in _impl_ is reserved to the implementation as well, +// but it does form a complete annotation; usually they are used to build +// up even higher level annotations. + + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ +// Sharable "_impl" macros: these can be shared between the various annotation +// forms but are part of the implementation of the macros. These are collected +// here to assure that only necessary differences in the annotations +// exist. + +#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_) +#define _Bound_impl_ _SA_annotes0(SAL_bound) +#define _Field_range_impl_(min,max) _Range_impl_(min,max) +#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes) +#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe) +#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe) +#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect) +#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no) +#define _Notnull_impl_ _SA_annotes1(SAL_null, __no) +#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no) +#define _NullNull_terminated_impl_ _Group_(_SA_annotes1(SAL_nullTerminated, __yes) _SA_annotes1(SAL_readableTo,inexpressibleCount("NullNull terminated string"))) +#define _Null_impl_ _SA_annotes1(SAL_null, __yes) +#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes) +#define _Out_impl_ _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ +#define _Out_opt_impl_ _Pre1_impl_(__maybenull_impl_notref)
_Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ +#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no)) +#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond) +#define _Post_valid_impl_ _Post1_impl_(__valid_impl) +#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond) +#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl) +#define _Range_impl_(min,max) _SA_annotes2(SAL_range, min, max) +#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size)) +#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size)) +#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl) +#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond) +#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes) +#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size)) +#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size)) + +#define _In_range_impl_(min,max) _Pre_impl_ _Range_impl_(min,max) +#define _Out_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) +#define _Ret_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) _Deref_pre_impl_ _Range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) + +#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_ +#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_ + +// The following are for the implementation machinery, and are not +// suitable for annotating general code. +// We're trying to phase this out, someday. The parser quotes the param. +#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE) + +// Normally the parser does some simple type checking of annotation params, +// defer that check to the plugin.
+#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck) + +#define _SA_SPECSTRIZE( x ) #x +#define _SAL_nop_impl_ /* nothing */ +#define __nop_impl(x) x +#endif + + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +// Using attributes for sal + +#include "codeanalysis\sourceannotations.h" // NOTE(review): backslash path separator is Windows-specific; this include is only reached when _USE_ATTRIBUTES_FOR_SAL is nonzero + + +#define _SA_annotes0(n) [SAL_annotes(Name=#n)] +#define _SA_annotes1(n,pp1) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1))] +#define _SA_annotes2(n,pp1,pp2) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2))] +#define _SA_annotes3(n,pp1,pp2,pp3) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2), p3=_SA_SPECSTRIZE(pp3))] + +#define _Pre_impl_ [SAL_pre] +#define _Post_impl_ [SAL_post] +#define _Deref_impl_ [SAL_deref] +#define _Notref_impl_ [SAL_notref] + + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun; +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun; +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +// Benign declspec needed here for WindowsPREfast +#define __In_impl_ [SA_Pre(Valid=SA_Yes)] [SA_Pre(Deref=1, Notref=1, Access=SA_Read)] __declspec("SAL_pre SAL_valid") + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +// Using declspecs for sal + +#define _SA_annotes0(n) __declspec(#n) +#define _SA_annotes1(n,pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")" ) +#define _SA_annotes2(n,pp1,pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")") +#define _SA_annotes3(n,pp1,pp2,pp3) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")") + +#define _Pre_impl_ _SA_annotes0(SAL_pre) +#define _Post_impl_ _SA_annotes0(SAL_post) +#define _Deref_impl_ _SA_annotes0(SAL_deref) +#define _Notref_impl_ _SA_annotes0(SAL_notref) + +// Declare a function to be an annotation or primop (respectively).
+// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun + +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun + +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +#define __In_impl_ _Pre_impl_ _SA_annotes0(SAL_valid) _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly) + +#else // ][ + +// Using "nothing" for sal + +#define _SA_annotes0(n) +#define _SA_annotes1(n,pp1) +#define _SA_annotes2(n,pp1,pp2) +#define _SA_annotes3(n,pp1,pp2,pp3) + +#define __ANNOTATION(fun) +#define __PRIMOP(type, fun) +#define __QUALIFIER(type, fun) // NOTE(review): two parameters here, but __QUALIFIER(fun) takes one in the attribute and declspec branches - confirm + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ + +// Declare annotations that need to be declared. +__ANNOTATION(SAL_useHeader(void)); +__ANNOTATION(SAL_bound(void)); +__ANNOTATION(SAL_allocator(void)); //??? resolve with PFD +__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *)); +__ANNOTATION(SAL_source_code_content(__In_impl_ char *)); +__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_encoded(void)); +__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_volatile(void)); +__ANNOTATION(SAL_nonvolatile(void)); +__ANNOTATION(SAL_entrypoint(__AuToQuOtE
__In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_blocksOn(__In_impl_ void*)); +__ANNOTATION(SAL_mustInspect(void)); + +// Only appears in model files, but needs to be declared. +__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *)); + +// To be declared well-known soon. +__ANNOTATION(SAL_interlocked(void);) + +#pragma warning (suppress: 28227 28241) +__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);) + +__PRIMOP(char *, _Macro_value_(__In_impl_ char *)); +__PRIMOP(int, _Macro_defined_(__In_impl_ char *)); +__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *)); + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +#define _Check_return_impl_ [SA_Post(MustCheck=SA_Yes)] + +#define _Success_impl_(expr) [SA_Success(Condition=#expr)] +#define _On_failure_impl_(annos) [SAL_context(p1="SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_)) + +#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] +#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] +#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] + +#define _In_bound_impl_ [SA_PreBound(Deref=0)] +#define _Out_bound_impl_ [SA_PostBound(Deref=0)] +#define _Ret_bound_impl_ [SA_PostBound(Deref=0)] +#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] +#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] +#define _Deref_ret_bound_impl_ [SA_PostBound(Deref=1)] + +#define __valid_impl Valid=SA_Yes +#define __maybevalid_impl Valid=SA_Maybe +#define __notvalid_impl Valid=SA_No + +#define __null_impl Null=SA_Yes +#define __maybenull_impl Null=SA_Maybe +#define __notnull_impl Null=SA_No + +#define __null_impl_notref Null=SA_Yes,Notref=1 +#define __maybenull_impl_notref Null=SA_Maybe,Notref=1 +#define __notnull_impl_notref Null=SA_No,Notref=1 + +#define __zterm_impl NullTerminated=SA_Yes +#define __maybezterm_impl NullTerminated=SA_Maybe +#define __maybzterm_impl NullTerminated=SA_Maybe +#define 
__notzterm_impl NullTerminated=SA_No + +#define __readaccess_impl Access=SA_Read +#define __writeaccess_impl Access=SA_Write +#define __allaccess_impl Access=SA_ReadWrite + +#define __readaccess_impl_notref Access=SA_Read,Notref=1 +#define __writeaccess_impl_notref Access=SA_Write,Notref=1 +#define __allaccess_impl_notref Access=SA_ReadWrite,Notref=1 + +#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/ // [ + +// For SAL2, we need to expect general expressions. + +#define __cap_impl(size) WritableElements="\n"#size +#define __bytecap_impl(size) WritableBytes="\n"#size +#define __bytecount_impl(size) ValidBytes="\n"#size +#define __count_impl(size) ValidElements="\n"#size + +#else // ][ + +#define __cap_impl(size) WritableElements=#size +#define __bytecap_impl(size) WritableBytes=#size +#define __bytecount_impl(size) ValidBytes=#size +#define __count_impl(size) ValidElements=#size + +#endif // ] + +#define __cap_c_impl(size) WritableElementsConst=size +#define __cap_c_one_notref_impl WritableElementsConst=1,Notref=1 +#define __cap_for_impl(param) WritableElementsLength=#param +#define __cap_x_impl(size) WritableElements="\n@"#size + +#define __bytecap_c_impl(size) WritableBytesConst=size +#define __bytecap_x_impl(size) WritableBytes="\n@"#size + +#define __mult_impl(mult,size) __cap_impl((mult)*(size)) + +#define __count_c_impl(size) ValidElementsConst=size +#define __count_x_impl(size) ValidElements="\n@"#size + +#define __bytecount_c_impl(size) ValidBytesConst=size +#define __bytecount_x_impl(size) ValidBytes="\n@"#size + + +#define _At_impl_(target, annos) [SAL_at(p1=#target)] _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) [SAL_at_buffer(p1=#target, p2=#iter, p3=#bound)] _Group_(annos) +#define _When_impl_(expr, annos) [SAL_when(p1=#expr)] _Group_(annos) + +#define _Group_impl_(annos) [SAL_begin] annos [SAL_end] +#define _GrouP_impl_(annos) [SAL_BEGIN] annos [SAL_END] + +#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader) // this is a special case! 
+ +#define _Pre1_impl_(p1) [SA_Pre(p1)] +#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] +#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] + +#define _Post1_impl_(p1) [SA_Post(p1)] +#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Ret1_impl_(p1) [SA_Post(p1)] +#define _Ret2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Ret3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] +#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] +#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] + + +#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref_ret1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_ret2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_ret3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,Notref=1,p1)] +#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] +#define _Deref2_ret1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] + +// Obsolete -- may be needed for transition to attributes. 
+#define __inner_typefix(ctype) [SAL_typefix(p1=_SA_SPECSTRIZE(ctype))] +#define __inner_exceptthat [SAL_except] + + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn) + +#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr) +#define _On_failure_impl_(annos) _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos)) + +#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf") +#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf") +#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s") + +#define _In_bound_impl_ _Pre_impl_ _Bound_impl_ +#define _Out_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_ +#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_ +#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_ + + +#define __null_impl _SA_annotes0(SAL_null) // _SA_annotes1(SAL_null, __yes) +#define __notnull_impl _SA_annotes0(SAL_notnull) // _SA_annotes1(SAL_null, __no) +#define __maybenull_impl _SA_annotes0(SAL_maybenull) // _SA_annotes1(SAL_null, __maybe) + +#define __valid_impl _SA_annotes0(SAL_valid) // _SA_annotes1(SAL_valid, __yes) +#define __notvalid_impl _SA_annotes0(SAL_notvalid) // _SA_annotes1(SAL_valid, __no) +#define __maybevalid_impl _SA_annotes0(SAL_maybevalid) // _SA_annotes1(SAL_valid, __maybe) + +#define __null_impl_notref _Notref_ _Null_impl_ +#define __maybenull_impl_notref _Notref_ _Maybenull_impl_ +#define __notnull_impl_notref _Notref_ _Notnull_impl_ + +#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes) +#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no) + +#define __readaccess_impl _SA_annotes1(SAL_access, 0x1) +#define 
__writeaccess_impl _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl _SA_annotes1(SAL_access, 0x3) + +#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3) + +#define __cap_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo,elementCount(1)) +#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo,inexpressibleCount(sizeof(param))) +#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __mult_impl(mult,size) _SA_annotes1(SAL_writableTo,(mult)*(size)) + +#define __count_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_c_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + +#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + +#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos) +#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end) +#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END) +#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos) + +#define 
_Use_decl_anno_impl_ __declspec("SAL_useHeader()") // this is a special case! + +#define _Pre1_impl_(p1) _Pre_impl_ p1 +#define _Pre2_impl_(p1,p2) _Pre_impl_ p1 _Pre_impl_ p2 +#define _Pre3_impl_(p1,p2,p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3 + +#define _Post1_impl_(p1) _Post_impl_ p1 +#define _Post2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Post3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Ret1_impl_(p1) _Post_impl_ p1 +#define _Ret2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Ret3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1 +#define _Deref_pre2_impl_(p1,p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 +#define _Deref_pre3_impl_(p1,p2,p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3 + +#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_post2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_post3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_ret2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_ret3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 + +#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype) +#define __inner_exceptthat _SA_annotes0(SAL_except) + +#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // ][ + +// minimum attribute expansion for foreground build + +#pragma push_macro( "SA" ) +#pragma push_macro( "REPEATABLE" ) + +#ifdef __cplusplus // [ +#define SA( id ) id +#define 
REPEATABLE [repeatable] +#else // !__cplusplus // ][ +#define SA( id ) SA_##id +#define REPEATABLE +#endif // !__cplusplus // ] + +REPEATABLE +[source_annotation_attribute( SA( Parameter ) )] +struct __P_impl +{ +#ifdef __cplusplus // [ + __P_impl(); +#endif // ] + int __d_; +}; +typedef struct __P_impl __P_impl; + +REPEATABLE +[source_annotation_attribute( SA( ReturnValue ) )] +struct __R_impl +{ +#ifdef __cplusplus // [ + __R_impl(); +#endif // ] + int __d_; +}; +typedef struct __R_impl __R_impl; + +[source_annotation_attribute( SA( Method ) )] +struct __M_ +{ +#ifdef __cplusplus // [ + __M_(); +#endif // ] + int __d_; +}; +typedef struct __M_ __M_; + +[source_annotation_attribute( SA( All ) )] +struct __A_ +{ +#ifdef __cplusplus // [ + __A_(); +#endif // ] + int __d_; +}; +typedef struct __A_ __A_; + +[source_annotation_attribute( SA( Field ) )] +struct __F_ +{ +#ifdef __cplusplus // [ + __F_(); +#endif // ] + int __d_; +}; +typedef struct __F_ __F_; + +#pragma pop_macro( "REPEATABLE" ) +#pragma pop_macro( "SA" ) + + +#define _SAL_nop_impl_ + +#define _At_impl_(target, annos) [__A_(__d_=0)] +#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_=0)] +#define _When_impl_(expr, annos) annos +#define _Group_impl_(annos) annos +#define _GrouP_impl_(annos) annos +#define _Use_decl_anno_impl_ [__M_(__d_=0)] + +#define _Points_to_data_impl_ [__P_impl(__d_=0)] +#define _Literal_impl_ [__P_impl(__d_=0)] +#define _Notliteral_impl_ [__P_impl(__d_=0)] + +#define _Pre_valid_impl_ [__P_impl(__d_=0)] +#define _Post_valid_impl_ [__P_impl(__d_=0)] +#define _Ret_valid_impl_ [__R_impl(__d_=0)] + +#define _Check_return_impl_ [__R_impl(__d_=0)] +#define _Must_inspect_impl_ [__R_impl(__d_=0)] + +#define _Success_impl_(expr) [__M_(__d_=0)] +#define _On_failure_impl_(expr) [__M_(__d_=0)] +#define _Always_impl_(expr) [__M_(__d_=0)] + +#define _Printf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_format_string_impl_ [__P_impl(__d_=0)] +#define 
_Scanf_s_format_string_impl_ [__P_impl(__d_=0)] + +#define _Raises_SEH_exception_impl_ [__M_(__d_=0)] +#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_=0)] + +#define _In_bound_impl_ [__P_impl(__d_=0)] +#define _Out_bound_impl_ [__P_impl(__d_=0)] +#define _Ret_bound_impl_ [__R_impl(__d_=0)] +#define _Deref_in_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_out_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_ret_bound_impl_ [__R_impl(__d_=0)] + +#define _Range_impl_(min,max) [__P_impl(__d_=0)] +#define _In_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Ret_range_impl_(min,max) [__R_impl(__d_=0)] +#define _Deref_in_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_ret_range_impl_(min,max) [__R_impl(__d_=0)] + +#define _Field_range_impl_(min,max) [__F_(__d_=0)] + +#define _Pre_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Post_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Satisfies_impl_(cond) [__A_(__d_=0)] + +#define _Null_impl_ [__A_(__d_=0)] +#define _Notnull_impl_ [__A_(__d_=0)] +#define _Maybenull_impl_ [__A_(__d_=0)] + +#define _Valid_impl_ [__A_(__d_=0)] +#define _Notvalid_impl_ [__A_(__d_=0)] +#define _Maybevalid_impl_ [__A_(__d_=0)] + +#define _Readable_bytes_impl_(size) [__A_(__d_=0)] +#define _Readable_elements_impl_(size) [__A_(__d_=0)] +#define _Writable_bytes_impl_(size) [__A_(__d_=0)] +#define _Writable_elements_impl_(size) [__A_(__d_=0)] + +#define _Null_terminated_impl_ [__A_(__d_=0)] +#define _NullNull_terminated_impl_ [__A_(__d_=0)] + +#define _Pre_impl_ [__P_impl(__d_=0)] +#define _Pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Post_impl_ [__P_impl(__d_=0)] +#define _Post1_impl_(p1) [__P_impl(__d_=0)] +#define _Post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Ret1_impl_(p1) 
[__R_impl(__d_=0)] +#define _Ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref_pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_post1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Deref_ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Deref_ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref2_pre1_impl_(p1) //[__P_impl(__d_=0)] +#define _Deref2_post1_impl_(p1) //[__P_impl(__d_=0)] +#define _Deref2_ret1_impl_(p1) //[__P_impl(__d_=0)] + +#else // ][ + + +#define _SAL_nop_impl_ X + +#define _At_impl_(target, annos) +#define _When_impl_(expr, annos) +#define _Group_impl_(annos) +#define _GrouP_impl_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) +#define _Use_decl_anno_impl_ +#define _Points_to_data_impl_ +#define _Literal_impl_ +#define _Notliteral_impl_ +#define _Notref_impl_ + +#define _Pre_valid_impl_ +#define _Post_valid_impl_ +#define _Ret_valid_impl_ + +#define _Check_return_impl_ +#define _Must_inspect_impl_ + +#define _Success_impl_(expr) +#define _On_failure_impl_(annos) +#define _Always_impl_(annos) + +#define _Printf_format_string_impl_ +#define _Scanf_format_string_impl_ +#define _Scanf_s_format_string_impl_ + +#define _In_bound_impl_ +#define _Out_bound_impl_ +#define _Ret_bound_impl_ +#define _Deref_in_bound_impl_ +#define _Deref_out_bound_impl_ +#define _Deref_ret_bound_impl_ + +#define _Range_impl_(min,max) +#define _In_range_impl_(min,max) +#define _Out_range_impl_(min,max) +#define _Ret_range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) + +#define _Satisfies_impl_(expr) +#define _Pre_satisfies_impl_(expr) +#define 
_Post_satisfies_impl_(expr) + +#define _Null_impl_ +#define _Notnull_impl_ +#define _Maybenull_impl_ + +#define _Valid_impl_ +#define _Notvalid_impl_ +#define _Maybevalid_impl_ + +#define _Field_range_impl_(min,max) + +#define _Pre_impl_ +#define _Pre1_impl_(p1) +#define _Pre2_impl_(p1,p2) +#define _Pre3_impl_(p1,p2,p3) + +#define _Post_impl_ +#define _Post1_impl_(p1) +#define _Post2_impl_(p1,p2) +#define _Post3_impl_(p1,p2,p3) + +#define _Ret1_impl_(p1) +#define _Ret2_impl_(p1,p2) +#define _Ret3_impl_(p1,p2,p3) + +#define _Deref_pre1_impl_(p1) +#define _Deref_pre2_impl_(p1,p2) +#define _Deref_pre3_impl_(p1,p2,p3) + +#define _Deref_post1_impl_(p1) +#define _Deref_post2_impl_(p1,p2) +#define _Deref_post3_impl_(p1,p2,p3) + +#define _Deref_ret1_impl_(p1) +#define _Deref_ret2_impl_(p1,p2) +#define _Deref_ret3_impl_(p1,p2,p3) + +#define _Deref2_pre1_impl_(p1) +#define _Deref2_post1_impl_(p1) +#define _Deref2_ret1_impl_(p1) + +#define _Readable_bytes_impl_(size) +#define _Readable_elements_impl_(size) +#define _Writable_bytes_impl_(size) +#define _Writable_elements_impl_(size) + +#define _Null_terminated_impl_ +#define _NullNull_terminated_impl_ + +// Obsolete -- may be needed for transition to attributes. +#define __inner_typefix(ctype) +#define __inner_exceptthat + +#endif // ] + +// This section contains the deprecated annotations + +/* + ------------------------------------------------------------------------------- + Introduction + + sal.h provides a set of annotations to describe how a function uses its + parameters - the assumptions it makes about them, and the guarantees it makes + upon finishing. + + Annotations may be placed before either a function parameter's type or its return + type, and describe the function's behavior regarding the parameter or return value. + There are two classes of annotations: buffer annotations and advanced annotations. 
+ Buffer annotations describe how functions use their pointer parameters, and + advanced annotations either describe complex/unusual buffer behavior, or provide + additional information about a parameter that is not otherwise expressible. + + ------------------------------------------------------------------------------- + Buffer Annotations + + The most important annotations in sal.h provide a consistent way to annotate + buffer parameters or return values for a function. Each of these annotations describes + a single buffer (which could be a string, a fixed-length or variable-length array, + or just a pointer) that the function interacts with: where it is, how large it is, + how much is initialized, and what the function does with it. + + The appropriate macro for a given buffer can be constructed using the table below. + Just pick the appropriate values from each category, and combine them together + with a leading underscore. Some combinations of values do not make sense as buffer + annotations. Only meaningful annotations can be added to your code; for a list of + these, see the buffer annotation definitions section. + + Only a single buffer annotation should be used for each parameter. + + |------------|------------|---------|--------|----------|----------|---------------| + | Level | Usage | Size | Output | NullTerm | Optional | Parameters | + |------------|------------|---------|--------|----------|----------|---------------| + | <> | <> | <> | <> | _z | <> | <> | + | _deref | _in | _ecount | _full | _nz | _opt | (size) | + | _deref_opt | _out | _bcount | _part | | | (size,length) | + | | _inout | | | | | | + | | | | | | | | + |------------|------------|---------|--------|----------|----------|---------------| + + Level: Describes the buffer pointer's level of indirection from the parameter or + return value 'p'. + + <> : p is the buffer pointer. + _deref : *p is the buffer pointer. p must not be NULL. + _deref_opt : *p may be the buffer pointer. 
p may be NULL, in which case the rest of + the annotation is ignored. + + Usage: Describes how the function uses the buffer. + + <> : The buffer is not accessed. If used on the return value or with _deref, the + function will provide the buffer, and it will be uninitialized at exit. + Otherwise, the caller must provide the buffer. This should only be used + for alloc and free functions. + _in : The function will only read from the buffer. The caller must provide the + buffer and initialize it. Cannot be used with _deref. + _out : The function will only write to the buffer. If used on the return value or + with _deref, the function will provide the buffer and initialize it. + Otherwise, the caller must provide the buffer, and the function will + initialize it. + _inout : The function may freely read from and write to the buffer. The caller must + provide the buffer and initialize it. If used with _deref, the buffer may + be reallocated by the function. + + Size: Describes the total size of the buffer. This may be less than the space actually + allocated for the buffer, in which case it describes the accessible amount. + + <> : No buffer size is given. If the type specifies the buffer size (such as + with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one + element long. Must be used with _in, _out, or _inout. + _ecount : The buffer size is an explicit element count. + _bcount : The buffer size is an explicit byte count. + + Output: Describes how much of the buffer will be initialized by the function. For + _inout buffers, this also describes how much is initialized at entry. Omit this + category for _in buffers; they must be fully initialized by the caller. + + <> : The type specifies how much is initialized. For instance, a function initializing + an LPWSTR must NULL-terminate the string. + _full : The function initializes the entire buffer. + _part : The function initializes part of the buffer, and explicitly indicates how much. 
+ + NullTerm: States if the presence of a '\0' marks the end of valid elements in the buffer. + _z : A '\0' indicates the end of the buffer + _nz : The buffer may not be null terminated and a '\0' does not indicate the end of the + buffer. + Optional: Describes if the buffer itself is optional. + + <> : The pointer to the buffer must not be NULL. + _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced. + + Parameters: Gives explicit counts for the size and length of the buffer. + + <> : There is no explicit count. Use when neither _ecount nor _bcount is used. + (size) : Only the buffer's total size is given. Use with _ecount or _bcount but not _part. + (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part + and _bcount_part. + + ------------------------------------------------------------------------------- + Buffer Annotation Examples + + LWSTDAPI_(BOOL) StrToIntExA( + __in LPCSTR pszString, + DWORD dwFlags, + __out int *piRet -- A pointer whose dereference will be filled in. + ); + + void MyPaintingFunction( + __in HWND hwndControl, -- An initialized read-only parameter. + __in_opt HDC hdcOptional, -- An initialized read-only parameter that might be NULL. + __inout IPropertyStore *ppsStore -- An initialized parameter that may be freely used + -- and modified. + ); + + LWSTDAPI_(BOOL) PathCompactPathExA( + __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cchMax elements that will + -- be NULL terminated on exit. + __in LPCSTR pszSrc, + UINT cchMax, + DWORD dwFlags + ); + + HRESULT SHLocalAllocBytes( + size_t cb, + __deref_bcount(cb) T **ppv -- A pointer whose dereference will be set to an + -- uninitialized buffer with cb bytes. + ); + + __inout_bcount_full(cb) : A buffer with cb bytes that is fully initialized at + entry and exit, and may be written to by this function.
+ + __out_ecount_part(count, *countOut) : A buffer with count elements that will be + partially initialized by this function. The function indicates how much it + initialized by setting *countOut. + + ------------------------------------------------------------------------------- + Advanced Annotations + + Advanced annotations describe behavior that is not expressible with the regular + buffer macros. These may be used either to annotate buffer parameters that involve + complex or conditional behavior, or to enrich existing annotations with additional + information. + + __success(expr) f : + expr indicates whether function f succeeded or not. If expr is true at exit, + all the function's guarantees (as given by other annotations) must hold. If expr + is false at exit, the caller should not expect any of the function's guarantees + to hold. If not used, the function must always satisfy its guarantees. Added + automatically to functions that indicate success in standard ways, such as by + returning an HRESULT. + + __nullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + NULL character or pointer. May be used on typedefs, which marks valid (properly + initialized) instances of that type as being NULL-terminated. + + __nullnullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + sequence of two NULL characters or pointers. May be used on typedefs, which marks + valid instances of that type as being double-NULL terminated. + + __reserved v : + Value v must be 0/NULL, reserved for future use. + + __checkReturn v : + Return value v must not be ignored by callers of this function. + + __typefix(ctype) v : + Value v should be treated as an instance of ctype, rather than its declared type. + + __override f : + Specify C#-style 'override' behaviour for overriding virtual methods. + + __callback f : + Function f can be used as a function pointer.
+ + __format_string p : + Pointer p is a string that contains % markers in the style of printf. + + __blocksOn(resource) f : + Function f blocks on the resource 'resource'. + + __fallthrough : + Annotates switch statement labels where fall-through is desired, to distinguish + from forgotten break statements. + + ------------------------------------------------------------------------------- + Advanced Annotation Examples + + __success(return != FALSE) LWSTDAPI_(BOOL) + PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned. + + typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings. + + __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be + a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs. + + ------------------------------------------------------------------------------- +*/ + +#define __specstrings + +#ifdef __cplusplus // [ +#ifndef __nothrow // [ +# define __nothrow __declspec(nothrow) +#endif // ] +extern "C" { +#else // ][ +#ifndef __nothrow // [ +# define __nothrow +#endif // ] +#endif /* #ifdef __cplusplus */ // ] + + +/* + ------------------------------------------------------------------------------- + Helper Macro Definitions + + These express behavior common to many of the high-level annotations. + DO NOT USE THESE IN YOUR CODE. + ------------------------------------------------------------------------------- +*/ + +/* + The helper annotations are only understood by the compiler version used by + various defect detection tools. When the regular compiler is running, they + are defined into nothing, and do not affect the compiled code. +*/ + +#if !defined(__midl) && defined(_PREFAST_) // [ + + /* + In the primitive "SAL_*" annotations "SAL" stands for Standard + Annotation Language. 
These "SAL_*" annotations are the + primitives the compiler understands and high-level MACROs + will decompose into these primitives. + */ + + #define _SA_SPECSTRIZE( x ) #x + + /* + __null p + __notnull p + __maybenull p + + Annotates a pointer p. States that pointer p is null. Commonly used + in the negated form __notnull or the possibly null form __maybenull. + */ + +#ifndef PAL_STDCPP_COMPAT + #define __null _Null_impl_ + #define __notnull _Notnull_impl_ + #define __maybenull _Maybenull_impl_ +#endif // !PAL_STDCPP_COMPAT + + /* + __readonly l + __notreadonly l + __maybereadonly l + + Annotates a location l. States that location l is not modified after + this point. If the annotation is placed on the precondition state of + a function, the restriction only applies until the postcondition state + of the function. __maybereadonly states that the annotated location + may be modified, whereas __notreadonly states that a location must be + modified. + */ + + #define __readonly _Pre1_impl_(__readaccess_impl) + #define __notreadonly _Pre1_impl_(__allaccess_impl) + #define __maybereadonly _Pre1_impl_(__readaccess_impl) + + /* + __valid v + __notvalid v + __maybevalid v + + Annotates any value v. States that the value satisfies all properties of + valid values of its type. For example, for a string buffer, valid means + that the buffer pointer is either NULL or points to a NULL-terminated string. + */ + + #define __valid _Valid_impl_ + #define __notvalid _Notvalid_impl_ + #define __maybevalid _Maybevalid_impl_ + + /* + __readableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be read, extent describes + how much of the buffer is readable. For a reader of the buffer, this is + an explicit permission to read up to that amount, rather than a restriction to + read only up to it. + */ + + #define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent) + + /* + + __elem_readableTo(size) + + Annotates a buffer pointer p as being readable to size elements.
+ */ + + #define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount( size )) + + /* + __byte_readableTo(size) + + Annotates a buffer pointer p as being readable to size bytes. + */ + #define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size)) + + /* + __writableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be modified, extent + describes how much of the buffer is writable (usually the allocation + size). For a writer of the buffer, this is an explicit permission to + write up to that amount, rather than a restriction to write only up to it. + */ + #define __writableTo(size) _SA_annotes1(SAL_writableTo, size) + + /* + __elem_writableTo(size) + + Annotates a buffer pointer p as being writable to size elements. + */ + #define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount( size )) + + /* + __byte_writableTo(size) + + Annotates a buffer pointer p as being writable to size bytes. + */ + #define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount( size)) + + /* + __deref p + + Annotates a pointer p. The next annotation applies one dereference down + in the type. If readableTo(p, size) then the next annotation applies to + all elements *(p+i) for which i satisfies the size. If p is a pointer + to a struct, the next annotation applies to all fields of the struct. 
+ */ + #define __deref _Deref_impl_ + + /* + __pre __next_annotation + + The next annotation applies in the precondition state + */ + #define __pre _Pre_impl_ + + /* + __post __next_annotation + + The next annotation applies in the postcondition state + */ + #define __post _Post_impl_ + + /* + __precond(expr) + + When expr is true, the next annotation applies in the precondition state + (currently not enabled) + */ + #define __precond(expr) __pre + + /* + __postcond(expr) + + When expr is true, the next annotation applies in the postcondition state + (currently not enabled) + */ + #define __postcond(expr) __post + + /* + __exceptthat + + Given a set of annotations Q containing __exceptthat maybeP, the effect of + the except clause is to erase any P or notP annotations (explicit or + implied) within Q at the same level of dereferencing that the except + clause appears, and to replace it with maybeP. + + Example 1: __valid __pre_except_maybenull on a pointer p means that the + pointer may be null, and is otherwise valid, thus overriding + the implicit notnull annotation implied by __valid on + pointers. + + Example 2: __valid __deref __pre_except_maybenull on an int **p means + that p is not null (implied by valid), but the elements + pointed to by p could be null, and are otherwise valid. + */ + #define __exceptthat __inner_exceptthat + + /* + __refparam + + Added to all out parameter macros to indicate that they are all reference + parameters. + */ + #define __refparam _Notref_ __deref __notreadonly + + /* + __inner_* + + Helper macros that directly correspond to certain high-level annotations. + + */ + + /* + Macros to classify the entrypoints and indicate their category. + + Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM. + + */ + #define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category) + + + /* + Pre-defined data entry point categories include: Registry, File, Network.
+ */ + #define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category) + + #define __inner_override _SA_annotes0(__override) + #define __inner_callback _SA_annotes0(__callback) + #define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource) + #define __inner_fallthrough_dec __inline __nothrow void __FallThrough() {} + #define __inner_fallthrough __FallThrough(); + + #define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_ + #define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_ + + #define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_ + #define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_ + + #define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size)) + #define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size)) + + +#else // ][ +#ifndef PAL_STDCPP_COMPAT + #define __null + #define __notnull +#endif // !PAL_STDCPP_COMPAT + #define __maybenull + #define __readonly + #define __notreadonly + #define __maybereadonly + #define __valid + #define __notvalid + #define __maybevalid + #define __readableTo(extent) + #define __elem_readableTo(size) + #define __byte_readableTo(size) + #define __writableTo(size) + #define __elem_writableTo(size) + #define __byte_writableTo(size) + #define __deref + #define __pre + #define __post + #define __precond(expr) + #define __postcond(expr) + #define __exceptthat + #define __inner_override + #define __inner_callback + #define __inner_blocksOn(resource) + #define __inner_fallthrough_dec + #define __inner_fallthrough + #define __refparam + #define __inner_control_entrypoint(category) + #define __inner_data_entrypoint(category) + + #define __post_except_maybenull + #define __pre_except_maybenull + #define __post_deref_except_maybenull + #define __pre_deref_except_maybenull + + #define __inexpressible_readableTo(size) + #define 
__inexpressible_writableTo(size) + +#endif /* #if !defined(__midl) && defined(_PREFAST_) */ // ] + +/* +------------------------------------------------------------------------------- +Buffer Annotation Definitions + +Any of these may be used to directly annotate functions, but only one should +be used for each parameter. To determine which annotation to use for a given +buffer, use the table in the buffer annotations section. +------------------------------------------------------------------------------- +*/ + +// These macros conflict with c++ headers. +#ifndef PAL_STDCPP_COMPAT +#define __in _SAL1_Source_(__in, (), _In_) +#define __out _SAL1_Source_(__out, (), _Out_) +#endif // !PAL_STDCPP_COMPAT + +#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size)) +#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size)) +#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size)) +#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size)) +#define __in_z _SAL1_Source_(__in_z, (), _In_z_) +#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size)) +#define __in_bcount_z(size) _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated) +#define __in_nz _SAL1_Source_(__in_nz, (), __in) +#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size)) +#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size)) +#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size)) +#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size)) +#define __out_ecount_part(size,length) _SAL1_Source_(__out_ecount_part, (size,length), _Out_writes_to_(size,length)) +#define __out_bcount_part(size,length) _SAL1_Source_(__out_bcount_part, (size,length), _Out_writes_bytes_to_(size,length)) +#define __out_ecount_full(size) 
_SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size)) +#define __out_bcount_full(size) _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size)) +#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated) +#define __out_z_opt _SAL1_Source_(__out_z_opt, (), __post __valid __refparam __post __nullterminated __pre_except_maybenull) +#define __out_ecount_z(size) _SAL1_Source_(__out_ecount_z, (size), __ecount(size) __post __valid __refparam __post __nullterminated) +#define __out_bcount_z(size) _SAL1_Source_(__out_bcount_z, (size), __bcount(size) __post __valid __refparam __post __nullterminated) +#define __out_ecount_part_z(size,length) _SAL1_Source_(__out_ecount_part_z, (size,length), __out_ecount_part(size,length) __post __nullterminated) +#define __out_bcount_part_z(size,length) _SAL1_Source_(__out_bcount_part_z, (size,length), __out_bcount_part(size,length) __post __nullterminated) +#define __out_ecount_full_z(size) _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated) +#define __out_bcount_full_z(size) _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated) +#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam) +#define __out_nz_opt _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull) /* __out_nz, except that the pointer may be NULL */ +#define __out_ecount_nz(size) _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam) +#define __out_bcount_nz(size) _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam) +#define __inout _SAL1_Source_(__inout, (), _Inout_) +#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size)) +#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size)) +#define __inout_ecount_part(size,length) _SAL1_Source_(__inout_ecount_part, (size,length), _Inout_updates_to_(size,length)) +#define
__inout_bcount_part(size,length) _SAL1_Source_(__inout_bcount_part, (size,length), _Inout_updates_bytes_to_(size,length)) +#define __inout_ecount_full(size) _SAL1_Source_(__inout_ecount_full, (size), _Inout_updates_all_(size)) +#define __inout_bcount_full(size) _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size)) +#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_) +#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size)) +#define __inout_bcount_z(size) _SAL1_Source_(__inout_bcount_z, (size), __inout_bcount(size) __pre __nullterminated __post __nullterminated) +#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout) +#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size)) +#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size)) +#define __ecount_opt(size) _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull) +#define __bcount_opt(size) _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull) +#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_) +#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size)) +#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size)) +#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_) +#define __in_ecount_z_opt(size) _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated) +#define __in_bcount_z_opt(size) _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated) +#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt) +#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size)) +#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size)) +#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_) +#define __out_ecount_opt(size) 
_SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size)) +#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size)) +#define __out_ecount_part_opt(size,length) _SAL1_Source_(__out_ecount_part_opt, (size,length), __out_ecount_part(size,length) __pre_except_maybenull) +#define __out_bcount_part_opt(size,length) _SAL1_Source_(__out_bcount_part_opt, (size,length), __out_bcount_part(size,length) __pre_except_maybenull) +#define __out_ecount_full_opt(size) _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull) +#define __out_bcount_full_opt(size) _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull) +#define __out_ecount_z_opt(size) _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_z_opt(size) _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __out_ecount_part_z_opt(size,length) _SAL1_Source_(__out_ecount_part_z_opt, (size,length), __out_ecount_part_opt(size,length) __post __nullterminated) +#define __out_bcount_part_z_opt(size,length) _SAL1_Source_(__out_bcount_part_z_opt, (size,length), __out_bcount_part_opt(size,length) __post __nullterminated) +#define __out_ecount_full_z_opt(size) _SAL1_Source_(__out_ecount_full_z_opt, (size), __out_ecount_full_opt(size) __post __nullterminated) +#define __out_bcount_full_z_opt(size) _SAL1_Source_(__out_bcount_full_z_opt, (size), __out_bcount_full_opt(size) __post __nullterminated) +#define __out_ecount_nz_opt(size) _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size)) /* _nz: no null-termination annotation, matching the other _nz macros */ +#define __out_bcount_nz_opt(size) _SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size)) /* _nz: no null-termination annotation, matching the other _nz macros */ +#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_) +#define __inout_ecount_opt(size) _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size)
__pre_except_maybenull) +#define __inout_bcount_opt(size) _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull) +#define __inout_ecount_part_opt(size,length) _SAL1_Source_(__inout_ecount_part_opt, (size,length), __inout_ecount_part(size,length) __pre_except_maybenull) +#define __inout_bcount_part_opt(size,length) _SAL1_Source_(__inout_bcount_part_opt, (size,length), __inout_bcount_part(size,length) __pre_except_maybenull) +#define __inout_ecount_full_opt(size) _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull) +#define __inout_bcount_full_opt(size) _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull) +#define __inout_z_opt _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated) +#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) +/* __inout_bcount_z_opt: byte-count counterpart of __inout_ecount_z_opt; null-terminated at entry and exit, as in __inout_bcount_z. */ +#define __inout_bcount_z_opt(size) _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size) __pre __nullterminated __post __nullterminated) +#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt) +#define __inout_ecount_nz_opt(size) _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size)) +#define __inout_bcount_nz_opt(size) _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size)) +#define __deref_ecount(size) _SAL1_Source_(__deref_ecount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size)) +#define __deref_bcount(size) _SAL1_Source_(__deref_bcount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size)) +#define __deref_out
_SAL1_Source_(__deref_out, (), _Outptr_) +#define __deref_out_ecount(size) _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size)) +#define __deref_out_bcount(size) _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size)) +#define __deref_out_ecount_part(size,length) _SAL1_Source_(__deref_out_ecount_part, (size,length), _Outptr_result_buffer_to_(size,length)) +#define __deref_out_bcount_part(size,length) _SAL1_Source_(__deref_out_bcount_part, (size,length), _Outptr_result_bytebuffer_to_(size,length)) +#define __deref_out_ecount_full(size) _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size,size)) +#define __deref_out_bcount_full(size) _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size,size)) +#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_) +#define __deref_out_ecount_z(size) _SAL1_Source_(__deref_out_ecount_z, (size), __deref_out_ecount(size) __post __deref __nullterminated) +#define __deref_out_bcount_z(size) _SAL1_Source_(__deref_out_bcount_z, (size), __deref_out_bcount(size) __post __deref __nullterminated) +#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out) +#define __deref_out_ecount_nz(size) _SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size)) +#define __deref_out_bcount_nz(size) _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_bcount(size)) /* byte-count form: forwards to __deref_out_bcount, not the element-count macro */ +#define __deref_inout _SAL1_Source_(__deref_inout, (), _Notref_ __notnull _Notref_ __elem_readableTo(1) __pre __deref __valid __post _Notref_ __deref __valid __refparam) +#define __deref_inout_z _SAL1_Source_(__deref_inout_z, (), __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated) +#define __deref_inout_ecount(size) _SAL1_Source_(__deref_inout_ecount, (size), __deref_inout __pre __deref __elem_writableTo(size) __post _Notref_ __deref __elem_writableTo(size)) +#define __deref_inout_bcount(size) _SAL1_Source_(__deref_inout_bcount,
(size), __deref_inout __pre __deref __byte_writableTo(size) __post _Notref_ __deref __byte_writableTo(size)) +#define __deref_inout_ecount_part(size,length) _SAL1_Source_(__deref_inout_ecount_part, (size,length), __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length)) +#define __deref_inout_bcount_part(size,length) _SAL1_Source_(__deref_inout_bcount_part, (size,length), __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length)) +#define __deref_inout_ecount_full(size) _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size,size)) +#define __deref_inout_bcount_full(size) _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size,size)) +#define __deref_inout_ecount_z(size) _SAL1_Source_(__deref_inout_ecount_z, (size), __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z(size) _SAL1_Source_(__deref_inout_bcount_z, (size), __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout) +#define __deref_inout_ecount_nz(size) _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size)) +#define __deref_inout_bcount_nz(size) _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_ecount(size)) +#define __deref_ecount_opt(size) _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull) +#define __deref_bcount_opt(size) _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull) +#define __deref_out_opt _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull) +#define __deref_out_ecount_opt(size) _SAL1_Source_(__deref_out_ecount_opt, (size), __deref_out_ecount(size) __post_deref_except_maybenull) +#define __deref_out_bcount_opt(size) 
_SAL1_Source_(__deref_out_bcount_opt, (size), __deref_out_bcount(size) __post_deref_except_maybenull) +#define __deref_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_out_ecount_part_opt, (size,length), __deref_out_ecount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_out_bcount_part_opt, (size,length), __deref_out_bcount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_ecount_full_opt(size) _SAL1_Source_(__deref_out_ecount_full_opt, (size), __deref_out_ecount_full(size) __post_deref_except_maybenull) +#define __deref_out_bcount_full_opt(size) _SAL1_Source_(__deref_out_bcount_full_opt, (size), __deref_out_bcount_full(size) __post_deref_except_maybenull) +#define __deref_out_z_opt _SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_) +#define __deref_out_ecount_z_opt(size) _SAL1_Source_(__deref_out_ecount_z_opt, (size), __deref_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_out_bcount_z_opt(size) _SAL1_Source_(__deref_out_bcount_z_opt, (size), __deref_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt) +#define __deref_out_ecount_nz_opt(size) _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size)) +#define __deref_out_bcount_nz_opt(size) _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size)) +#define __deref_inout_opt _SAL1_Source_(__deref_inout_opt, (), __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_opt(size) _SAL1_Source_(__deref_inout_ecount_opt, (size), __deref_inout_ecount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_opt(size) _SAL1_Source_(__deref_inout_bcount_opt, (size), __deref_inout_bcount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define 
__deref_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_full_opt(size) _SAL1_Source_(__deref_inout_ecount_full_opt, (size), __deref_inout_ecount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_full_opt(size) _SAL1_Source_(__deref_inout_bcount_full_opt, (size), __deref_inout_bcount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_z_opt _SAL1_Source_(__deref_inout_z_opt, (), __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_ecount_z_opt(size) _SAL1_Source_(__deref_inout_ecount_z_opt, (size), __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z_opt(size) _SAL1_Source_(__deref_inout_bcount_z_opt, (size), __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt) +#define __deref_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size)) +#define __deref_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size)) +#define __deref_opt_ecount(size) _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull) +#define __deref_opt_bcount(size) _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull) +#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_) +#define 
__deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_) +#define __deref_opt_out_ecount(size) _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull) +#define __deref_opt_out_bcount(size) _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part(size,length) _SAL1_Source_(__deref_opt_out_ecount_part, (size,length), __deref_out_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part(size,length) _SAL1_Source_(__deref_opt_out_bcount_part, (size,length), __deref_out_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full(size) _SAL1_Source_(__deref_opt_out_ecount_full, (size), __deref_out_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full(size) _SAL1_Source_(__deref_opt_out_bcount_full, (size), __deref_out_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_) +#define __deref_opt_inout_ecount(size) _SAL1_Source_(__deref_opt_inout_ecount, (size), __deref_inout_ecount(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount(size) _SAL1_Source_(__deref_opt_inout_bcount, (size), __deref_inout_bcount(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part, (size,length), __deref_inout_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part, (size,length), __deref_inout_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full(size) _SAL1_Source_(__deref_opt_inout_ecount_full, (size), __deref_inout_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full(size) _SAL1_Source_(__deref_opt_inout_bcount_full, (size), __deref_inout_bcount_full(size) __pre_except_maybenull) +#define 
__deref_opt_inout_z _SAL1_Source_(__deref_opt_inout_z, (), __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z(size) _SAL1_Source_(__deref_opt_inout_ecount_z, (size), __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z(size) _SAL1_Source_(__deref_opt_inout_bcount_z, (size), __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout) +#define __deref_opt_inout_ecount_nz(size) _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size)) +#define __deref_opt_inout_bcount_nz(size) _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size)) +#define __deref_opt_ecount_opt(size) _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_bcount_opt(size) _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_) +#define __deref_opt_out_ecount_opt(size) _SAL1_Source_(__deref_opt_out_ecount_opt, (size), __deref_out_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_opt(size) _SAL1_Source_(__deref_opt_out_bcount_opt, (size), __deref_out_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size,length), __deref_out_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size,length), __deref_out_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full_opt(size) _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), 
__deref_out_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full_opt(size) _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), __deref_out_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_z_opt _SAL1_Source_(__deref_opt_out_z_opt, (), __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull __post_deref_except_maybenull __post __deref __nullterminated) +#define __deref_opt_out_ecount_z_opt(size) _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), __deref_opt_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_bcount_z_opt(size) _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), __deref_opt_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt) +#define __deref_opt_out_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size)) +#define __deref_opt_out_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size)) +#define __deref_opt_inout_opt _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull) +#define __deref_opt_inout_ecount_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), __deref_inout_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), __deref_inout_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full_opt(size) 
_SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), __deref_inout_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), __deref_inout_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_z_opt _SAL1_Source_(__deref_opt_inout_z_opt, (), __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt) +#define __deref_opt_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size)) +#define __deref_opt_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size)) + +/* +------------------------------------------------------------------------------- +Advanced Annotation Definitions + +Any of these may be used to directly annotate functions, and may be used in +combination with each other or with regular buffer macros. For an explanation +of each annotation, see the advanced annotations section. 
+------------------------------------------------------------------------------- +*/ + +#define __success(expr) _Success_(expr) +#define __nullterminated _Null_terminated_ +#define __nullnullterminated +#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_) +#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_) +#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype)) +#define __override __inner_override +#define __callback __inner_callback +#define __format_string _Printf_format_string_ +#define __blocksOn(resource) __inner_blocksOn(resource) +#define __control_entrypoint(category) __inner_control_entrypoint(category) +#define __data_entrypoint(category) __inner_data_entrypoint(category) +#define __useHeader _Use_decl_anno_impl_ +#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_) + +#ifndef __fallthrough // [ + __inner_fallthrough_dec + #define __fallthrough __inner_fallthrough +#endif // ] + +#ifndef __analysis_assume // [ +#ifdef _PREFAST_ // [ +#define __analysis_assume(expr) __assume(expr) +#else // ][ +#define __analysis_assume(expr) +#endif // ] +#endif // ] + +#ifndef _Analysis_assume_ // [ +#ifdef _PREFAST_ // [ +#define _Analysis_assume_(expr) __assume(expr) +#else // ][ +#define _Analysis_assume_(expr) +#endif // ] +#endif // ] + +#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates)) + +#ifdef _PREFAST_ // [ +__inline __nothrow +void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p); + +#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x) +#else // ][ +#define _Analysis_assume_nullterminated_(x) +#endif // ] + +// +// Set the analysis mode (global flags to analysis). +// They take effect at the point of declaration; use at global scope +// as a declaration. +// + +// Synthesize a unique symbol. 
+#define ___MKID(x, y) x ## y +#define __MKID(x, y) ___MKID(x, y) +#define __GENSYM(x) __MKID(x, __COUNTER__) + +__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);) + +#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode) + +#define _Analysis_mode_(mode) \ + typedef _Analysis_mode_impl_(mode) int \ + __GENSYM(__prefast_analysis_mode_flag); + +// The following are predefined: +// _Analysis_operator_new_throw_ (operator new throws) +// _Analysis_operator_new_null_ (operator new returns null) +// _Analysis_operator_new_never_fails_ (operator new never fails) +// + +// Function class annotations. +__ANNOTATION(SAL_functionClassNew(__In_impl_ char*);) +__PRIMOP(int, _In_function_class_(__In_impl_ char*);) +#define _In_function_class_(x) _In_function_class_(#x) + +#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x) + +/* + * interlocked operand used in interlocked instructions + */ +//#define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked) + +#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag) +#define _Strict_type_match_ _SA_annotes0(SAL_strictType2) + +#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry,__yes) +#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_) + +#ifdef __cplusplus // [ +} +#endif // ] + +// Rotor doesn't need concurrency sal. 
+// #include + +#define _Interlocked_operand_ + +#endif //__ATTR_SAL diff --git a/WickedEngine/WickedEngine_Linux.vcxproj b/WickedEngine/WickedEngine_Linux.vcxproj new file mode 100644 index 000000000..4e3919c33 --- /dev/null +++ b/WickedEngine/WickedEngine_Linux.vcxproj @@ -0,0 +1,211 @@ + + + + + Debug + ARM + + + Release + ARM + + + Debug + ARM64 + + + Release + ARM64 + + + Debug + x86 + + + Release + x86 + + + Debug + x64 + + + Release + x64 + + + + {d294c41d-d886-4b95-9fd6-ee13eee8d976} + Linux + WickedEngine_Linux + 15.0 + Linux + 1.0 + Generic + {2238F9CD-F817-4ECC-BD14-2524D2669B35} + + + + true + Remote_Clang_1_0 + StaticLibrary + + + false + Remote_Clang_1_0 + StaticLibrary + + + true + Remote_Clang_1_0 + StaticLibrary + + + false + Remote_Clang_1_0 + StaticLibrary + + + true + Remote_Clang_1_0 + StaticLibrary + + + false + Remote_Clang_1_0 + StaticLibrary + + + false + Remote_Clang_1_0 + StaticLibrary + + + true + Remote_Clang_1_0 + StaticLibrary + + + + + + + + + + Utility + + + Utility + + + Utility + + + Utility + + + Utility + + + Utility + + + Utility + + + Utility + + + + + + $(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + $(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + $(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + $(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + 
$(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + $(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + $(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + $(VULKAN_SDK)/Include;BULLET;%(AdditionalIncludeDirectories);$(MSBuildThisFileDirectory) + %(PreprocessorDefinitions) + -fdeclspec %(AdditionalOptions) + c++17 + + + -pthread %(AdditionalOptions) + + + + + \ No newline at end of file diff --git a/WickedEngine/WickedEngine_SOURCE.vcxitems b/WickedEngine/WickedEngine_SOURCE.vcxitems index 257eba833..9bf0a1265 100644 --- a/WickedEngine/WickedEngine_SOURCE.vcxitems +++ b/WickedEngine/WickedEngine_SOURCE.vcxitems @@ -257,7 +257,13 @@ + + + + + + @@ -623,9 +629,6 @@ false - - false - false @@ -655,6 +658,7 @@ + @@ -753,6 +757,12 @@ + + + + + + diff --git a/WickedEngine/WickedEngine_SOURCE.vcxitems.filters b/WickedEngine/WickedEngine_SOURCE.vcxitems.filters index 98d327f34..b1bf00706 100644 --- a/WickedEngine/WickedEngine_SOURCE.vcxitems.filters +++ b/WickedEngine/WickedEngine_SOURCE.vcxitems.filters @@ -1131,6 +1131,24 @@ ENGINE\Scripting\LuaBindings + + UTILITY + + + UTILITY + + + UTILITY + + + UTILITY + + + UTILITY + + + UTILITY + @@ -1223,9 +1241,6 @@ LUA - - LUA - LUA @@ -1892,6 +1907,9 @@ ENGINE\Scripting\LuaBindings + + ENGINE\Network + @@ -1944,6 +1962,24 @@ scripts + + UTILITY + + + UTILITY + + + UTILITY + + + UTILITY + + + UTILITY + + + UTILITY + diff --git a/WickedEngine/WickedEngine_Windows.vcxproj b/WickedEngine/WickedEngine_Windows.vcxproj index c20bd30d1..aa6e6b6ee 100644 --- 
a/WickedEngine/WickedEngine_Windows.vcxproj +++ b/WickedEngine/WickedEngine_Windows.vcxproj @@ -184,6 +184,7 @@ $(VULKAN_SDK)/Lib32;%(AdditionalLibraryDirectories);%(AdditionalLibraryDirectories) + true @@ -213,6 +214,7 @@ $(VULKAN_SDK)/Lib;%(AdditionalLibraryDirectories);%(AdditionalLibraryDirectories) + true diff --git a/WickedEngine/wiAudio.cpp b/WickedEngine/wiAudio.cpp index 77aa39e4f..5e6f8e80f 100644 --- a/WickedEngine/wiAudio.cpp +++ b/WickedEngine/wiAudio.cpp @@ -4,12 +4,15 @@ #include +#ifdef _WIN32 + #include // ComPtr #include #include #include #pragma comment(lib,"xaudio2.lib") + #ifdef _XBOX //Big-Endian #define fourccRIFF 'RIFF' #define fourccDATA 'data' @@ -493,3 +496,30 @@ namespace wiAudio assert(SUCCEEDED(hr)); } } + +#else + +namespace wiAudio +{ + void Initialize() {} + + bool CreateSound(const std::string& filename, Sound* sound) { return false; } + bool CreateSound(const std::vector& data, Sound* sound) { return false; } + bool CreateSoundInstance(const Sound* sound, SoundInstance* instance) { return false; } + + void Play(SoundInstance* instance) {} + void Pause(SoundInstance* instance) {} + void Stop(SoundInstance* instance) {} + void SetVolume(float volume, SoundInstance* instance) {} + float GetVolume(const SoundInstance* instance) { return 0; } + void ExitLoop(SoundInstance* instance) {} + + void SetSubmixVolume(SUBMIX_TYPE type, float volume) {} + float GetSubmixVolume(SUBMIX_TYPE type) { return 0; } + + void Update3D(SoundInstance* instance, const SoundInstance3D& instance3D) {} + + void SetReverb(REVERB_PRESET preset) {} +} + +#endif // _WIN32 diff --git a/WickedEngine/wiBackLog.cpp b/WickedEngine/wiBackLog.cpp index 0432b37a9..d825f2475 100644 --- a/WickedEngine/wiBackLog.cpp +++ b/WickedEngine/wiBackLog.cpp @@ -9,6 +9,7 @@ #include "wiImage.h" #include "wiLua.h" #include "wiInput.h" +#include "wiPlatform.h" #include #include @@ -151,14 +152,19 @@ namespace wiBackLog } void post(const char* input) { - logLock.lock(); stringstream ss(""); ss << input << endl; + + logLock.lock();
stream.push_back(ss.str().c_str()); if (stream.size() > deletefromline) { stream.pop_front(); } logLock.unlock(); + +#ifdef _WIN32 + OutputDebugStringA(ss.str().c_str()); +#endif // _WIN32 } void input(const char& input) { diff --git a/WickedEngine/wiGraphics.h b/WickedEngine/wiGraphics.h index ae6a284bb..1da614ab6 100644 --- a/WickedEngine/wiGraphics.h +++ b/WickedEngine/wiGraphics.h @@ -479,24 +479,28 @@ namespace wiGraphics IMAGE_BARRIER, // image layout transition BUFFER_BARRIER, // buffer state transition } type = MEMORY_BARRIER; + + struct Memory + { + const GPUResource* resource; + }; + struct Image + { + const Texture* texture; + IMAGE_LAYOUT layout_before; + IMAGE_LAYOUT layout_after; + }; + struct Buffer + { + const GPUBuffer* buffer; + BUFFER_STATE state_before; + BUFFER_STATE state_after; + }; union { - struct Memory - { - const GPUResource* resource; - } memory; - struct Image - { - const Texture* texture; - IMAGE_LAYOUT layout_before; - IMAGE_LAYOUT layout_after; - } image; - struct Buffer - { - const GPUBuffer* buffer; - BUFFER_STATE state_before; - BUFFER_STATE state_after; - } buffer; + Memory memory; + Image image; + Buffer buffer; }; static GPUBarrier Memory(const GPUResource* resource = nullptr) diff --git a/WickedEngine/wiGraphicsDevice.cpp b/WickedEngine/wiGraphicsDevice.cpp index a86e9f23a..e56e4ef99 100644 --- a/WickedEngine/wiGraphicsDevice.cpp +++ b/WickedEngine/wiGraphicsDevice.cpp @@ -1,11 +1,13 @@ #include "wiGraphicsDevice.h" #include "wiPlatform.h" +#ifdef _WIN32 // These will let the driver select the dedicated GPU in favour of the integrated one: extern "C" { _declspec(dllexport) DWORD NvOptimusEnablement = 0x00000001; _declspec(dllexport) int AmdPowerXpressRequestHighPerformance = 1; } +#endif // _WIN32 using namespace wiGraphics; diff --git a/WickedEngine/wiGraphicsDevice_DX11.cpp b/WickedEngine/wiGraphicsDevice_DX11.cpp index 2b255fd38..9278eaec2 100644 --- a/WickedEngine/wiGraphicsDevice_DX11.cpp +++ 
b/WickedEngine/wiGraphicsDevice_DX11.cpp @@ -1,4 +1,7 @@ #include "wiGraphicsDevice_DX11.h" + +#ifdef WICKEDENGINE_BUILD_DX11 + #include "wiHelper.h" #include "ResourceMapping.h" #include "wiBackLog.h" @@ -3143,3 +3146,5 @@ void GraphicsDevice_DX11::SetMarker(const char* name, CommandList cmd) } } + +#endif // WICKEDENGINE_BUILD_DX11 diff --git a/WickedEngine/wiGraphicsDevice_DX11.h b/WickedEngine/wiGraphicsDevice_DX11.h index 616314b1a..3230fdf7a 100644 --- a/WickedEngine/wiGraphicsDevice_DX11.h +++ b/WickedEngine/wiGraphicsDevice_DX11.h @@ -1,4 +1,10 @@ #pragma once + +#if __has_include("d3d11_3.h") +#define WICKEDENGINE_BUILD_DX11 +#endif // __has_include("d3d11_3.h") + +#ifdef WICKEDENGINE_BUILD_DX11 #include "CommonInclude.h" #include "wiGraphicsDevice.h" #include "wiPlatform.h" @@ -145,3 +151,5 @@ namespace wiGraphics }; } + +#endif // WICKEDENGINE_BUILD_DX11 diff --git a/WickedEngine/wiGraphicsDevice_DX12.cpp b/WickedEngine/wiGraphicsDevice_DX12.cpp index 34068c809..333a54e68 100644 --- a/WickedEngine/wiGraphicsDevice_DX12.cpp +++ b/WickedEngine/wiGraphicsDevice_DX12.cpp @@ -1,4 +1,7 @@ #include "wiGraphicsDevice_DX12.h" + +#ifdef WICKEDENGINE_BUILD_DX12 + #include "wiGraphicsDevice_SharedInternals.h" #include "wiHelper.h" #include "ResourceMapping.h" @@ -4130,3 +4133,5 @@ using namespace DX12_Internal; } + +#endif // WICKEDENGINE_BUILD_DX12 diff --git a/WickedEngine/wiGraphicsDevice_DX12.h b/WickedEngine/wiGraphicsDevice_DX12.h index 751706b66..238ffc273 100644 --- a/WickedEngine/wiGraphicsDevice_DX12.h +++ b/WickedEngine/wiGraphicsDevice_DX12.h @@ -1,4 +1,10 @@ #pragma once + +#if __has_include("d3d12.h") +#define WICKEDENGINE_BUILD_DX12 +#endif // __has_include("d3d12.h") + +#ifdef WICKEDENGINE_BUILD_DX12 #include "CommonInclude.h" #include "wiGraphicsDevice.h" #include "wiPlatform.h" @@ -434,3 +440,5 @@ namespace wiGraphics }; } + +#endif // WICKEDENGINE_BUILD_DX12 diff --git a/WickedEngine/wiHelper.cpp b/WickedEngine/wiHelper.cpp index 107b15a54..bd89d4ee4 100644 ---
a/WickedEngine/wiHelper.cpp +++ b/WickedEngine/wiHelper.cpp @@ -7,7 +7,6 @@ #include #include -#include #include #include #include @@ -51,7 +50,9 @@ namespace wiHelper void screenshot(const std::string& name) { +#ifdef _WIN32 CreateDirectoryA("screenshots", 0); +#endif // _WIN32 stringstream ss(""); if (name.length() <= 0) ss << GetOriginalWorkingDirectory() << "screenshots/sc_" << getCurrentDateTimeAsString() << ".jpg"; @@ -83,13 +84,13 @@ namespace wiHelper staging_desc.CPUAccessFlags = CPU_ACCESS_READ; staging_desc.BindFlags = 0; staging_desc.MiscFlags = 0; - HRESULT hr = device->CreateTexture(&staging_desc, nullptr, &stagingTex); - assert(SUCCEEDED(hr)); + bool success = device->CreateTexture(&staging_desc, nullptr, &stagingTex); + assert(success); - bool download_success = device->DownloadResource(&texture, &stagingTex, data.data()); - assert(download_success); + success = device->DownloadResource(&texture, &stagingTex, data.data()); + assert(success); - return download_success; + return success; } bool saveTextureToFile(const wiGraphics::Texture& texture, const string& fileName) @@ -183,7 +184,11 @@ namespace wiHelper { time_t t = std::time(nullptr); struct tm time_info; +#ifdef _WIN32 localtime_s(&time_info, &t); +#else + localtime_r(&t, &time_info); +#endif stringstream ss(""); ss << std::put_time(&time_info, "%d-%m-%Y %H-%M-%S"); return ss.str(); @@ -195,15 +200,24 @@ namespace wiHelper static bool initComplete = false; if (!initComplete) { +#ifdef _WIN32 CHAR fileName[1024] = {}; GetModuleFileNameA(NULL, fileName, arraysize(fileName)); appDir = GetDirectoryFromPath(fileName); +#else + // TODO +#endif // _WIN32 initComplete = true; } return appDir; } +#ifdef _WIN32 +#include std::string workingdir = std::string(_getcwd(NULL, 0)) + "/"; +#else + std::string workingdir = ""; // TODO +#endif const std::string __originalWorkingDir = workingdir; string GetOriginalWorkingDirectory() { @@ -220,63 +234,6 @@ namespace wiHelper workingdir = path; } - void
GetFilesInDirectory(std::vector& out, const std::string& directory) - { -#ifndef PLATFORM_UWP - // WINDOWS - wstring wdirectory; - StringConvert(directory, wdirectory); - HANDLE dir; - WIN32_FIND_DATA file_data; - - if ((dir = FindFirstFile((wdirectory + L"/*").c_str(), &file_data)) == INVALID_HANDLE_VALUE) - return; /* No files found */ - - do { - const wstring file_name = file_data.cFileName; - const wstring full_file_name = wdirectory + L"/" + file_name; - const bool is_directory = (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0; - - //if (file_name[0] == '.') - // continue; - - //if (is_directory) - // continue; - - string fname; - StringConvert(full_file_name, fname); - out.push_back(fname); - } while (FindNextFile(dir, &file_data)); - - FindClose(dir); -#endif - - // UNIX - //DIR *dir; - //class dirent *ent; - //class stat st; - - //dir = opendir(directory); - //while ((ent = readdir(dir)) != NULL) { - // const string file_name = ent->d_name; - // const string full_file_name = directory + "/" + file_name; - - // if (file_name[0] == '.') - // continue; - - // if (stat(full_file_name.c_str(), &st) == -1) - // continue; - - // const bool is_directory = (st.st_mode & S_IFDIR) != 0; - - // if (is_directory) - // continue; - - // out.push_back(full_file_name); - //} - //closedir(dir); - } - void SplitPath(const std::string& fullPath, string& dir, string& fileName) { size_t found; @@ -730,52 +687,73 @@ namespace wiHelper } #endif // PLATFORM_UWP + +#else + + // TODO + #endif // _WIN32 } void StringConvert(const std::string& from, std::wstring& to) { +#ifdef _WIN32 int num = MultiByteToWideChar(CP_UTF8, 0, from.c_str(), -1, NULL, 0); if (num > 0) { to.resize(size_t(num) - 1); MultiByteToWideChar(CP_UTF8, 0, from.c_str(), -1, &to[0], num); } +#else + to = std::wstring(from.begin(), from.end()); // TODO +#endif // _WIN32 } void StringConvert(const std::wstring& from, std::string& to) { +#ifdef _WIN32 int num = WideCharToMultiByte(CP_UTF8, 0, from.c_str(), 
-1, NULL, 0, NULL, NULL); if (num > 0) { to.resize(size_t(num) - 1); WideCharToMultiByte(CP_UTF8, 0, from.c_str(), -1, &to[0], num, NULL, NULL); } +#else + to = std::string(from.begin(), from.end()); // TODO +#endif // _WIN32 } int StringConvert(const char* from, wchar_t* to) { +#ifdef _WIN32 int num = MultiByteToWideChar(CP_UTF8, 0, from, -1, NULL, 0); if (num > 0) { MultiByteToWideChar(CP_UTF8, 0, from, -1, &to[0], num); } +#else + int num = 0; // TODO +#endif // _WIN32 return num; } int StringConvert(const wchar_t* from, char* to) { +#ifdef _WIN32 int num = WideCharToMultiByte(CP_UTF8, 0, from, -1, NULL, 0, NULL, NULL); if (num > 0) { WideCharToMultiByte(CP_UTF8, 0, from, -1, &to[0], num, NULL, NULL); } +#else + int num = 0; // TODO +#endif // _WIN32 return num; } void Sleep(float milliseconds) { - ::Sleep((DWORD)milliseconds); + std::this_thread::sleep_for(std::chrono::milliseconds((int)milliseconds)); } void Spin(float milliseconds) diff --git a/WickedEngine/wiInput.cpp b/WickedEngine/wiInput.cpp index f835e36cb..2f61f1606 100644 --- a/WickedEngine/wiInput.cpp +++ b/WickedEngine/wiInput.cpp @@ -17,6 +17,7 @@ using namespace std; namespace wiInput { +#ifdef _WIN32 #ifndef PLATFORM_UWP #define KEY_DOWN(vk_code) (GetAsyncKeyState(vk_code) < 0) #define KEY_TOGGLE(vk_code) ((GetAsyncKeyState(vk_code) & 1) != 0) @@ -24,6 +25,10 @@ namespace wiInput #define KEY_DOWN(vk_code) ((int)wiPlatform::GetWindow()->GetAsyncKeyState((Windows::System::VirtualKey)vk_code) < 0) #define KEY_TOGGLE(vk_code) (((int)wiPlatform::GetWindow()->GetAsyncKeyState((Windows::System::VirtualKey)vk_code) & 1) != 0) #endif //PLATFORM_UWP +#else +#define KEY_DOWN(vk_code) 0 +#define KEY_TOGGLE(vk_code) 0 +#endif // WIN32 #define KEY_UP(vk_code) (!KEY_DOWN(vk_code)) KeyboardState keyboard; @@ -358,6 +363,7 @@ namespace wiInput return true; return false; break; +#ifdef _WIN32 case wiInput::KEYBOARD_BUTTON_UP: keycode = VK_UP; break; @@ -442,6 +448,7 @@ namespace wiInput case 
wiInput::KEYBOARD_BUTTON_PAGEUP: keycode = VK_PRIOR; break; +#endif // _WIN32 } return KEY_DOWN(keycode) || KEY_TOGGLE(keycode); @@ -491,7 +498,7 @@ namespace wiInput } XMFLOAT4 GetPointer() { -#ifndef PLATFORM_UWP +#if defined(_WIN32) && !defined(PLATFORM_UWP) POINT p; GetCursorPos(&p); ScreenToClient(wiPlatform::GetWindow(), &p); @@ -503,6 +510,7 @@ namespace wiInput } void SetPointer(const XMFLOAT4& props) { +#ifdef _WIN32 #ifndef PLATFORM_UWP const float dpiscaling = wiPlatform::GetDPIScaling(); POINT p; @@ -515,9 +523,11 @@ namespace wiInput auto& bounds = window->Bounds; window->PointerPosition = Point(props.x + bounds.X, props.y + bounds.Y); #endif +#endif // _WIN32 } void HidePointer(bool value) { +#ifdef _WIN32 #ifndef PLATFORM_UWP if (value) { @@ -538,6 +548,7 @@ namespace wiInput wiPlatform::GetWindow()->PointerCursor = cursor; } #endif +#endif // _WIN32 } XMFLOAT4 GetAnalog(GAMEPAD_ANALOG analog, int playerindex) diff --git a/WickedEngine/wiLua.cpp b/WickedEngine/wiLua.cpp index a5ca088b9..3f177a08c 100644 --- a/WickedEngine/wiLua.cpp +++ b/WickedEngine/wiLua.cpp @@ -61,8 +61,6 @@ int Internal_DoFile(lua_State* L) stringstream ss(""); ss << WILUA_ERROR_PREFIX << str; wiBackLog::post(ss.str().c_str()); - ss << endl; - OutputDebugStringA(ss.str().c_str()); lua_pop(L, 1); // remove error message } } @@ -80,7 +78,6 @@ wiLua::wiLua() m_luaState = NULL; m_luaState = luaL_newstate(); luaL_openlibs(m_luaState); - RegisterFunc("debugout", DebugOut); RegisterFunc("dofile", Internal_DoFile); RunText(wiLua_Globals); } @@ -151,7 +148,7 @@ string wiLua::PopErrorMsg() lock.unlock(); return retVal; } -void wiLua::PostErrorMsg(bool todebug, bool tobacklog) +void wiLua::PostErrorMsg() { if (Failed()) { @@ -162,15 +159,7 @@ void wiLua::PostErrorMsg(bool todebug, bool tobacklog) return; stringstream ss(""); ss << WILUA_ERROR_PREFIX << str; - if (tobacklog) - { - wiBackLog::post(ss.str().c_str()); - } - if (todebug) - { - ss << endl; - OutputDebugStringA(ss.str().c_str()); 
- } + wiBackLog::post(ss.str().c_str()); lock.lock(); lua_pop(m_luaState, 1); // remove error message lock.unlock(); @@ -335,31 +324,6 @@ void wiLua::KillProcesses() RunText("killProcesses();"); } -int wiLua::DebugOut(lua_State* L) -{ - int argc = lua_gettop(L); - - stringstream ss(""); - - for (int i = 1; i <= argc; i++) - { - static mutex sm; - sm.lock(); - const char* str = lua_tostring(L, i); - sm.unlock(); - if (str != nullptr) - { - ss << str; - } - } - ss << endl; - - OutputDebugStringA(ss.str().c_str()); - - //number of results - return 0; -} - string wiLua::SGetString(lua_State* L, int stackpos) { const char* str = lua_tostring(L, stackpos); @@ -475,7 +439,7 @@ void wiLua::SSetNull(lua_State* L) lua_pushnil(L); } -void wiLua::SError(lua_State* L, const std::string& error, bool todebug, bool tobacklog) +void wiLua::SError(lua_State* L, const std::string& error) { //retrieve line number for error info lua_Debug ar; @@ -489,15 +453,7 @@ void wiLua::SError(lua_State* L, const std::string& error, bool todebug, bool to { ss << error; } - if (tobacklog) - { - wiBackLog::post(ss.str().c_str()); - } - if (todebug) - { - ss << endl; - OutputDebugStringA(ss.str().c_str()); - } + wiBackLog::post(ss.str().c_str()); } void wiLua::SAddMetatable(lua_State* L, const std::string& name) diff --git a/WickedEngine/wiLua.h b/WickedEngine/wiLua.h index 35a76c846..7c950f5fc 100644 --- a/WickedEngine/wiLua.h +++ b/WickedEngine/wiLua.h @@ -3,9 +3,9 @@ extern "C" { -#include "LUA\lua.h" -#include "LUA\lualib.h" -#include "LUA\lauxlib.h" +#include "LUA/lua.h" +#include "LUA/lualib.h" +#include "LUA/lauxlib.h" } #include @@ -20,8 +20,6 @@ private: std::mutex lock; - static int DebugOut(lua_State *L); - //run the previously loaded script bool RunScript(); public: @@ -42,7 +40,7 @@ public: //remove and get error message from stack std::string PopErrorMsg(); //post error to backlog and/or debug output - void PostErrorMsg(bool todebug = true, bool tobacklog = true); + void PostErrorMsg(); 
//run a script from file bool RunFile(const std::string& filename); //run a script from param @@ -134,7 +132,7 @@ public: static void SSetNull(lua_State* L); //throw error - static void SError(lua_State* L, const std::string& error = "", bool todebug = true, bool tobacklog = true); + static void SError(lua_State* L, const std::string& error = ""); //add new metatable static void SAddMetatable(lua_State* L, const std::string& name); diff --git a/WickedEngine/wiLuna.h b/WickedEngine/wiLuna.h index b024c6acd..074d7a816 100644 --- a/WickedEngine/wiLuna.h +++ b/WickedEngine/wiLuna.h @@ -3,6 +3,7 @@ //Luna : Official C++ to Lua binder project, 5th version //modified to fit with Wicked Engine, removed warnings + #define lunamethod(class, name) {#name, &class::name} template < class T > class Luna { @@ -221,7 +222,7 @@ public: if (_index >> 8) // Try to set a func { char c[128]; - sprintf_s(c, sizeof(c), "Trying to set the method [%s] of class [%s]", (*obj)->T::methods[_index ^ (1 << 8)].name, T::className); + snprintf(c, sizeof(c), "Trying to set the method [%s] of class [%s]", (*obj)->T::methods[_index ^ (1 << 8)].name, T::className); luaL_error(L, c); return 0; } diff --git a/WickedEngine/wiNetwork_Linux.cpp b/WickedEngine/wiNetwork_Linux.cpp new file mode 100644 index 000000000..93d693be4 --- /dev/null +++ b/WickedEngine/wiNetwork_Linux.cpp @@ -0,0 +1,45 @@ +#include "wiPlatform.h" + +#ifdef PLATFORM_LINUX +#include "wiNetwork.h" +#include "wiBackLog.h" + +namespace wiNetwork +{ + void Initialize() + { + wiBackLog::post("TODO wiNetwork_Linux"); + } + + bool CreateSocket(Socket* sock) + { + return false; + } + bool Destroy(Socket* sock) + { + return false; + } + + bool Send(const Socket* sock, const Connection* connection, const void* data, size_t dataSize) + { + return false; + } + + bool ListenPort(const Socket* sock, uint16_t port) + { + return false; + } + + bool CanReceive(const Socket* sock, long timeout_microseconds) + { + return false; + } + + bool Receive(const 
Socket* sock, Connection* connection, void* data, size_t dataSize) + { + return false; + } + +} + +#endif // LINUX diff --git a/WickedEngine/wiNetwork_UWP.cpp b/WickedEngine/wiNetwork_UWP.cpp index 5ec82ffef..139873ab8 100644 --- a/WickedEngine/wiNetwork_UWP.cpp +++ b/WickedEngine/wiNetwork_UWP.cpp @@ -1,6 +1,6 @@ #include "wiPlatform.h" -#ifdef PLATFORM_UWP +#if defined(_WIN32) && defined(PLATFORM_UWP) #include "wiNetwork.h" #include "wiBackLog.h" @@ -42,4 +42,4 @@ namespace wiNetwork } -#endif // PLATFORM_UWP +#endif // _WIN32 && PLATFORM_UWP diff --git a/WickedEngine/wiNetwork_Windows.cpp b/WickedEngine/wiNetwork_Windows.cpp index 054c6d69c..0013fb5b4 100644 --- a/WickedEngine/wiNetwork_Windows.cpp +++ b/WickedEngine/wiNetwork_Windows.cpp @@ -1,6 +1,6 @@ #include "wiPlatform.h" -#ifndef PLATFORM_UWP +#if defined(_WIN32) && !defined(PLATFORM_UWP) #include "wiNetwork.h" #include "wiBackLog.h" @@ -193,4 +193,4 @@ namespace wiNetwork } -#endif // PLATFORM_UWP +#endif // _WIN32 && !PLATFORM_UWP diff --git a/WickedEngine/wiPlatform.h b/WickedEngine/wiPlatform.h index f8454c6dd..6ff2ad026 100644 --- a/WickedEngine/wiPlatform.h +++ b/WickedEngine/wiPlatform.h @@ -17,6 +17,10 @@ #include #endif // UWP +#else + +#define PLATFORM_LINUX + #endif // _WIN32 @@ -28,6 +32,8 @@ namespace wiPlatform #else using window_type = Platform::Agile; #endif // PLATFORM_UWP +#else + using window_type = int; #endif // _WIN32 struct DeferredMessageBox diff --git a/WickedEngine/wiRectPacker.cpp b/WickedEngine/wiRectPacker.cpp index dee7ddb86..16c74a4e6 100644 --- a/WickedEngine/wiRectPacker.cpp +++ b/WickedEngine/wiRectPacker.cpp @@ -1,6 +1,5 @@ -#pragma once #include "wiRectPacker.h" -#include + #include using namespace std; diff --git a/WickedEngine/wiRectPacker.h b/WickedEngine/wiRectPacker.h index 6ce8dd185..670037807 100644 --- a/WickedEngine/wiRectPacker.h +++ b/WickedEngine/wiRectPacker.h @@ -1,4 +1,6 @@ #pragma once +#include "CommonInclude.h" + #include // NOTE: diff --git 
a/WickedEngine/wiScene.cpp b/WickedEngine/wiScene.cpp index e41f3be94..34748b6bf 100644 --- a/WickedEngine/wiScene.cpp +++ b/WickedEngine/wiScene.cpp @@ -2668,7 +2668,6 @@ namespace wiScene return result; } -#include SceneIntersectSphereResult SceneIntersectSphere(const SPHERE& sphere, uint32_t renderTypeMask, uint32_t layerMask, const Scene& scene) { SceneIntersectSphereResult result; diff --git a/WickedEngine/wiScene_Serializers.cpp b/WickedEngine/wiScene_Serializers.cpp index e2a519803..016ba717c 100644 --- a/WickedEngine/wiScene_Serializers.cpp +++ b/WickedEngine/wiScene_Serializers.cpp @@ -93,7 +93,7 @@ namespace wiScene archive >> roughness; if (archive.GetVersion() < 35) { - roughness = roughness == 0 ? 0 : std::sqrtf(roughness); + roughness = roughness == 0 ? 0 : std::sqrt(roughness); } archive >> reflectance; archive >> metalness; diff --git a/WickedEngine/wiTimer.cpp b/WickedEngine/wiTimer.cpp index 19ebfe7d5..62dcbe332 100644 --- a/WickedEngine/wiTimer.cpp +++ b/WickedEngine/wiTimer.cpp @@ -2,8 +2,8 @@ #include "wiHelper.h" #include "wiPlatform.h" -double wiTimer::PCFreq = 0; -__int64 wiTimer::CounterStart = 0; +static double PCFreq = 0; +static int64_t CounterStart = 0; wiTimer::wiTimer() { @@ -19,6 +19,7 @@ wiTimer::~wiTimer() void wiTimer::Start() { +#ifdef _WIN32 LARGE_INTEGER li; if(!QueryPerformanceFrequency(&li)) wiHelper::messageBox("QueryPerformanceFrequency failed!\n"); @@ -27,12 +28,17 @@ void wiTimer::Start() QueryPerformanceCounter(&li); CounterStart = li.QuadPart; +#endif // _WIN32 } double wiTimer::TotalTime() { +#ifdef _WIN32 LARGE_INTEGER li; QueryPerformanceCounter(&li); return double(li.QuadPart-CounterStart)/PCFreq; +#else + return 0; +#endif // _WIN32 } void wiTimer::record() diff --git a/WickedEngine/wiTimer.h b/WickedEngine/wiTimer.h index 4a13c335c..466bb2039 100644 --- a/WickedEngine/wiTimer.h +++ b/WickedEngine/wiTimer.h @@ -4,9 +4,6 @@ class wiTimer { private: - static double PCFreq; - static __int64 CounterStart; - double 
lastTime; public: wiTimer(); diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index 4ad1f88ac..6155236c2 100644 --- a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wiVersion // minor features, major updates const int minor = 42; // minor bug fixes, alterations, refactors, updates - const int revision = 9; + const int revision = 10; const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision); diff --git a/WickedEngine/wiWidget.cpp b/WickedEngine/wiWidget.cpp index cda326484..0529e5bcf 100644 --- a/WickedEngine/wiWidget.cpp +++ b/WickedEngine/wiWidget.cpp @@ -8,8 +8,6 @@ #include "wiRenderer.h" #include "ShaderInterop_Renderer.h" -#include - #include using namespace std;