diff --git a/r5dev/bonesetup/bone_utils.cpp b/r5dev/bonesetup/bone_utils.cpp new file mode 100644 index 00000000..6154b971 --- /dev/null +++ b/r5dev/bonesetup/bone_utils.cpp @@ -0,0 +1,101 @@ +//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======// +// +// Purpose: +// +// $NoKeywords: $ +// +//===========================================================================// + +#include "core/stdafx.h" +#include "mathlib/mathlib.h" + +//----------------------------------------------------------------------------- +// Purpose: qt = ( s * p ) * q +//----------------------------------------------------------------------------- +void QuaternionSM(float s, const Quaternion& p, const Quaternion& q, Quaternion& qt) +{ + Quaternion p1, q1; + + QuaternionScale(p, s, p1); + QuaternionMult(p1, q, q1); + QuaternionNormalize(q1); + qt[0] = q1[0]; + qt[1] = q1[1]; + qt[2] = q1[2]; + qt[3] = q1[3]; +} + +#if ALLOW_SIMD_QUATERNION_MATH +FORCEINLINE fltx4 QuaternionSMSIMD(const fltx4& s, const fltx4& p, const fltx4& q) +{ + fltx4 p1, q1, result; + p1 = QuaternionScaleSIMD(p, s); + q1 = QuaternionMultSIMD(p1, q); + result = QuaternionNormalizeSIMD(q1); + return result; +} + +FORCEINLINE fltx4 QuaternionSMSIMD(float s, const fltx4& p, const fltx4& q) +{ + return QuaternionSMSIMD(ReplicateX4(s), p, q); +} +#endif + +//----------------------------------------------------------------------------- +// Purpose: qt = p * ( s * q ) +//----------------------------------------------------------------------------- +void QuaternionMA(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt) +{ + Quaternion p1, q1; + + QuaternionScale(q, s, q1); + QuaternionMult(p, q1, p1); + QuaternionNormalize(p1); + qt[0] = p1[0]; + qt[1] = p1[1]; + qt[2] = p1[2]; + qt[3] = p1[3]; +} + +#if ALLOW_SIMD_QUATERNION_MATH + +FORCEINLINE fltx4 QuaternionMASIMD(const fltx4& p, const fltx4& s, const fltx4& q) +{ + fltx4 p1, q1, result; + q1 = QuaternionScaleSIMD(q, s); + p1 = 
QuaternionMultSIMD(p, q1); + result = QuaternionNormalizeSIMD(p1); + return result; +} + +FORCEINLINE fltx4 QuaternionMASIMD(const fltx4& p, float s, const fltx4& q) +{ + return QuaternionMASIMD(p, ReplicateX4(s), q); +} +#endif + + +//----------------------------------------------------------------------------- +// Purpose: qt = p + s * q +//----------------------------------------------------------------------------- +void QuaternionAccumulate(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt) +{ + Quaternion q2; + QuaternionAlign(p, q, q2); + + qt[0] = p[0] + s * q2[0]; + qt[1] = p[1] + s * q2[1]; + qt[2] = p[2] + s * q2[2]; + qt[3] = p[3] + s * q2[3]; +} + +#if ALLOW_SIMD_QUATERNION_MATH +FORCEINLINE fltx4 QuaternionAccumulateSIMD(const fltx4& p, float s, const fltx4& q) +{ + fltx4 q2, s4, result; + q2 = QuaternionAlignSIMD(p, q); + s4 = ReplicateX4(s); + result = MaddSIMD(s4, q2, p); + return result; +} +#endif diff --git a/r5dev/core/init.cpp b/r5dev/core/init.cpp index b672b445..75568b4c 100644 --- a/r5dev/core/init.cpp +++ b/r5dev/core/init.cpp @@ -35,6 +35,7 @@ #ifndef DEDICATED #include "milessdk/win64_rrthreads.h" #endif // !DEDICATED +#include "mathlib/mathlib.h" #include "vphysics/QHull.h" #include "bsplib/bsplib.h" #include "materialsystem/cmaterialsystem.h" @@ -118,9 +119,10 @@ void Systems_Init() { spdlog::info("+-------------------------------------------------------------+\n"); QuerySystemInfo(); - CFastTimer initTimer; + CFastTimer initTimer; initTimer.Start(); + for (IDetour* pDetour : vDetour) { pDetour->GetCon(); @@ -128,13 +130,14 @@ void Systems_Init() pDetour->GetVar(); } initTimer.End(); + spdlog::info("+-------------------------------------------------------------+\n"); spdlog::info("Detour->Init() '{:10.6f}' seconds ('{:12d}' clocks)\n", initTimer.GetDuration().GetSeconds(), initTimer.GetDuration().GetCycles()); initTimer.Start(); - // Initialize WinSock system. - WS_Init(); + WS_Init(); // Initialize WinSock. 
+ MathLib_Init(); // Initialize MathLib. // Begin the detour transaction to hook the the process DetourTransactionBegin(); @@ -404,11 +407,14 @@ void QuerySystemInfo() std::system_category().message(static_cast(::GetLastError()))); } - if (!(pi.m_bSSE && pi.m_bSSE2)) + if (!s_bMathlibInitialized) { - if (MessageBoxA(NULL, "SSE and SSE2 are required.", "Unsupported CPU", MB_ICONERROR | MB_OK)) + if (!(pi.m_bSSE && pi.m_bSSE2)) { - TerminateProcess(GetCurrentProcess(), 0xBAD0C0DE); + if (MessageBoxA(NULL, "SSE and SSE2 are required.", "Unsupported CPU", MB_ICONERROR | MB_OK)) + { + TerminateProcess(GetCurrentProcess(), 0xBAD0C0DE); + } } } } diff --git a/r5dev/mathlib/almostequal.cpp b/r5dev/mathlib/almostequal.cpp index 76f99a74..01865efa 100644 --- a/r5dev/mathlib/almostequal.cpp +++ b/r5dev/mathlib/almostequal.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright © 1996-2008, Valve Corporation, All rights reserved. ============// // // Purpose: Fast ways to compare equality of two floats. Assumes // sizeof(float) == sizeof(int) and we are using IEEE format. diff --git a/r5dev/mathlib/color_conversion.cpp b/r5dev/mathlib/color_conversion.cpp index 37f03fe4..ab7d87b0 100644 --- a/r5dev/mathlib/color_conversion.cpp +++ b/r5dev/mathlib/color_conversion.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: Color conversion routines. // @@ -34,71 +34,71 @@ static float g_Mathlib_LinearToGamma[256]; // linear (0..1) to gamma (0..1) // TODO: move this into the one DLL that actually uses it, instead of statically // linking it everywhere via mathlib. 
ALIGN128 float power2_n[256] = // 2**(index - 128) / 255 -{ - 1.152445441982634800E-041, 2.304890883965269600E-041, 4.609781767930539200E-041, 9.219563535861078400E-041, +{ + 1.152445441982634800E-041, 2.304890883965269600E-041, 4.609781767930539200E-041, 9.219563535861078400E-041, 1.843912707172215700E-040, 3.687825414344431300E-040, 7.375650828688862700E-040, 1.475130165737772500E-039, - 2.950260331475545100E-039, 5.900520662951090200E-039, 1.180104132590218000E-038, 2.360208265180436100E-038, - 4.720416530360872100E-038, 9.440833060721744200E-038, 1.888166612144348800E-037, 3.776333224288697700E-037, - 7.552666448577395400E-037, 1.510533289715479100E-036, 3.021066579430958200E-036, 6.042133158861916300E-036, - 1.208426631772383300E-035, 2.416853263544766500E-035, 4.833706527089533100E-035, 9.667413054179066100E-035, - 1.933482610835813200E-034, 3.866965221671626400E-034, 7.733930443343252900E-034, 1.546786088668650600E-033, - 3.093572177337301200E-033, 6.187144354674602300E-033, 1.237428870934920500E-032, 2.474857741869840900E-032, - 4.949715483739681800E-032, 9.899430967479363700E-032, 1.979886193495872700E-031, 3.959772386991745500E-031, - 7.919544773983491000E-031, 1.583908954796698200E-030, 3.167817909593396400E-030, 6.335635819186792800E-030, - 1.267127163837358600E-029, 2.534254327674717100E-029, 5.068508655349434200E-029, 1.013701731069886800E-028, - 2.027403462139773700E-028, 4.054806924279547400E-028, 8.109613848559094700E-028, 1.621922769711818900E-027, - 3.243845539423637900E-027, 6.487691078847275800E-027, 1.297538215769455200E-026, 2.595076431538910300E-026, - 5.190152863077820600E-026, 1.038030572615564100E-025, 2.076061145231128300E-025, 4.152122290462256500E-025, - 8.304244580924513000E-025, 1.660848916184902600E-024, 3.321697832369805200E-024, 6.643395664739610400E-024, - 1.328679132947922100E-023, 2.657358265895844200E-023, 5.314716531791688300E-023, 1.062943306358337700E-022, - 2.125886612716675300E-022, 4.251773225433350700E-022, 
8.503546450866701300E-022, 1.700709290173340300E-021, - 3.401418580346680500E-021, 6.802837160693361100E-021, 1.360567432138672200E-020, 2.721134864277344400E-020, - 5.442269728554688800E-020, 1.088453945710937800E-019, 2.176907891421875500E-019, 4.353815782843751100E-019, - 8.707631565687502200E-019, 1.741526313137500400E-018, 3.483052626275000900E-018, 6.966105252550001700E-018, - 1.393221050510000300E-017, 2.786442101020000700E-017, 5.572884202040001400E-017, 1.114576840408000300E-016, - 2.229153680816000600E-016, 4.458307361632001100E-016, 8.916614723264002200E-016, 1.783322944652800400E-015, - 3.566645889305600900E-015, 7.133291778611201800E-015, 1.426658355722240400E-014, 2.853316711444480700E-014, - 5.706633422888961400E-014, 1.141326684577792300E-013, 2.282653369155584600E-013, 4.565306738311169100E-013, - 9.130613476622338300E-013, 1.826122695324467700E-012, 3.652245390648935300E-012, 7.304490781297870600E-012, - 1.460898156259574100E-011, 2.921796312519148200E-011, 5.843592625038296500E-011, 1.168718525007659300E-010, - 2.337437050015318600E-010, 4.674874100030637200E-010, 9.349748200061274400E-010, 1.869949640012254900E-009, - 3.739899280024509800E-009, 7.479798560049019500E-009, 1.495959712009803900E-008, 2.991919424019607800E-008, - 5.983838848039215600E-008, 1.196767769607843100E-007, 2.393535539215686200E-007, 4.787071078431372500E-007, - 9.574142156862745000E-007, 1.914828431372549000E-006, 3.829656862745098000E-006, 7.659313725490196000E-006, - 1.531862745098039200E-005, 3.063725490196078400E-005, 6.127450980392156800E-005, 1.225490196078431400E-004, - 2.450980392156862700E-004, 4.901960784313725400E-004, 9.803921568627450800E-004, 1.960784313725490200E-003, - 3.921568627450980300E-003, 7.843137254901960700E-003, 1.568627450980392100E-002, 3.137254901960784300E-002, - 6.274509803921568500E-002, 1.254901960784313700E-001, 2.509803921568627400E-001, 5.019607843137254800E-001, - 1.003921568627451000E+000, 2.007843137254901900E+000, 
4.015686274509803900E+000, 8.031372549019607700E+000, - 1.606274509803921500E+001, 3.212549019607843100E+001, 6.425098039215686200E+001, 1.285019607843137200E+002, - 2.570039215686274500E+002, 5.140078431372548900E+002, 1.028015686274509800E+003, 2.056031372549019600E+003, - 4.112062745098039200E+003, 8.224125490196078300E+003, 1.644825098039215700E+004, 3.289650196078431300E+004, - 6.579300392156862700E+004, 1.315860078431372500E+005, 2.631720156862745100E+005, 5.263440313725490100E+005, - 1.052688062745098000E+006, 2.105376125490196000E+006, 4.210752250980392100E+006, 8.421504501960784200E+006, - 1.684300900392156800E+007, 3.368601800784313700E+007, 6.737203601568627400E+007, 1.347440720313725500E+008, - 2.694881440627450900E+008, 5.389762881254901900E+008, 1.077952576250980400E+009, 2.155905152501960800E+009, - 4.311810305003921500E+009, 8.623620610007843000E+009, 1.724724122001568600E+010, 3.449448244003137200E+010, - 6.898896488006274400E+010, 1.379779297601254900E+011, 2.759558595202509800E+011, 5.519117190405019500E+011, - 1.103823438081003900E+012, 2.207646876162007800E+012, 4.415293752324015600E+012, 8.830587504648031200E+012, - 1.766117500929606200E+013, 3.532235001859212500E+013, 7.064470003718425000E+013, 1.412894000743685000E+014, - 2.825788001487370000E+014, 5.651576002974740000E+014, 1.130315200594948000E+015, 2.260630401189896000E+015, - 4.521260802379792000E+015, 9.042521604759584000E+015, 1.808504320951916800E+016, 3.617008641903833600E+016, - 7.234017283807667200E+016, 1.446803456761533400E+017, 2.893606913523066900E+017, 5.787213827046133800E+017, - 1.157442765409226800E+018, 2.314885530818453500E+018, 4.629771061636907000E+018, 9.259542123273814000E+018, - 1.851908424654762800E+019, 3.703816849309525600E+019, 7.407633698619051200E+019, 1.481526739723810200E+020, - 2.963053479447620500E+020, 5.926106958895241000E+020, 1.185221391779048200E+021, 2.370442783558096400E+021, - 4.740885567116192800E+021, 9.481771134232385600E+021, 
1.896354226846477100E+022, 3.792708453692954200E+022, - 7.585416907385908400E+022, 1.517083381477181700E+023, 3.034166762954363400E+023, 6.068333525908726800E+023, - 1.213666705181745400E+024, 2.427333410363490700E+024, 4.854666820726981400E+024, 9.709333641453962800E+024, - 1.941866728290792600E+025, 3.883733456581585100E+025, 7.767466913163170200E+025, 1.553493382632634000E+026, - 3.106986765265268100E+026, 6.213973530530536200E+026, 1.242794706106107200E+027, 2.485589412212214500E+027, - 4.971178824424429000E+027, 9.942357648848857900E+027, 1.988471529769771600E+028, 3.976943059539543200E+028, - 7.953886119079086300E+028, 1.590777223815817300E+029, 3.181554447631634500E+029, 6.363108895263269100E+029, - 1.272621779052653800E+030, 2.545243558105307600E+030, 5.090487116210615300E+030, 1.018097423242123100E+031, - 2.036194846484246100E+031, 4.072389692968492200E+031, 8.144779385936984400E+031, 1.628955877187396900E+032, - 3.257911754374793800E+032, 6.515823508749587500E+032, 1.303164701749917500E+033, 2.606329403499835000E+033, - 5.212658806999670000E+033, 1.042531761399934000E+034, 2.085063522799868000E+034, 4.170127045599736000E+034, - 8.340254091199472000E+034, 1.668050818239894400E+035, 3.336101636479788800E+035, 6.672203272959577600E+035 + 2.950260331475545100E-039, 5.900520662951090200E-039, 1.180104132590218000E-038, 2.360208265180436100E-038, + 4.720416530360872100E-038, 9.440833060721744200E-038, 1.888166612144348800E-037, 3.776333224288697700E-037, + 7.552666448577395400E-037, 1.510533289715479100E-036, 3.021066579430958200E-036, 6.042133158861916300E-036, + 1.208426631772383300E-035, 2.416853263544766500E-035, 4.833706527089533100E-035, 9.667413054179066100E-035, + 1.933482610835813200E-034, 3.866965221671626400E-034, 7.733930443343252900E-034, 1.546786088668650600E-033, + 3.093572177337301200E-033, 6.187144354674602300E-033, 1.237428870934920500E-032, 2.474857741869840900E-032, + 4.949715483739681800E-032, 9.899430967479363700E-032, 
1.979886193495872700E-031, 3.959772386991745500E-031, + 7.919544773983491000E-031, 1.583908954796698200E-030, 3.167817909593396400E-030, 6.335635819186792800E-030, + 1.267127163837358600E-029, 2.534254327674717100E-029, 5.068508655349434200E-029, 1.013701731069886800E-028, + 2.027403462139773700E-028, 4.054806924279547400E-028, 8.109613848559094700E-028, 1.621922769711818900E-027, + 3.243845539423637900E-027, 6.487691078847275800E-027, 1.297538215769455200E-026, 2.595076431538910300E-026, + 5.190152863077820600E-026, 1.038030572615564100E-025, 2.076061145231128300E-025, 4.152122290462256500E-025, + 8.304244580924513000E-025, 1.660848916184902600E-024, 3.321697832369805200E-024, 6.643395664739610400E-024, + 1.328679132947922100E-023, 2.657358265895844200E-023, 5.314716531791688300E-023, 1.062943306358337700E-022, + 2.125886612716675300E-022, 4.251773225433350700E-022, 8.503546450866701300E-022, 1.700709290173340300E-021, + 3.401418580346680500E-021, 6.802837160693361100E-021, 1.360567432138672200E-020, 2.721134864277344400E-020, + 5.442269728554688800E-020, 1.088453945710937800E-019, 2.176907891421875500E-019, 4.353815782843751100E-019, + 8.707631565687502200E-019, 1.741526313137500400E-018, 3.483052626275000900E-018, 6.966105252550001700E-018, + 1.393221050510000300E-017, 2.786442101020000700E-017, 5.572884202040001400E-017, 1.114576840408000300E-016, + 2.229153680816000600E-016, 4.458307361632001100E-016, 8.916614723264002200E-016, 1.783322944652800400E-015, + 3.566645889305600900E-015, 7.133291778611201800E-015, 1.426658355722240400E-014, 2.853316711444480700E-014, + 5.706633422888961400E-014, 1.141326684577792300E-013, 2.282653369155584600E-013, 4.565306738311169100E-013, + 9.130613476622338300E-013, 1.826122695324467700E-012, 3.652245390648935300E-012, 7.304490781297870600E-012, + 1.460898156259574100E-011, 2.921796312519148200E-011, 5.843592625038296500E-011, 1.168718525007659300E-010, + 2.337437050015318600E-010, 4.674874100030637200E-010, 
9.349748200061274400E-010, 1.869949640012254900E-009, + 3.739899280024509800E-009, 7.479798560049019500E-009, 1.495959712009803900E-008, 2.991919424019607800E-008, + 5.983838848039215600E-008, 1.196767769607843100E-007, 2.393535539215686200E-007, 4.787071078431372500E-007, + 9.574142156862745000E-007, 1.914828431372549000E-006, 3.829656862745098000E-006, 7.659313725490196000E-006, + 1.531862745098039200E-005, 3.063725490196078400E-005, 6.127450980392156800E-005, 1.225490196078431400E-004, + 2.450980392156862700E-004, 4.901960784313725400E-004, 9.803921568627450800E-004, 1.960784313725490200E-003, + 3.921568627450980300E-003, 7.843137254901960700E-003, 1.568627450980392100E-002, 3.137254901960784300E-002, + 6.274509803921568500E-002, 1.254901960784313700E-001, 2.509803921568627400E-001, 5.019607843137254800E-001, + 1.003921568627451000E+000, 2.007843137254901900E+000, 4.015686274509803900E+000, 8.031372549019607700E+000, + 1.606274509803921500E+001, 3.212549019607843100E+001, 6.425098039215686200E+001, 1.285019607843137200E+002, + 2.570039215686274500E+002, 5.140078431372548900E+002, 1.028015686274509800E+003, 2.056031372549019600E+003, + 4.112062745098039200E+003, 8.224125490196078300E+003, 1.644825098039215700E+004, 3.289650196078431300E+004, + 6.579300392156862700E+004, 1.315860078431372500E+005, 2.631720156862745100E+005, 5.263440313725490100E+005, + 1.052688062745098000E+006, 2.105376125490196000E+006, 4.210752250980392100E+006, 8.421504501960784200E+006, + 1.684300900392156800E+007, 3.368601800784313700E+007, 6.737203601568627400E+007, 1.347440720313725500E+008, + 2.694881440627450900E+008, 5.389762881254901900E+008, 1.077952576250980400E+009, 2.155905152501960800E+009, + 4.311810305003921500E+009, 8.623620610007843000E+009, 1.724724122001568600E+010, 3.449448244003137200E+010, + 6.898896488006274400E+010, 1.379779297601254900E+011, 2.759558595202509800E+011, 5.519117190405019500E+011, + 1.103823438081003900E+012, 2.207646876162007800E+012, 
4.415293752324015600E+012, 8.830587504648031200E+012, + 1.766117500929606200E+013, 3.532235001859212500E+013, 7.064470003718425000E+013, 1.412894000743685000E+014, + 2.825788001487370000E+014, 5.651576002974740000E+014, 1.130315200594948000E+015, 2.260630401189896000E+015, + 4.521260802379792000E+015, 9.042521604759584000E+015, 1.808504320951916800E+016, 3.617008641903833600E+016, + 7.234017283807667200E+016, 1.446803456761533400E+017, 2.893606913523066900E+017, 5.787213827046133800E+017, + 1.157442765409226800E+018, 2.314885530818453500E+018, 4.629771061636907000E+018, 9.259542123273814000E+018, + 1.851908424654762800E+019, 3.703816849309525600E+019, 7.407633698619051200E+019, 1.481526739723810200E+020, + 2.963053479447620500E+020, 5.926106958895241000E+020, 1.185221391779048200E+021, 2.370442783558096400E+021, + 4.740885567116192800E+021, 9.481771134232385600E+021, 1.896354226846477100E+022, 3.792708453692954200E+022, + 7.585416907385908400E+022, 1.517083381477181700E+023, 3.034166762954363400E+023, 6.068333525908726800E+023, + 1.213666705181745400E+024, 2.427333410363490700E+024, 4.854666820726981400E+024, 9.709333641453962800E+024, + 1.941866728290792600E+025, 3.883733456581585100E+025, 7.767466913163170200E+025, 1.553493382632634000E+026, + 3.106986765265268100E+026, 6.213973530530536200E+026, 1.242794706106107200E+027, 2.485589412212214500E+027, + 4.971178824424429000E+027, 9.942357648848857900E+027, 1.988471529769771600E+028, 3.976943059539543200E+028, + 7.953886119079086300E+028, 1.590777223815817300E+029, 3.181554447631634500E+029, 6.363108895263269100E+029, + 1.272621779052653800E+030, 2.545243558105307600E+030, 5.090487116210615300E+030, 1.018097423242123100E+031, + 2.036194846484246100E+031, 4.072389692968492200E+031, 8.144779385936984400E+031, 1.628955877187396900E+032, + 3.257911754374793800E+032, 6.515823508749587500E+032, 1.303164701749917500E+033, 2.606329403499835000E+033, + 5.212658806999670000E+033, 1.042531761399934000E+034, 
2.085063522799868000E+034, 4.170127045599736000E+034, + 8.340254091199472000E+034, 1.668050818239894400E+035, 3.336101636479788800E+035, 6.672203272959577600E+035 }; // You can use this to double check the exponent table and assert that @@ -108,20 +108,20 @@ ALIGN128 float power2_n[256] = // 2**(index - 128) / 255 #pragma warning( disable : 4189 ) // disable unused local variable warning static void CheckExponentTable() { - for( int i = 0; i < 256; i++ ) + for (int i = 0; i < 256; i++) { - float testAgainst = pow( 2.0f, i - 128 ) / 255.0f; - float diff = testAgainst - power2_n[i] ; + float testAgainst = pow(2.0f, i - 128) / 255.0f; + float diff = testAgainst - power2_n[i]; float relativeDiff = diff / testAgainst; - Assert( testAgainst == 0 ? - power2_n[i] < 1.16E-041 : - power2_n[i] == testAgainst ); + Assert(testAgainst == 0 ? + power2_n[i] < 1.16E-041 : + power2_n[i] == testAgainst); } } #pragma warning(pop) #endif -void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright ) +void BuildGammaTable(float gamma, float texGamma, float brightness, int overbright) { int i, inf; float g1, g3; @@ -129,30 +129,30 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // Con_Printf("BuildGammaTable %.1f %.1f %.1f\n", g, v_lightgamma.GetFloat(), v_texgamma.GetFloat() ); float g = gamma; - if (g > 3.0) + if (g > 3.0) { g = 3.0; } g = 1.0 / g; - g1 = texGamma * g; + g1 = texGamma * g; - if (brightness <= 0.0) + if (brightness <= 0.0) { g3 = 0.125; } - else if (brightness > 1.0) + else if (brightness > 1.0) { g3 = 0.05; } - else + else { g3 = 0.125 - (brightness * brightness) * 0.075; } - for (i=0 ; i<256 ; i++) + for (i = 0; i < 256; i++) { - inf = 255 * pow ( i/255.f, g1 ); + inf = (int)(255 * pow(i / 255.f, g1)); if (inf < 0) inf = 0; if (inf > 255) @@ -160,7 +160,7 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri texgammatable[i] = inf; } - for (i=0 ; i<1024 ; i++) + for (i = 0; i < 
1024; i++) { float f; @@ -173,11 +173,11 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // shift up if (f <= g3) f = (f / g3) * 0.125; - else + else f = 0.125 + ((f - g3) / (1.0 - g3)) * 0.875; // convert linear space to desired gamma space - inf = 255 * pow ( f, g ); + inf = (int)(255 * pow(f, g)); if (inf < 0) inf = 0; @@ -196,32 +196,32 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri } */ - for (i=0 ; i<256 ; i++) + for (i = 0; i < 256; i++) { // convert from nonlinear texture space (0..255) to linear space (0..1) - texturetolinear[i] = pow( i / 255.f, texGamma ); + texturetolinear[i] = pow(i / 255.f, texGamma); // convert from linear space (0..1) to nonlinear (sRGB) space (0..1) - g_Mathlib_LinearToGamma[i] = LinearToGammaFullRange( i / 255.f ); + g_Mathlib_LinearToGamma[i] = LinearToGammaFullRange(i / 255.f); // convert from sRGB gamma space (0..1) to linear space (0..1) - g_Mathlib_GammaToLinear[i] = GammaToLinearFullRange( i / 255.f ); + g_Mathlib_GammaToLinear[i] = GammaToLinearFullRange(i / 255.f); } - for (i=0 ; i<1024 ; i++) + for (i = 0; i < 1024; i++) { // convert from linear space (0..1) to nonlinear texture space (0..255) - lineartotexture[i] = pow( i / 1023.0, 1.0 / texGamma ) * 255; + lineartotexture[i] = (int)(pow(i / 1023.0, 1.0 / texGamma) * 255); } #if 0 - for (i=0 ; i<256 ; i++) + for (i = 0; i < 256; i++) { float f; // convert from nonlinear lightmap space (0..255) to linear space (0..4) // f = (i / 255.0) * sqrt( 4 ); - f = i * (2.0 / 255.0); + f = i * (2.0 / 255.0); f = f * f; texlighttolinear[i] = f; @@ -234,50 +234,50 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // Can't do overbright without texcombine // UNDONE: Add GAMMA ramp to rectify this - if ( overbright == 2 ) + if (overbright == 2) { overbrightFactor = 0.5; } - else if ( overbright == 4 ) + else if (overbright == 4) { overbrightFactor = 0.25; } - for (i=0 ; i<4096 ; i++) 
+ for (i = 0; i < 4096; i++) { // convert from linear 0..4 (x1024) to screen corrected vertex space (0..1?) - f = pow ( i/1024.0, 1.0 / gamma ); + f = pow(i / 1024.0, 1.0 / gamma); lineartovertex[i] = f * overbrightFactor; if (lineartovertex[i] > 1) lineartovertex[i] = 1; - int nLightmap = RoundFloatToInt( f * 255 * overbrightFactor ); - nLightmap = clamp( nLightmap, 0, 255 ); + int nLightmap = RoundFloatToInt(f * 255 * overbrightFactor); + nLightmap = clamp(nLightmap, 0, 255); lineartolightmap[i] = (unsigned char)nLightmap; } } } -float GammaToLinearFullRange( float gamma ) +float GammaToLinearFullRange(float gamma) { - return pow( gamma, 2.2f ); + return pow(gamma, 2.2f); } -float LinearToGammaFullRange( float linear ) +float LinearToGammaFullRange(float linear) { - return pow( linear, 1.0f / 2.2f ); + return pow(linear, 1.0f / 2.2f); } -float GammaToLinear( float gamma ) +float GammaToLinear(float gamma) { - Assert( s_bMathlibInitialized ); - if ( gamma < 0.0f ) + Assert(s_bMathlibInitialized); + if (gamma < 0.0f) { return 0.0f; } - if ( gamma >= 0.95f ) + if (gamma >= 0.95f) { // Use GammaToLinearFullRange maybe if you trip this. // X360TEMP @@ -285,129 +285,129 @@ float GammaToLinear( float gamma ) return 1.0f; } - int index = RoundFloatToInt( gamma * 255.0f ); - Assert( index >= 0 && index < 256 ); + int index = RoundFloatToInt(gamma * 255.0f); + Assert(index >= 0 && index < 256); return g_Mathlib_GammaToLinear[index]; } -float LinearToGamma( float linear ) +float LinearToGamma(float linear) { - Assert( s_bMathlibInitialized ); - if ( linear < 0.0f ) + Assert(s_bMathlibInitialized); + if (linear < 0.0f) { return 0.0f; } - if ( linear > 1.0f ) + if (linear > 1.0f) { // Use LinearToGammaFullRange maybe if you trip this. 
- Assert( 0 ); + Assert(0); return 1.0f; } - int index = RoundFloatToInt( linear * 255.0f ); - Assert( index >= 0 && index < 256 ); + int index = RoundFloatToInt(linear * 255.0f); + Assert(index >= 0 && index < 256); return g_Mathlib_LinearToGamma[index]; } //----------------------------------------------------------------------------- // Helper functions to convert between sRGB and 360 gamma space //----------------------------------------------------------------------------- -float SrgbGammaToLinear( float flSrgbGammaValue ) +float SrgbGammaToLinear(float flSrgbGammaValue) { - float x = clamp( flSrgbGammaValue, 0.0f, 1.0f ); - return ( x <= 0.04045f ) ? ( x / 12.92f ) : ( pow( ( x + 0.055f ) / 1.055f, 2.4f ) ); + float x = clamp(flSrgbGammaValue, 0.0f, 1.0f); + return (x <= 0.04045f) ? (x / 12.92f) : (pow((x + 0.055f) / 1.055f, 2.4f)); } -float SrgbLinearToGamma( float flLinearValue ) +float SrgbLinearToGamma(float flLinearValue) { - float x = clamp( flLinearValue, 0.0f, 1.0f ); - return ( x <= 0.0031308f ) ? ( x * 12.92f ) : ( 1.055f * pow( x, ( 1.0f / 2.4f ) ) ) - 0.055f; + float x = clamp(flLinearValue, 0.0f, 1.0f); + return (x <= 0.0031308f) ? 
(x * 12.92f) : (1.055f * pow(x, (1.0f / 2.4f))) - 0.055f; } -float X360GammaToLinear( float fl360GammaValue ) +float X360GammaToLinear(float fl360GammaValue) { float flLinearValue; - fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f ); - if ( fl360GammaValue < ( 96.0f / 255.0f ) ) + fl360GammaValue = clamp(fl360GammaValue, 0.0f, 1.0f); + if (fl360GammaValue < (96.0f / 255.0f)) { - if ( fl360GammaValue < ( 64.0f / 255.0f ) ) + if (fl360GammaValue < (64.0f / 255.0f)) { flLinearValue = fl360GammaValue * 255.0f; } else { - flLinearValue = fl360GammaValue * ( 255.0f * 2.0f ) - 64.0f; - flLinearValue += floor( flLinearValue * ( 1.0f / 512.0f ) ); + flLinearValue = fl360GammaValue * (255.0f * 2.0f) - 64.0f; + flLinearValue += floor(flLinearValue * (1.0f / 512.0f)); } } else { - if( fl360GammaValue < ( 192.0f / 255.0f ) ) + if (fl360GammaValue < (192.0f / 255.0f)) { - flLinearValue = fl360GammaValue * ( 255.0f * 4.0f ) - 256.0f; - flLinearValue += floor( flLinearValue * ( 1.0f / 256.0f ) ); + flLinearValue = fl360GammaValue * (255.0f * 4.0f) - 256.0f; + flLinearValue += floor(flLinearValue * (1.0f / 256.0f)); } else { - flLinearValue = fl360GammaValue * ( 255.0f * 8.0f ) - 1024.0f; - flLinearValue += floor( flLinearValue * ( 1.0f / 128.0f ) ); + flLinearValue = fl360GammaValue * (255.0f * 8.0f) - 1024.0f; + flLinearValue += floor(flLinearValue * (1.0f / 128.0f)); } } flLinearValue *= 1.0f / 1023.0f; - flLinearValue = clamp( flLinearValue, 0.0f, 1.0f ); + flLinearValue = clamp(flLinearValue, 0.0f, 1.0f); return flLinearValue; } -float X360LinearToGamma( float flLinearValue ) +float X360LinearToGamma(float flLinearValue) { float fl360GammaValue; - flLinearValue = clamp( flLinearValue, 0.0f, 1.0f ); - if ( flLinearValue < ( 128.0f / 1023.0f ) ) + flLinearValue = clamp(flLinearValue, 0.0f, 1.0f); + if (flLinearValue < (128.0f / 1023.0f)) { - if ( flLinearValue < ( 64.0f / 1023.0f ) ) + if (flLinearValue < (64.0f / 1023.0f)) { - fl360GammaValue = flLinearValue * ( 1023.0f * 
( 1.0f / 255.0f ) ); + fl360GammaValue = flLinearValue * (1023.0f * (1.0f / 255.0f)); } else { - fl360GammaValue = flLinearValue * ( ( 1023.0f / 2.0f ) * ( 1.0f / 255.0f ) ) + ( 32.0f / 255.0f ); + fl360GammaValue = flLinearValue * ((1023.0f / 2.0f) * (1.0f / 255.0f)) + (32.0f / 255.0f); } } else { - if ( flLinearValue < ( 512.0f / 1023.0f ) ) + if (flLinearValue < (512.0f / 1023.0f)) { - fl360GammaValue = flLinearValue * ( ( 1023.0f / 4.0f ) * ( 1.0f / 255.0f ) ) + ( 64.0f / 255.0f ); + fl360GammaValue = flLinearValue * ((1023.0f / 4.0f) * (1.0f / 255.0f)) + (64.0f / 255.0f); } else { - fl360GammaValue = flLinearValue * ( ( 1023.0f /8.0f ) * ( 1.0f / 255.0f ) ) + ( 128.0f /255.0f ); // 1.0 -> 1.0034313725490196078431372549016 - if ( fl360GammaValue > 1.0f ) + fl360GammaValue = flLinearValue * ((1023.0f / 8.0f) * (1.0f / 255.0f)) + (128.0f / 255.0f); // 1.0 -> 1.0034313725490196078431372549016 + if (fl360GammaValue > 1.0f) { fl360GammaValue = 1.0f; } } } - fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f ); + fl360GammaValue = clamp(fl360GammaValue, 0.0f, 1.0f); return fl360GammaValue; } -float SrgbGammaTo360Gamma( float flSrgbGammaValue ) +float SrgbGammaTo360Gamma(float flSrgbGammaValue) { - float flLinearValue = SrgbGammaToLinear( flSrgbGammaValue ); - float fl360GammaValue = X360LinearToGamma( flLinearValue ); + float flLinearValue = SrgbGammaToLinear(flSrgbGammaValue); + float fl360GammaValue = X360LinearToGamma(flLinearValue); return fl360GammaValue; } // convert texture to linear 0..1 value -float TextureToLinear( int c ) +float TextureToLinear(int c) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); if (c < 0) return 0; if (c > 255) @@ -417,11 +417,11 @@ float TextureToLinear( int c ) } // convert texture to linear 0..1 value -int LinearToTexture( float f ) +int LinearToTexture(float f) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); int i; - i = f * 1023; // assume 0..1 range + i = (int)(f * 1023); // assume 
0..1 range if (i < 0) i = 0; if (i > 1023) @@ -432,11 +432,11 @@ int LinearToTexture( float f ) // converts 0..1 linear value to screen gamma (0..255) -int LinearToScreenGamma( float f ) +int LinearToScreenGamma(float f) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); int i; - i = f * 1023; // assume 0..1 range + i = (int)(f * 1023); // assume 0..1 range if (i < 0) i = 0; if (i > 1023) @@ -445,30 +445,30 @@ int LinearToScreenGamma( float f ) return lineartoscreen[i]; } -void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector3D& out ) +void ColorRGBExp32ToVector(const ColorRGBExp32& in, Vector3D& out) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); // FIXME: Why is there a factor of 255 built into this? - out.x = 255.0f * TexLightToLinear( in.r, in.exponent ); - out.y = 255.0f * TexLightToLinear( in.g, in.exponent ); - out.z = 255.0f * TexLightToLinear( in.b, in.exponent ); + out.x = 255.0f * TexLightToLinear(in.r, in.exponent); + out.y = 255.0f * TexLightToLinear(in.g, in.exponent); + out.z = 255.0f * TexLightToLinear(in.b, in.exponent); } #if 0 // assumes that the desired mantissa range is 128..255 -static int VectorToColorRGBExp32_CalcExponent( float in ) +static int VectorToColorRGBExp32_CalcExponent(float in) { int power = 0; - - if( in != 0.0f ) + + if (in != 0.0f) { - while( in > 255.0f ) + while (in > 255.0f) { power += 1; in *= 0.5f; } - - while( in < 128.0f ) + + while (in < 128.0f) { power -= 1; in *= 2.0f; @@ -478,51 +478,51 @@ static int VectorToColorRGBExp32_CalcExponent( float in ) return power; } -void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c ) +void VectorToColorRGBExp32(const Vector3D& vin, ColorRGBExp32& c) { - Vector v = vin; - Assert( s_bMathlibInitialized ); - Assert( v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f ); - int i; - float max = v[0]; - for( i = 1; i < 3; i++ ) + Vector3D v = vin; + Assert(s_bMathlibInitialized); + Assert(v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f); + int i; 
+ float max = v[0]; + for (i = 1; i < 3; i++) { // Get the maximum value. - if( v[i] > max ) + if (v[i] > max) { max = v[i]; } } - + // figure out the exponent for this luxel. - int exponent = VectorToColorRGBExp32_CalcExponent( max ); - + int exponent = VectorToColorRGBExp32_CalcExponent(max); + // make the exponent fits into a signed byte. - if( exponent < -128 ) + if (exponent < -128) { exponent = -128; } - else if( exponent > 127 ) + else if (exponent > 127) { exponent = 127; } - + // undone: optimize with a table - float scalar = pow( 2.0f, -exponent ); + float scalar = pow(2.0f, -exponent); // convert to mantissa x 2^exponent format - for( i = 0; i < 3; i++ ) + for (i = 0; i < 3; i++) { v[i] *= scalar; // clamp - if( v[i] > 255.0f ) + if (v[i] > 255.0f) { v[i] = 255.0f; } } - c.r = ( unsigned char )v[0]; - c.g = ( unsigned char )v[1]; - c.b = ( unsigned char )v[2]; - c.exponent = ( signed char )exponent; + c.r = (unsigned char)v[0]; + c.g = (unsigned char)v[1]; + c.b = (unsigned char)v[2]; + c.exponent = (signed char)exponent; } #else @@ -531,7 +531,7 @@ void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c ) // for f' = f * 2^e, f is on [128..255]. // Uses IEEE 754 representation to directly extract this information // from the float. 
-inline static int VectorToColorRGBExp32_CalcExponent( const float *pin ) +inline static int VectorToColorRGBExp32_CalcExponent(const float* pin) { // The thing we will take advantage of here is that the exponent component // is stored in the float itself, and because we want to map to 128..255, we @@ -542,12 +542,12 @@ inline static int VectorToColorRGBExp32_CalcExponent( const float *pin ) if (*pin == 0.0f) return 0; - unsigned int fbits = *reinterpret_cast(pin); - + unsigned int fbits = *reinterpret_cast(pin); + // the exponent component is bits 23..30, and biased by +127 const unsigned int biasedSeven = 7 + 127; - signed int expComponent = ( fbits & 0x7F800000 ) >> 23; + signed int expComponent = (fbits & 0x7F800000) >> 23; expComponent -= biasedSeven; // now the difference from seven (positive if was less than, etc) return expComponent; } @@ -561,15 +561,15 @@ inline static int VectorToColorRGBExp32_CalcExponent( const float *pin ) /// moving it onto the cell. /// \warning: Assumes an IEEE 754 single-precision float representation! Those of you /// porting to an 8080 are out of luck. -void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) +void VectorToColorRGBExp32(const Vector3D& vin, ColorRGBExp32& c) { - Assert( s_bMathlibInitialized ); - Assert( vin.x >= 0.0f && vin.y >= 0.0f && vin.z >= 0.0f ); + Assert(s_bMathlibInitialized); + Assert(vin.x >= 0.0f && vin.y >= 0.0f && vin.z >= 0.0f); // work out which of the channels is the largest ( we will use that to map the exponent ) // this is a sluggish branch-based decision tree -- most architectures will offer a [max] // assembly opcode to do this faster. - const float *pMax; + const float* pMax; if (vin.x > vin.y) { if (vin.x > vin.z) @@ -594,7 +594,7 @@ void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) } // now work out the exponent for this luxel. 
- signed int exponent = VectorToColorRGBExp32_CalcExponent( pMax ); + signed int exponent = VectorToColorRGBExp32_CalcExponent(pMax); // make sure the exponent fits into a signed byte. // (in single precision format this is assured because it was a signed byte to begin with) @@ -604,20 +604,20 @@ void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) float scalar; { unsigned int fbits = (127 - exponent) << 23; - scalar = *reinterpret_cast(&fbits); + scalar = *reinterpret_cast(&fbits); } - // We can totally wind up above 255 and that's okay--but above 256 would be right out. - Assert(vin.x * scalar < 256.0f && - vin.y * scalar < 256.0f && - vin.z * scalar < 256.0f); + // we should never need to clamp: + Assert(vin.x * scalar <= 255.0f && + vin.y * scalar <= 255.0f && + vin.z * scalar <= 255.0f); // This awful construction is necessary to prevent VC2005 from using the // fldcw/fnstcw control words around every float-to-unsigned-char operation. { - int red = (vin.x * scalar); - int green = (vin.y * scalar); - int blue = (vin.z * scalar); + int red = (int)(vin.x * scalar); + int green = (int)(vin.y * scalar); + int blue = (int)(vin.z * scalar); c.r = red; c.g = green; @@ -629,7 +629,7 @@ void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) c.b = ( unsigned char )(vin.z * scalar); */ - c.exponent = ( signed char )exponent; + c.exponent = (signed char)exponent; } -#endif \ No newline at end of file +#endif diff --git a/r5dev/mathlib/fltx4.h b/r5dev/mathlib/fltx4.h new file mode 100644 index 00000000..b091ac56 --- /dev/null +++ b/r5dev/mathlib/fltx4.h @@ -0,0 +1,107 @@ +//===== Copyright 1996-2010, Valve Corporation, All rights reserved. ======// +// +// Purpose: - defines the type fltx4 - Avoid cyclic includion. 
+// +//===========================================================================// + +#ifndef FLTX4_H +#define FLTX4_H + +#if defined(GNUC) +#define USE_STDC_FOR_SIMD 0 +#else +#define USE_STDC_FOR_SIMD 0 +#endif + +#if (!defined(PLATFORM_PPC) && (USE_STDC_FOR_SIMD == 0)) +#define _SSE1 1 +#endif + +// I thought about defining a class/union for the SIMD packed floats instead of using fltx4, +// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur +// the relationship between packed floats and packed integer types and (b) not sure that the +// compiler would handle generating good code for the intrinsics. + +#if USE_STDC_FOR_SIMD +#error "hello" +typedef union +{ + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4; + +typedef fltx4 i32x4; +typedef fltx4 u32x4; + +#ifdef _PS3 +typedef fltx4 u32x4; +typedef fltx4 i32x4; +#endif +typedef fltx4 bi32x4; + +#elif ( defined( _PS3 ) ) + +typedef union +{ + // This union allows float/int access (which generally shouldn't be done in inner loops) + + vec_float4 vmxf; + vec_int4 vmxi; + vec_uint4 vmxui; +#if defined(__SPU__) + vec_uint4 vmxbi; +#else + __vector bool vmxbi; +#endif + + struct + { + float x; + float y; + float z; + float w; + }; + + float m128_f32[4]; + uint32 m128_u32[4]; + int32 m128_i32[4]; + +} fltx4_union; + +typedef vec_float4 fltx4; +typedef vec_uint4 u32x4; +typedef vec_int4 i32x4; + +#if defined(__SPU__) +typedef vec_uint4 bi32x4; +#else +typedef __vector bool bi32x4; +#endif + +#define DIFFERENT_NATIVE_VECTOR_TYPES // true if the compiler has different types for float4, uint4, int4, etc + +#elif ( defined( _X360 ) ) + +typedef union +{ + // This union allows float/int access (which generally shouldn't be done in inner loops) + __vector4 vmx; + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4_union; + +typedef __vector4 fltx4; +typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops. 
+typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops. +typedef fltx4 bi32x4; +#else + +typedef __m128 fltx4; +typedef __m128 i32x4; +typedef __m128 u32x4; +typedef __m128i shortx8; +typedef fltx4 bi32x4; + +#endif + +#endif diff --git a/r5dev/mathlib/math_pfns.h b/r5dev/mathlib/math_pfns.h index cdf7ccd2..268fe37c 100644 --- a/r5dev/mathlib/math_pfns.h +++ b/r5dev/mathlib/math_pfns.h @@ -9,11 +9,36 @@ #include +// YUP_ACTIVE is from Source2. It's (obviously) not supported on this branch, just including it here to help merge camera.cpp/.h and the CSM shadow code. +//#define YUP_ACTIVE 1 + +enum MatrixAxisType_t +{ +#ifdef YUP_ACTIVE + FORWARD_AXIS = 2, + LEFT_AXIS = 0, + UP_AXIS = 1, +#else + FORWARD_AXIS = 0, + LEFT_AXIS = 1, + UP_AXIS = 2, +#endif + + X_AXIS = 0, + Y_AXIS = 1, + Z_AXIS = 2, + ORIGIN = 3, + PROJECTIVE = 3, +}; + #if defined( _X360 ) #include #elif defined(_PS3) -#ifndef SPU +#ifdef SPU +#include +#include +#else #include #endif @@ -53,17 +78,19 @@ #include + + // These globals are initialized by mathlib and redirected based on available fpu features // The following are not declared as macros because they are often used in limiting situations, // and sometimes the compiler simply refuses to inline them for some reason -FORCEINLINE float FastSqrt(float x) +FORCEINLINE float VECTORCALL FastSqrt(float x) { __m128 root = _mm_sqrt_ss(_mm_load_ss(&x)); return *(reinterpret_cast(&root)); } -FORCEINLINE float FastRSqrtFast(float x) +FORCEINLINE float VECTORCALL FastRSqrtFast(float x) { // use intrinsics __m128 rroot = _mm_rsqrt_ss(_mm_load_ss(&x)); @@ -72,7 +99,7 @@ FORCEINLINE float FastRSqrtFast(float x) // Single iteration NewtonRaphson reciprocal square root: // 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) // Very low error, and fine to use in place of 1.f / sqrtf(x). 
-FORCEINLINE float FastRSqrt(float x) +FORCEINLINE float VECTORCALL FastRSqrt(float x) { float rroot = FastRSqrtFast(x); return (0.5f * rroot) * (3.f - (x * rroot) * rroot); @@ -136,6 +163,7 @@ inline double FastSqrtEst(double x) { return __frsqrte(x) * x; } #endif // !defined( PLATFORM_PPC ) && !defined(_SPU) + // if x is infinite, return FLT_MAX inline float FastClampInfinity(float x) { @@ -146,7 +174,19 @@ inline float FastClampInfinity(float x) #endif } -#if defined (_PS3) && !defined(SPU) +#if defined (_PS3) + +#if defined(__SPU__) + +inline int _rotl(int a, int count) +{ + vector signed int vi; + vi = spu_promote(a, 0); + vi = spu_rl(vi, count); + return spu_extract(vi, 0); +} + +#else // extern float cosvf(float); /* single precision cosine */ // extern float sinvf(float); /* single precision sine */ @@ -164,63 +204,6 @@ inline int64 _rotl64(int64 x, int c) return __rldicl(x, c, 0); } -//----------------------------------------------------------------- -// Vector Unions -//----------------------------------------------------------------- - -//----------------------------------------------------------------- -// Floats -//----------------------------------------------------------------- -typedef union -{ - vector float vf; - float f[4]; -} vector_float_union; - -//----------------------------------------------------------------- -// Ints -//----------------------------------------------------------------- -typedef union -{ - vector int vi; - int i[4]; -} vector_int4_union; - -typedef union -{ - vector unsigned int vui; - unsigned int ui[4]; -} vector_uint4_union; - -//----------------------------------------------------------------- -// Shorts -//----------------------------------------------------------------- -typedef union -{ - vector signed short vs; - signed short s[8]; -} vector_short8_union; - -typedef union -{ - vector unsigned short vus; - unsigned short us[8]; -} vector_ushort8_union; - 
-//----------------------------------------------------------------- -// Chars -//----------------------------------------------------------------- -typedef union -{ - vector signed char vc; - signed char c[16]; -} vector_char16_union; - -typedef union -{ - vector unsigned char vuc; - unsigned char uc[16]; -} vector_uchar16_union; /* FORCEINLINE float _VMX_Sqrt( float x ) @@ -277,6 +260,95 @@ FORCEINLINE float _VMX_Cos(float a) #define FastSinCos(x,s,c) _VMX_SinCos(x,s,c) #define FastCos(x) _VMX_Cos(x) */ + +#endif + + +#if defined(__SPU__) + +// do we need these optimized yet? + +FORCEINLINE float FastSqrt(float x) +{ + return sqrtf(x); +} + +FORCEINLINE float FastRSqrt(float x) +{ + float rroot = 1.f / (sqrtf(x) + FLT_EPSILON); + return rroot; +} + + +#define FastRSqrtFast(x) FastRSqrt(x) + + +#endif + + + +//----------------------------------------------------------------- +// Vector Unions +//----------------------------------------------------------------- + +//----------------------------------------------------------------- +// Floats +//----------------------------------------------------------------- +typedef union +{ + vector float vf; + float f[4]; +} vector_float_union; + +#if !defined(__SPU__) +//----------------------------------------------------------------- +// Ints +//----------------------------------------------------------------- +typedef union +{ + vector int vi; + int i[4]; +} vector_int4_union; + +typedef union +{ + vector unsigned int vui; + unsigned int ui[4]; +} vector_uint4_union; + +//----------------------------------------------------------------- +// Shorts +//----------------------------------------------------------------- +typedef union +{ + vector signed short vs; + signed short s[8]; +} vector_short8_union; + +typedef union +{ + vector unsigned short vus; + unsigned short us[8]; +} vector_ushort8_union; + +//----------------------------------------------------------------- +// Chars 
+//----------------------------------------------------------------- +typedef union +{ + vector signed char vc; + signed char c[16]; +} vector_char16_union; + +typedef union +{ + vector unsigned char vuc; + unsigned char uc[16]; +} vector_uchar16_union; +#endif + + + #endif // _PS3 #endif // #ifndef SPU diff --git a/r5dev/mathlib/mathlib.h b/r5dev/mathlib/mathlib.h index 0a9ff11f..ad11b2f6 100644 --- a/r5dev/mathlib/mathlib.h +++ b/r5dev/mathlib/mathlib.h @@ -11,9 +11,8 @@ #include "mathlib/vector.h" #include "mathlib/vector2d.h" #include "tier0/dbg.h" - #include "mathlib/math_pfns.h" -#include "mathlib/bits.h" +#include "mathlib/fltx4.h" #ifndef ALIGN8_POST #define ALIGN8_POST @@ -21,68 +20,19 @@ #if defined(_PS3) +#if defined(__SPU__) +#include +#include +#include +#else #include #include -#include +#include +#endif +#include #endif -// -// Returns a clamped value in the range [min, max]. -// -template< class T > -inline T clamp(T const& val, T const& minVal, T const& maxVal) -{ - if (maxVal < minVal) - return maxVal; - else if (val < minVal) - return minVal; - else if (val > maxVal) - return maxVal; - else - return val; -} -#define fsel(c,x,y) ( (c) >= 0 ? (x) : (y) ) - -// integer conditional move -// if a >= 0, return x, else y -#define isel(a,x,y) ( ((a) >= 0) ? (x) : (y) ) - -// if x = y, return a, else b -#define ieqsel(x,y,a,b) (( (x) == (y) ) ? (a) : (b)) - -// if the nth bit of a is set (counting with 0 = LSB), -// return x, else y -// this is fast if nbit is a compile-time immediate -#define ibitsel(a, nbit, x, y) ( ( ((a) & (1 << (nbit))) != 0 ) ? (x) : (y) ) - - -FORCEINLINE double fpmin(double a, double b) -{ - return a > b ? b : a; -} - -FORCEINLINE double fpmax(double a, double b) -{ - return a >= b ? a : b; -} - -// clamp x to lie inside [a,b]. Assumes b>a -FORCEINLINE float fclamp(float x, float a, float b) -{ - return fpmin(fpmax(x, a), b); -} -// clamp x to lie inside [a,b]. 
Assumes b>a -FORCEINLINE double fclamp(double x, double a, double b) -{ - return fpmin(fpmax(x, a), b); -} - -// At some point, we will need a unified API. -#define imin( x, y ) ( (x) < (y) ? (x) : (y) ) -#define imax( x, y ) ( (x) > (y) ? (x) : (y) ) -#define iclamp clamp - // plane_t structure // !!! if this is changed, it must be changed in asm code too !!! // FIXME: does the asm code even exist anymore? @@ -95,7 +45,7 @@ struct cplane_t byte signbits; // signx + (signy<<1) + (signz<<1) byte pad[2]; -#ifdef Vector_NO_SLOW_OPERATIONS +#ifdef VECTOR_NO_SLOW_OPERATIONS cplane_t() {} private: @@ -142,26 +92,7 @@ enum }; extern int SignbitsForPlane(cplane_t* out); - -class Frustum_t -{ -public: - void SetPlane(int i, int nType, const Vector3D& vecNormal, float dist) - { - m_Plane[i].normal = vecNormal; - m_Plane[i].dist = dist; - m_Plane[i].type = nType; - m_Plane[i].signbits = SignbitsForPlane(&m_Plane[i]); - m_AbsNormal[i].Init(fabs(vecNormal.x), fabs(vecNormal.y), fabs(vecNormal.z)); - } - - inline const cplane_t* GetPlane(int i) const { return &m_Plane[i]; } - inline const Vector3D& GetAbsNormal(int i) const { return m_AbsNormal[i]; } - -private: - cplane_t m_Plane[FRUSTUM_NUMPLANES]; - Vector3D m_AbsNormal[FRUSTUM_NUMPLANES]; -}; +class Frustum_t; // Computes Y fov from an X fov and a screen aspect ratio + X from Y float CalcFovY(float flFovX, float flScreenAspect); @@ -171,12 +102,13 @@ float CalcFovX(float flFovY, float flScreenAspect); // NOTE: FOV is specified in degrees, as the *full* view angle (not half-angle) class VPlane; void GeneratePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t& frustum); -void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, float flFovX, float flFovY, Frustum_t& frustum); +void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& 
forward, const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, float flFovX, float flFovY, VPlane* pPlanesOut); // Cull the world-space bounding box to the specified frustum. -// bool R_CullBox( const Vector3D& mins, const Vector3D& maxs, const Frustum_t &frustum ); -// bool R_CullBoxSkipNear( const Vector3D& mins, const Vector3D& maxs, const Frustum_t &frustum ); +// bool R_CullBox( const Vector& mins, const Vector& maxs, const Frustum_t &frustum ); +// bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t &frustum ); void GenerateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar, VPlane* pPlanesOut); +class CTransform; class matrix3x4a_t; struct matrix3x4_t @@ -192,6 +124,14 @@ struct matrix3x4_t m_flMatVal[2][0] = m20; m_flMatVal[2][1] = m21; m_flMatVal[2][2] = m22; m_flMatVal[2][3] = m23; } + /// Creates a matrix where the X axis = forward the Y axis = left, and the Z axis = up + void InitXYZ(const Vector3D& xAxis, const Vector3D& yAxis, const Vector3D& zAxis, const Vector3D& vecOrigin) + { + m_flMatVal[0][0] = xAxis.x; m_flMatVal[0][1] = yAxis.x; m_flMatVal[0][2] = zAxis.x; m_flMatVal[0][3] = vecOrigin.x; + m_flMatVal[1][0] = xAxis.y; m_flMatVal[1][1] = yAxis.y; m_flMatVal[1][2] = zAxis.y; m_flMatVal[1][3] = vecOrigin.y; + m_flMatVal[2][0] = xAxis.z; m_flMatVal[2][1] = yAxis.z; m_flMatVal[2][2] = zAxis.z; m_flMatVal[2][3] = vecOrigin.z; + } + //----------------------------------------------------------------------------- // Creates a matrix where the X axis = forward // the Y axis = left, and the Z axis = up @@ -212,6 +152,27 @@ struct matrix3x4_t Init(xAxis, yAxis, zAxis, vecOrigin); } + inline void InitFromQAngles(const QAngle& angles, const Vector3D& vPosition); + inline void InitFromQAngles(const QAngle& angles); + inline void InitFromRadianEuler(const RadianEuler& angles, 
const Vector3D& vPosition); + inline void InitFromRadianEuler(const RadianEuler& angles); + inline void InitFromCTransform(const CTransform& transform); + inline void InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition); + inline void InitFromQuaternion(const Quaternion& orientation); + inline void InitFromDiagonal(const Vector3D& vDiagonal); + + inline Quaternion ToQuaternion() const; + inline QAngle ToQAngle() const; + inline CTransform ToCTransform() const; + + inline void SetToIdentity(); + + /// multiply the scale/rot part of the matrix by a constant. This doesn't init the matrix , + /// just scale in place. So if you want to construct a scaling matrix, init to identity and + /// then call this. + FORCEINLINE void ScaleUpper3x3Matrix(float flScale); + + /// modify the origin inline void SetOrigin(Vector3D const& p) { m_flMatVal[0][3] = p.x; @@ -219,6 +180,13 @@ struct matrix3x4_t m_flMatVal[2][3] = p.z; } + /// return the origin + inline Vector3D GetOrigin(void) const + { + Vector3D vecRet(m_flMatVal[0][3], m_flMatVal[1][3], m_flMatVal[2][3]); + return vecRet; + } + inline void Invalidate(void) { for (int i = 0; i < 3; i++) @@ -230,6 +198,60 @@ struct matrix3x4_t } } + /// check all components for invalid floating point values + inline bool IsValid(void) const + { + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 4; j++) + { + if (!IsFinite(m_flMatVal[i][j])) + return false; + } + } + return true; + } + + bool operator==(const matrix3x4_t& other) const + { + return memcmp(this, &other, sizeof(matrix3x4_t)) == 0; + } + + bool operator!=(const matrix3x4_t& other) const + { + return memcmp(this, &other, sizeof(matrix3x4_t)) != 0; + } + + inline bool IsEqualTo(const matrix3x4_t& other, float flTolerance = 1e-5f) const; + + inline void GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) const; + inline Vector3D TransformVector(const Vector3D& v0) const; + inline Vector3D RotateVector(const Vector3D& v0) const; + 
inline Vector3D TransformVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateExtents(const Vector3D& vBoxExtents) const; // these are extents and must remain positive/symmetric after rotation + inline void TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void TransformPlane(const cplane_t& inPlane, cplane_t& outPlane) const; + inline void TransformPlaneByInverse(const cplane_t& inPlane, cplane_t& outPlane) const; + inline float GetOrthogonalityError() const; + inline float GetDeterminant()const; + inline float GetSylvestersCriterion()const; // for symmetrical matrices only: should be >0 iff it's a positive definite matrix + + inline Vector3D GetColumn(MatrixAxisType_t nColumn) const; + inline void SetColumn(const Vector3D& vColumn, MatrixAxisType_t nColumn); + inline Vector3D GetForward() const { return GetColumn(FORWARD_AXIS); } + inline Vector3D GetLeft() const { return GetColumn(LEFT_AXIS); } + inline Vector3D GetUp() const { return GetColumn(UP_AXIS); } + inline Vector3D GetRow(int nRow) const { return *(Vector3D*)(m_flMatVal[nRow]); } + inline void SetRow(int nRow, const Vector3D& vRow) { m_flMatVal[nRow][0] = vRow.x; m_flMatVal[nRow][1] = vRow.y; m_flMatVal[nRow][2] = vRow.z; } + + inline void InverseTR(matrix3x4_t& out) const; + inline matrix3x4_t InverseTR() const; + + float* operator[](int i) { Assert((i >= 0) && (i < 3)); return m_flMatVal[i]; } const float* operator[](int i) const { Assert((i 
>= 0) && (i < 3)); return m_flMatVal[i]; } float* Base() { return &m_flMatVal[0][0]; } @@ -244,14 +266,50 @@ public: /* matrix3x4a_t() { if (((size_t)Base()) % 16 != 0) { Error( "matrix3x4a_t missaligned" ); } } */ + matrix3x4a_t(const matrix3x4_t& src) { *this = src; }; matrix3x4a_t& operator=(const matrix3x4_t& src) { memcpy(Base(), src.Base(), sizeof(float) * 3 * 4); return *this; }; + + matrix3x4a_t( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) + { + AssertDbg(((size_t)Base() & 0xf) == 0); + m_flMatVal[0][0] = m00; m_flMatVal[0][1] = m01; m_flMatVal[0][2] = m02; m_flMatVal[0][3] = m03; + m_flMatVal[1][0] = m10; m_flMatVal[1][1] = m11; m_flMatVal[1][2] = m12; m_flMatVal[1][3] = m13; + m_flMatVal[2][0] = m20; m_flMatVal[2][1] = m21; m_flMatVal[2][2] = m22; m_flMatVal[2][3] = m23; + } + matrix3x4a_t() {} + + static FORCEINLINE bool TypeIsAlignedForSIMD(void) { return true; } + + + // raw data simd accessor + FORCEINLINE fltx4& SIMDRow(uint nIdx) { AssertDbg(nIdx < 3); return *((fltx4*)(&(m_flMatVal[nIdx]))); } + FORCEINLINE const fltx4& SIMDRow(uint nIdx) const { AssertDbg(nIdx < 3); return *((const fltx4*)(&(m_flMatVal[nIdx]))); } + } ALIGN16_POST; + +FORCEINLINE void matrix3x4_t::ScaleUpper3x3Matrix(float flScale) +{ + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + m_flMatVal[i][j] *= flScale; + } + } +} + + #ifndef M_PI #define M_PI 3.14159265358979323846 // matches value in gcc v2 math.h #endif -#define M_PI_F ((float)(M_PI)) // Shouldn't collide with anything. +#ifndef M_PI_F +#define M_PI_F ((float)(M_PI)) +#endif // NJS: Inlined to prevent floats from being autopromoted to doubles, as with the old system. 
#ifndef RAD2DEG @@ -282,6 +340,7 @@ enum Sides extern bool s_bMathlibInitialized; +extern const matrix3x4a_t g_MatrixIdentity; extern const Vector3D vec3_origin; extern const QAngle vec3_angle; extern const Quaternion quat_identity; @@ -359,7 +418,7 @@ inline void VectorNegate(vec_t* a) // NJS: Some functions in VBSP still need to use these for dealing with mixing vec4's and shorts with vec_t's. // remove when no longer needed. -#define Vector_COPY( A, B ) do { (B)[0] = (A)[0]; (B)[1] = (A)[1]; (B)[2]=(A)[2]; } while(0) +#define VECTOR_COPY( A, B ) do { (B)[0] = (A)[0]; (B)[1] = (A)[1]; (B)[2]=(A)[2]; } while(0) #define DOT_PRODUCT( A, B ) ( (A)[0]*(B)[0] + (A)[1]*(B)[1] + (A)[2]*(B)[2] ) FORCEINLINE void VectorMAInline(const float* start, float scale, const float* direction, float* dest) @@ -396,6 +455,21 @@ inline float VectorLength(const float* v) void CrossProduct(const float* v1, const float* v2, float* cross); +inline float CrossProductX(const Vector3D& v1, const Vector3D& v2) +{ + return v1.y * v2.z - v1.z * v2.y; +} + +inline float CrossProductY(const Vector3D& v1, const Vector3D& v2) +{ + return v1.z * v2.x - v1.x * v2.z; +} + +inline float CrossProductZ(const Vector3D& v1, const Vector3D& v2) +{ + return v1.x * v2.y - v1.y * v2.x; +} + qboolean VectorsEqual(const float* v1, const float* v2); inline vec_t RoundInt(vec_t in) @@ -403,7 +477,7 @@ inline vec_t RoundInt(vec_t in) return floor(in + 0.5f); } -int Q_log2(int val); +size_t Q_log2(unsigned int val); // Math routines done in optimized assembly math package routines void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine) @@ -412,8 +486,8 @@ void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine) XMScalarSinCos(sine, cosine, radians); #elif defined( _PS3 ) #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) - Vector_float_union s; - Vector_float_union c; + vector_float_union s; + vector_float_union c; vec_float4 rad = 
vec_splats(radians); vec_float4 sin; @@ -427,9 +501,9 @@ void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine) *sine = s.f[0]; *cosine = c.f[0]; #else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 - Vector_float_union r; - Vector_float_union s; - Vector_float_union c; + vector_float_union r; + vector_float_union s; + vector_float_union c; vec_float4 rad; vec_float4 sin; @@ -476,6 +550,10 @@ extern float SinCosTable[SIN_TABLE_SIZE]; inline float TableCos(float theta) { +#if defined( LINUX ) + return cos(theta); // under the GCC compiler the float-represented-as-an-int causes an internal compiler error +#else + union { int i; @@ -485,10 +563,14 @@ inline float TableCos(float theta) // ideally, the following should compile down to: theta * constant + constant, changing any of these constants from defines sometimes fubars this. ftmp.f = theta * (float)(SIN_TABLE_SIZE / (2.0f * M_PI)) + (FTOIBIAS + (SIN_TABLE_SIZE / 4)); return SinCosTable[ftmp.i & (SIN_TABLE_SIZE - 1)]; +#endif } inline float TableSin(float theta) { +#if defined( LINUX ) + return sin(theta); // under the GCC compiler the float-represented-as-an-int causes an internal compiler error +#else union { int i; @@ -498,6 +580,7 @@ inline float TableSin(float theta) // ideally, the following should compile down to: theta * constant + constant ftmp.f = theta * (float)(SIN_TABLE_SIZE / (2.0f * M_PI)) + FTOIBIAS; return SinCosTable[ftmp.i & (SIN_TABLE_SIZE - 1)]; +#endif } template @@ -551,16 +634,25 @@ enum ROLL // fall over }; +void MatrixVectorsFLU(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); void MatrixAngles(const matrix3x4_t& matrix, float* angles); // !!!! 
void MatrixVectors(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pRight, Vector3D* pUp); -void VectorTransform(const float* in1, const matrix3x4_t& in2, float* out); +void VectorTransform(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out); void VectorITransform(const float* in1, const matrix3x4_t& in2, float* out); -void VectorRotate(const float* in1, const matrix3x4_t& in2, float* out); +void VectorRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out); void VectorRotate(const Vector3D& in1, const QAngle& in2, Vector3D& out); void VectorRotate(const Vector3D& in1, const Quaternion& in2, Vector3D& out); -void VectorIRotate(const float* in1, const matrix3x4_t& in2, float* out); +void VectorIRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out); -#ifndef Vector_NO_SLOW_OPERATIONS +inline const Vector3D VectorRotate(const Vector3D& vIn1, const Quaternion& qIn2) +{ + Vector3D out; + VectorRotate(vIn1, qIn2, out); + return out; +} + + +#ifndef VECTOR_NO_SLOW_OPERATIONS QAngle TransformAnglesToLocalSpace(const QAngle& angles, const matrix3x4_t& parentMatrix); QAngle TransformAnglesToWorldSpace(const QAngle& angles, const matrix3x4_t& parentMatrix); @@ -581,7 +673,7 @@ void MatrixSetColumn(const Vector3D& in, int column, matrix3x4_t& out); void ConcatRotations(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out); void ConcatTransforms(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out); // faster version assumes m0, m1, out are 16-byte aligned addresses -void ConcatTransforms_Aligned(const matrix3x4_t& m0, const matrix3x4_t& m1, matrix3x4_t& out); +void ConcatTransforms_Aligned(const matrix3x4a_t& m0, const matrix3x4a_t& m1, matrix3x4a_t& out); // For identical interface w/ VMatrix inline void MatrixMultiply(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out) @@ -605,20 +697,242 @@ float QuaternionDotProduct(const Quaternion& p, const Quaternion& q); 
void QuaternionConjugate(const Quaternion& p, Quaternion& q); void QuaternionInvert(const Quaternion& p, Quaternion& q); float QuaternionNormalize(Quaternion& q); +void QuaternionMultiply(const Quaternion& q, const Vector3D& v, Vector3D& result); void QuaternionAdd(const Quaternion& p, const Quaternion& q, Quaternion& qt); void QuaternionMult(const Quaternion& p, const Quaternion& q, Quaternion& qt); void QuaternionMatrix(const Quaternion& q, matrix3x4_t& matrix); void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, matrix3x4_t& matrix); +void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, const Vector3D& vScale, matrix3x4_t& mat); void QuaternionAngles(const Quaternion& q, QAngle& angles); void AngleQuaternion(const QAngle& angles, Quaternion& qt); void QuaternionAngles(const Quaternion& q, RadianEuler& angles); +void QuaternionVectorsFLU(Quaternion const& q, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); +void QuaternionVectorsForward(const Quaternion& q, Vector3D* pForward); void AngleQuaternion(RadianEuler const& angles, Quaternion& qt); void QuaternionAxisAngle(const Quaternion& q, Vector3D& axis, float& angle); void AxisAngleQuaternion(const Vector3D& axis, float angle, Quaternion& q); void BasisToQuaternion(const Vector3D& vecForward, const Vector3D& vecRight, const Vector3D& vecUp, Quaternion& q); void MatrixQuaternion(const matrix3x4_t& mat, Quaternion& q); -// A couple methods to find the dot product of a Vector3D with a matrix row or column... 
+ +void MatrixQuaternionFast(const matrix3x4_t& mat, Quaternion& q); +void MatrixPosition(const matrix3x4_t& matrix, Vector3D& position); +Vector3D MatrixNormalize(const matrix3x4_t& in, matrix3x4_t& out); + +inline void MatrixQuaternion(const matrix3x4_t& mat, Quaternion& q, Vector3D& o) +{ + MatrixQuaternion(mat, q); + MatrixPosition(mat, o); +} + + + +float MatrixQuaternionTest(uint); +float MatrixQuaternionTest2(uint); + +/// qt = p + s * q +void QuaternionAccumulate(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt); + +/// qt = ( s * p ) * q +void QuaternionSM(float s, const Quaternion& p, const Quaternion& q, Quaternion& qt); + +/// qt = p * ( s * q ) +void QuaternionMA(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt); + +/* +//----------------------------------------------------------------------------- +// Quaternion equality with tolerance +//----------------------------------------------------------------------------- +inline bool QuaternionsAreEqualInternal( const Quaternion& src1, const Quaternion& src2, float flTolerance ) +{ + if ( !FloatsAreEqual( src1.x, src2.x, flTolerance ) ) + return false; + + if ( !FloatsAreEqual( src1.y, src2.y, flTolerance ) ) + return false; + + if ( !FloatsAreEqual( src1.z, src2.z, flTolerance ) ) + return false; + + return FloatsAreEqual( src1.w, src2.w, flTolerance ); +} + +inline bool QuaternionsAreEqual( const Quaternion& src1, const Quaternion& src2, float flTolerance ) +{ + if ( QuaternionsAreEqualInternal( src1, src2, flTolerance ) ) + return true; + + // negated quaternions are also 'equal' + Quaternion src2neg( -src2.x, -src2.y, -src2.z, -src2.w ); + return QuaternionsAreEqualInternal( src1, src2neg, flTolerance ); +} +*/ +inline const Quaternion GetNormalized(const Quaternion& q) +{ + float flInv = 1.0f / sqrtf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w); + return Quaternion(q.x * flInv, q.y * flInv, q.z * flInv, q.w * flInv); +} + +inline const Quaternion 
AngleQuaternion(const QAngle& angles) +{ + Quaternion qt; + AngleQuaternion(angles, qt); + return qt; +} + + +inline const Quaternion AngleQuaternion(RadianEuler const& angles) +{ + Quaternion qt; + AngleQuaternion(angles, qt); + return qt; +} + + + +inline Quaternion QuaternionFromPitchYawRoll(float flPitch, float flYaw, float flRoll) +{ + QAngle ang(flPitch, flYaw, flRoll); + + Quaternion q; + AngleQuaternion(ang, q); + return q; +} + +inline Quaternion QuaternionAddPitch(const Quaternion& q, float flPitch) +{ + // FIXME: I know this can be made *tons* faster, but I just want to get something working quickly + // that matches being able to add to the pitch of a QAngles so I can expose Quats to script/game code + QAngle ang; + QuaternionAngles(q, ang); + ang[PITCH] += flPitch; + + Quaternion res; + AngleQuaternion(ang, res); + return res; +} + +inline Quaternion QuaternionAddYaw(const Quaternion& q, float flYaw) +{ + // FIXME: I know this can be made *tons* faster, but I just want to get something working quickly + // that matches being able to add to the yaw of a QAngles so I can expose Quats to script/game code + QAngle ang; + QuaternionAngles(q, ang); + ang[YAW] += flYaw; + + Quaternion res; + AngleQuaternion(ang, res); + return res; +} + +inline Quaternion QuaternionAddRoll(const Quaternion& q, float flRoll) +{ + // FIXME: I know this can be made *tons* faster, but I just want to get something working quickly + // that matches being able to add to the roll of a QAngles so I can expose Quats to script/game code + QAngle ang; + QuaternionAngles(q, ang); + ang[ROLL] += flRoll; + + Quaternion res; + AngleQuaternion(ang, res); + return res; +} + +inline const Quaternion MatrixQuaternion(const matrix3x4_t& mat) +{ + Quaternion tmp; + MatrixQuaternion(mat, tmp); + return tmp; +} + +inline const Quaternion MatrixQuaternionFast(const matrix3x4_t& mat) +{ + Quaternion tmp; + MatrixQuaternionFast(mat, tmp); + return tmp; +} + +inline const matrix3x4_t 
QuaternionMatrix(const Quaternion& q) +{ + matrix3x4_t mat; + QuaternionMatrix(q, mat); + return mat; +} + +inline const matrix3x4_t QuaternionMatrix(const Quaternion& q, const Vector3D& pos) +{ + matrix3x4_t mat; + QuaternionMatrix(q, pos, mat); + return mat; +} + +//! Shortest-arc quaternion that rotates vector v1 into vector v2 +const Quaternion RotateBetween(const Vector3D& v1, const Vector3D& v2); + +inline const Quaternion QuaternionConjugate(const Quaternion& p) +{ + Quaternion q; + QuaternionConjugate(p, q); + return q; +} + +inline const Quaternion QuaternionInvert(const Quaternion& p) +{ + Quaternion q; + QuaternionInvert(p, q); + return q; +} + + + + + +/// Actual quaternion multiplication; NOTE: QuaternionMult aligns quaternions first, so that q * +/// conjugate(q) may be -1 instead of 1! +inline const Quaternion operator * (const Quaternion& p, const Quaternion& q) +{ + Quaternion qt; + qt.x = p.x * q.w + p.y * q.z - p.z * q.y + p.w * q.x; + qt.y = -p.x * q.z + p.y * q.w + p.z * q.x + p.w * q.y; + qt.z = p.x * q.y - p.y * q.x + p.z * q.w + p.w * q.z; + qt.w = -p.x * q.x - p.y * q.y - p.z * q.z + p.w * q.w; + return qt; +} + +inline Quaternion& operator *= (Quaternion& p, const Quaternion& q) +{ + QuaternionMult(p, q, p); + return p; +} + +inline const matrix3x4_t ConcatTransforms(const matrix3x4_t& in1, const matrix3x4_t& in2) +{ + matrix3x4_t out; + ConcatTransforms(in1, in2, out); + return out; +} + +inline const matrix3x4_t operator *(const matrix3x4_t& in1, const matrix3x4_t& in2) +{ + matrix3x4_t out; + ConcatTransforms(in1, in2, out); + return out; +} + + +inline const matrix3x4_t MatrixInvert(const matrix3x4_t& in) +{ + matrix3x4_t out; + ::MatrixInvert(in, out); + return out; +} + +inline const Vector3D MatrixGetColumn(const matrix3x4_t& in, MatrixAxisType_t nColumn) +{ + return in.GetColumn(nColumn); +} + +// A couple methods to find the dot product of a vector with a matrix row or column... 
inline float MatrixRowDotProduct(const matrix3x4_t& in1, int row, const Vector3D& in2) { Assert((row >= 0) && (row < 3)); @@ -755,7 +1069,7 @@ static inline float FLerp(float f1, float f2, float i1, float i2, float x) } -#ifndef Vector_NO_SLOW_OPERATIONS +#ifndef VECTOR_NO_SLOW_OPERATIONS // YWB: Specialization for interpolating euler angles via quaternions... template<> FORCEINLINE QAngle Lerp(float flPercent, const QAngle& q1, const QAngle& q2) @@ -809,7 +1123,7 @@ template<> FORCEINLINE QAngleByValue Lerp(float flPercent, const return output; } -#endif // Vector_NO_SLOW_OPERATIONS +#endif // VECTOR_NO_SLOW_OPERATIONS // Swap two of anything. @@ -829,7 +1143,7 @@ template FORCEINLINE T AVG(T a, T b) // number of elements in an array of static size #define NELEMS(x) ((sizeof(x))/sizeof(x[0])) -// XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myVector)); +// XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myvector)); #define XYZ(v) (v).x,(v).y,(v).z @@ -897,12 +1211,13 @@ int InsideOut(int nTotal, int nCounter); BoxOnPlaneSide( (emins), (emaxs), (p))) //----------------------------------------------------------------------------- -// FIXME: Vector3D versions.... the float versions will go away hopefully soon! +// FIXME: Vector versions.... the float versions will go away hopefully soon! 
//----------------------------------------------------------------------------- void AngleVectors(const QAngle& angles, Vector3D* forward); void AngleVectors(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up); void AngleVectorsTranspose(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up); +void AngleVectorsFLU(const QAngle& angles, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); void AngleMatrix(const QAngle& angles, matrix3x4_t& mat); void AngleMatrix(const QAngle& angles, const Vector3D& position, matrix3x4_t& mat); void AngleMatrix(const RadianEuler& angles, matrix3x4_t& mat); @@ -996,13 +1311,38 @@ inline void VectorTransform(const Vector3D& in1, const matrix3x4_t& in2, Vector3 VectorTransform(&in1.x, in2, &out.x); } +// MSVC folds the return value nicely and creates no temporaries on the stack, +// we need more experiments with different compilers and in different circumstances +inline const Vector3D VectorTransform(const Vector3D& in1, const matrix3x4_t& in2) +{ + Vector3D out; + VectorTransform(in1, in2, out); + return out; +} + +inline const Vector3D VectorRotate(const Vector3D& in1, const matrix3x4_t& in2) +{ + Vector3D out; + VectorRotate(in1, in2, out); + return out; +} + + + inline void VectorITransform(const Vector3D& in1, const matrix3x4_t& in2, Vector3D& out) { VectorITransform(&in1.x, in2, &out.x); } +inline const Vector3D VectorITransform(const Vector3D& in1, const matrix3x4_t& in2) +{ + Vector3D out; + VectorITransform(in1, in2, out); + return out; +} + /* -inline void DecomposeRotation( const matrix3x4_t &mat, Vector3D &out ) +inline void DecomposeRotation( const matrix3x4_t &mat, Vector &out ) { DecomposeRotation( mat, &out.x ); } @@ -1110,7 +1450,9 @@ void BuildGammaTable(float gamma, float texGamma, float brightness, int overbrig // convert texture to linear 0..1 value inline float TexLightToLinear(int c, int exponent) { - extern float power2_n[256]; + // On VS 2013 LTCG builds it is required that 
the array declaration be annotated with + // the same alignment requirements as the array definition. + extern ALIGN128 float power2_n[256]; Assert(exponent >= -128 && exponent <= 127); return (float)c * power2_n[exponent + 128]; } @@ -1129,8 +1471,8 @@ struct ColorRGBExp32 signed char exponent; }; -void ColorRGBExp32ToVector3D(const ColorRGBExp32& in, Vector3D& out); -void Vector3DToColorRGBExp32(const Vector3D& v, ColorRGBExp32& c); +void ColorRGBExp32ToVector(const ColorRGBExp32& in, Vector3D& out); +void VectorToColorRGBExp32(const Vector3D& v, ColorRGBExp32& c); // solve for "x" where "a x^2 + b x + c = 0", return true if solution exists bool SolveQuadratic(float a, float b, float c, float& root1, float& root2); @@ -1151,7 +1493,7 @@ bool SolveInverseQuadraticMonotonic(float x1, float y1, float x2, float y2, // solves for "a, b, c" where "1/(a x^2 + b x + c ) = y", return true if solution exists bool SolveInverseReciprocalQuadratic(float x1, float y1, float x2, float y2, float x3, float y3, float& a, float& b, float& c); -// rotate a Vector3D around the Z axis (YAW) +// rotate a vector around the Z axis (YAW) void VectorYawRotate(const Vector3D& in, float flYaw, Vector3D& out); @@ -1304,10 +1646,11 @@ inline float SimpleSplineRemapValClamped(float val, float A, float B, float C, f if (A == B) return val >= B ? 
D : C; float cVal = (val - A) / (B - A); - cVal = std::clamp(cVal, 0.0f, 1.0f); + cVal = clamp(cVal, 0.0f, 1.0f); return C + (D - C) * SimpleSpline(cVal); } + FORCEINLINE int RoundFloatToInt(float f) { #if defined( _X360 ) @@ -1322,7 +1665,13 @@ FORCEINLINE int RoundFloatToInt(float f) flResult = __fctiw(f); return pResult[1]; #elif defined ( _PS3 ) +#if defined(__SPU__) + int nResult; + nResult = static_cast(f); + return nResult; +#else return __fctiw(f); +#endif #else // !X360 int nResult; #if defined( COMPILER_MSVC32 ) @@ -1361,7 +1710,13 @@ FORCEINLINE unsigned char RoundFloatToByte(float f) return pResult[7]; #elif defined ( _PS3 ) +#if defined(__SPU__) + int nResult; + nResult = static_cast (f) & 0xff; + return nResult; +#else return __fctiw(f); +#endif #else // !X360 int nResult; @@ -1404,7 +1759,11 @@ FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f) Assert(pIntResult[1] >= 0); return pResult[1]; #elif defined ( _PS3 ) +#if defined(__SPU__) + return static_cast(f); +#else return __fctiw(f); +#endif #else // !X360 #if defined( COMPILER_MSVC32 ) @@ -1445,7 +1804,13 @@ FORCEINLINE int Float2Int(float a) flResult = __fctiwz(a); return pResult[1]; #elif defined ( _PS3 ) +#if defined(__SPU__) + int RetVal; + RetVal = static_cast(a); + return RetVal; +#else return __fctiwz(a); +#endif #else // !X360 int RetVal; @@ -1473,6 +1838,8 @@ FORCEINLINE int Float2Int(float a) #endif } + + // Over 15x faster than: (int)floor(value) inline int Floor2Int(float a) { @@ -1801,7 +2168,7 @@ float Hermite_Spline( float t); -void Hermite_SplineBasis(float t, float basis[]); +void Hermite_SplineBasis(float t, float basis[4]); void Hermite_Spline( const Quaternion& q0, @@ -1906,7 +2273,7 @@ float CubicBasis3(float t); // quintic interpolating polynomial from Perlin. 
// 0->0, 1->1, smooth-in between with smooth tangents -FORCEINLINE float QuinticInterpolatingPolynomial(float t) +inline float QuinticInterpolatingPolynomial(float t) { // 6t^5-15t^4+10t^3 return t * t * t * (t * (t * 6.0 - 15.0) + 10.0); @@ -1971,6 +2338,7 @@ bool MathLib_MMXEnabled(void); bool MathLib_SSEEnabled(void); bool MathLib_SSE2Enabled(void); +inline float Approach(float target, float value, float speed); float ApproachAngle(float target, float value, float speed); float AngleDiff(float destAngle, float srcAngle); float AngleDistance(float next, float cur); @@ -1987,7 +2355,7 @@ void RotationDelta(const QAngle& srcAngles, const QAngle& destAngles, QAngle* ou //----------------------------------------------------------------------------- // Clips a line segment such that only the portion in the positive half-space -// of the plane remains. If the segment is entirely clipped, the Vector3Ds +// of the plane remains. If the segment is entirely clipped, the vectors // are set to vec3_invalid (all components are FLT_MAX). // // flBias is added to the dot product with the normal. 
A positive bias @@ -1998,13 +2366,20 @@ void ClipLineSegmentToPlane(const Vector3D& vNormal, const Vector3D& vPlanePoint void ComputeTrianglePlane(const Vector3D& v1, const Vector3D& v2, const Vector3D& v3, Vector3D& normal, float& intercept); int PolyFromPlane(Vector3D* pOutVerts, const Vector3D& normal, float dist, float fHalfScale = 9000.0f); -//void PolyFromPlane_SIMD(fltx4* pOutVerts, const fltx4& plane, float fHalfScale = 9000.0f); +void PolyFromPlane_SIMD(fltx4* pOutVerts, const fltx4& plane, float fHalfScale = 9000.0f); int ClipPolyToPlane(Vector3D* inVerts, int vertCount, Vector3D* outVerts, const Vector3D& normal, float dist, float fOnPlaneEpsilon = 0.1f); -//int ClipPolyToPlane_SIMD(fltx4* pInVerts, int vertCount, fltx4* pOutVerts, const fltx4& plane, float fOnPlaneEpsilon = 0.1f); +int ClipPolyToPlane_SIMD(fltx4* pInVerts, int vertCount, fltx4* pOutVerts, const fltx4& plane, float fOnPlaneEpsilon = 0.1f); int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, const double* normal, double dist, double fOnPlaneEpsilon = 0.1); float TetrahedronVolume(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3); float TriangleArea(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2); +/// return surface area of an AABB +FORCEINLINE float BoxSurfaceArea(Vector3D const& vecBoxMin, Vector3D const& vecBoxMax) +{ + Vector3D boxdim = vecBoxMax - vecBoxMin; + return 2.0 * ((boxdim[0] * boxdim[2]) + (boxdim[0] * boxdim[1]) + (boxdim[1] * boxdim[2])); +} + //----------------------------------------------------------------------------- // Computes a reasonable tangent space for a triangle //----------------------------------------------------------------------------- @@ -2146,7 +2521,7 @@ FORCEINLINE unsigned int* PackNormal_HEND3N(float nx, float ny, float nz, unsign FORCEINLINE float* UnpackNormal_SHORT2(const unsigned int* pPackedNormal, float* pNormal, bool bIsTangent = FALSE) { - // Unpacks from Jason's 2-short 
format (fills in a 4th binormal-sign (+1/-1) value, if this is a tangent Vector3D) + // Unpacks from Jason's 2-short format (fills in a 4th binormal-sign (+1/-1) value, if this is a tangent vector) // FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits) short iX = (*pPackedNormal & 0x0000FFFF); @@ -2183,9 +2558,9 @@ FORCEINLINE float* UnpackNormal_SHORT2(const unsigned int* pPackedNormal, float* FORCEINLINE unsigned int* PackNormal_SHORT2(float nx, float ny, float nz, unsigned int* pPackedNormal, float binormalSign = +1.0f) { - // Pack a Vector3D (ASSUMED TO BE NORMALIZED) into Jason's 4-byte (SHORT2) format. + // Pack a vector (ASSUMED TO BE NORMALIZED) into Jason's 4-byte (SHORT2) format. // This simply reconstructs Z from X & Y. It uses the sign bits of the X & Y coords - // to reconstruct the sign of Z and, if this is a tangent Vector3D, the sign of the + // to reconstruct the sign of Z and, if this is a tangent vector, the sign of the // binormal (this is needed because tangent/binormal vectors are supposed to follow // UV gradients, but shaders reconstruct the binormal from the tangent and normal // assuming that they form a right-handed basis). 
@@ -2204,7 +2579,7 @@ FORCEINLINE unsigned int* PackNormal_SHORT2(float nx, float ny, float nz, unsign if (nz < 0.0f) nx = -nx; // Set the sign bit for z - ny *= binormalSign; // Set the sign bit for the binormal (use when encoding a tangent Vector3D) + ny *= binormalSign; // Set the sign bit for the binormal (use when encoding a tangent vector) // FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits), also use Float2Int() short sX = (short)nx; // signed short [1,32767] @@ -2278,7 +2653,7 @@ FORCEINLINE float* UnpackNormal_UBYTE4(const unsigned int* pPackedNormal, float* // See: http://www.oroboro.com/rafael/docserv.php/index/programming/article/unitv2 // // UBYTE4 encoding, using per-octant projection onto x+y+z=1 -// Assume input Vector3D is already unit length +// Assume input vector is already unit length // // binormalSign specifies 'sign' of binormal, stored in t sign bit of tangent // (lets the shader know whether norm/tan/bin form a right-handed basis) @@ -2359,7 +2734,7 @@ FORCEINLINE void RGB2YUV(int& nR, int& nG, int& nB, float& fY, float& fU, float& dX = 2 * (fU - 0.5f); dY = 2 * (fV - 0.5f); sat = sqrtf(dX * dX + dY * dY); - sat = clamp((int)(sat * (1 + SNAP_TO_GREY) - SNAP_TO_GREY), 0, 1); + sat = clamp((sat * (1 + SNAP_TO_GREY) - SNAP_TO_GREY), 0.f, 1.f); scale = (sat == 0) ? 
0 : MIN((sqrtf(sat) / sat), 4.0f);
     fU = 0.5f + scale * (fU - 0.5f);
     fV = 0.5f + scale * (fV - 0.5f);
@@ -2445,6 +2820,21 @@ inline bool AlmostEqual(const Vector3D& a, const Vector3D& b, int maxUlps = 10)
         AlmostEqual(a.z, b.z, maxUlps);
 }
 
+inline Vector3D Approach(Vector3D target, Vector3D value, float speed)
+{
+    Vector3D diff = (target - value);
+    float delta = diff.Length();
+
+    if (delta > speed)
+        value += diff.Normalized() * speed;
+    else if (delta < -speed)
+        value -= diff.Normalized() * speed;
+    else
+        value = target;
+
+    return value;
+}
+
 inline float Approach(float target, float value, float speed)
 {
     float delta = target - value;
@@ -2472,6 +2862,20 @@ inline float Approach(float target, float value, float speed)
 #endif
 }
 
+
+// return a 0..1 value based on the position of x between edge0 and edge1
+inline float smoothstep_bounds(float edge0, float edge1, float x)
+{
+    x = clamp(static_cast<float>((x - edge0) / (edge1 - edge0)), 0, 1);
+    return x * x * (3 - 2 * x);
+}
+
+// return a value between edge0 and edge1 based on the 0..1 value of x
+inline float interpstep(float edge0, float edge1, float x)
+{
+    return edge0 + (x * (edge1 - edge0));
+}
+
 // on PPC we can do this truncate without converting to int
 #if defined(_X360) || defined(_PS3)
 inline double TruncateFloatToIntAsFloat(double flVal)
 {
 #if defined(_X360)
     double flIntFormat = __fctiwz(flVal);
     return __fcfid(flIntFormat);
 #elif defined(_PS3)
+#if defined(__SPU__)
+    int iVal = int(flVal);
+    return static_cast<double>(iVal);
+#else
     double flIntFormat = __builtin_fctiwz(flVal);
     return __builtin_fcfid(flIntFormat);
 #endif
+#endif
 }
 #endif
@@ -2494,5 +2903,231 @@ inline double SubtractIntegerPart(double flVal)
     return flVal - int(flVal);
 #endif
 }
+
+
+inline void matrix3x4_t::InitFromQAngles(const QAngle& angles, const Vector3D& vPosition)
+{
+    AngleMatrix(angles, vPosition, *this);
+}
+inline void matrix3x4_t::InitFromQAngles(const QAngle& angles) { 
InitFromQAngles(angles, vec3_origin); } + +inline void matrix3x4_t::InitFromRadianEuler(const RadianEuler& angles, const Vector3D& vPosition) +{ + AngleMatrix(angles, vPosition, *this); +} + +inline void matrix3x4_t::InitFromRadianEuler(const RadianEuler& angles) { InitFromRadianEuler(angles, vec3_origin); } + +inline void matrix3x4_t::InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition) +{ + QuaternionMatrix(orientation, vPosition, *this); +} + +inline void matrix3x4_t::InitFromDiagonal(const Vector3D& vDiagonal) +{ + SetToIdentity(); + m_flMatVal[0][0] = vDiagonal.x; + m_flMatVal[1][1] = vDiagonal.y; + m_flMatVal[2][2] = vDiagonal.z; +} + + +inline void matrix3x4_t::InitFromQuaternion(const Quaternion& orientation) { InitFromQuaternion(orientation, vec3_origin); } + +inline Quaternion matrix3x4_t::ToQuaternion() const +{ + return MatrixQuaternion(*this); +} + +inline QAngle matrix3x4_t::ToQAngle() const +{ + QAngle tmp; + MatrixAngles(*this, tmp); + return tmp; +} + +inline void matrix3x4_t::SetToIdentity() +{ + SetIdentityMatrix(*this); +} + +inline bool matrix3x4_t::IsEqualTo(const matrix3x4_t& other, float flTolerance) const +{ + return MatricesAreEqual(*this, other, flTolerance); +} + +inline void matrix3x4_t::GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) const +{ + return MatrixVectorsFLU(*this, pForward, pLeft, pUp); +} + +inline Vector3D matrix3x4_t::TransformVector(const Vector3D& v0) const +{ + return VectorTransform(v0, *this); +} + +inline Vector3D matrix3x4_t::RotateVector(const Vector3D& v0) const +{ + return VectorRotate(v0, *this); +} + +inline Vector3D matrix3x4_t::TransformVectorByInverse(const Vector3D& v0) const +{ + return VectorITransform(v0, *this); +} + +inline Vector3D matrix3x4_t::RotateVectorByInverse(const Vector3D& v0) const +{ + Vector3D tmp; + VectorIRotate(v0, *this, tmp); + return tmp; +} + +inline Vector3D matrix3x4_t::RotateExtents(const Vector3D& vBoxExtents) const +{ + return 
Vector3D(DotProductAbs(vBoxExtents, m_flMatVal[0]), DotProductAbs(vBoxExtents, m_flMatVal[1]), DotProductAbs(vBoxExtents, m_flMatVal[2])); +} + +inline Vector3D matrix3x4_t::GetColumn(MatrixAxisType_t nColumn) const +{ + return Vector3D(m_flMatVal[0][nColumn], m_flMatVal[1][nColumn], m_flMatVal[2][nColumn]); +} + +inline void matrix3x4_t::SetColumn(const Vector3D& vColumn, MatrixAxisType_t nColumn) +{ + m_flMatVal[0][nColumn] = vColumn.x; + m_flMatVal[1][nColumn] = vColumn.y; + m_flMatVal[2][nColumn] = vColumn.z; +} + +inline void matrix3x4_t::InverseTR(matrix3x4_t& out) const +{ + ::MatrixInvert(*this, out); +} + +inline matrix3x4_t matrix3x4_t::InverseTR() const +{ + matrix3x4_t out; + ::MatrixInvert(*this, out); + return out; +} + +inline void matrix3x4_t::TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::TransformAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void matrix3x4_t::TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::ITransformAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void matrix3x4_t::RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::RotateAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} +inline void matrix3x4_t::RotateAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::IRotateAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void matrix3x4_t::TransformPlane(const cplane_t& inPlane, cplane_t& outPlane) const +{ + ::MatrixTransformPlane(*this, inPlane, outPlane); +} +inline void matrix3x4_t::TransformPlaneByInverse(const cplane_t& inPlane, cplane_t& outPlane) const +{ + ::MatrixITransformPlane(*this, inPlane, outPlane); +} + +inline float 
matrix3x4_t::GetOrthogonalityError() const +{ + return + fabsf(m_flMatVal[0][0] * m_flMatVal[0][1] + m_flMatVal[1][0] * m_flMatVal[1][1] + m_flMatVal[2][0] * m_flMatVal[2][1]) + + fabsf(m_flMatVal[0][1] * m_flMatVal[0][2] + m_flMatVal[1][1] * m_flMatVal[1][2] + m_flMatVal[2][1] * m_flMatVal[2][2]) + + fabsf(m_flMatVal[0][2] * m_flMatVal[0][0] + m_flMatVal[1][2] * m_flMatVal[1][0] + m_flMatVal[2][2] * m_flMatVal[2][0]); +} + +inline matrix3x4_t Quaternion::ToMatrix() const +{ + matrix3x4_t mat; + mat.InitFromQuaternion(*this); + return mat; +} + +inline matrix3x4_t QAngle::ToMatrix() const +{ + matrix3x4_t mat; + AngleMatrix(*this, mat); + return mat; +} + +inline Quaternion QAngle::ToQuaternion() const +{ + return AngleQuaternion(*this); +} + +inline float matrix3x4_t::GetDeterminant() const +{ + return + m_flMatVal[0][0] * (m_flMatVal[1][1] * m_flMatVal[2][2] - m_flMatVal[2][1] * m_flMatVal[1][2]) + - m_flMatVal[0][1] * (m_flMatVal[1][0] * m_flMatVal[2][2] - m_flMatVal[1][2] * m_flMatVal[2][0]) + + m_flMatVal[0][2] * (m_flMatVal[1][0] * m_flMatVal[2][1] - m_flMatVal[1][1] * m_flMatVal[2][0]); +} + +inline float GetRelativeDifferenceSqr(const Vector3D& a, const Vector3D& b) +{ + return (a - b).LengthSqr() / Max(1.0f, Max(a.LengthSqr(), b.LengthSqr())); +} + + +inline float GetRelativeDifference(const Vector3D& a, const Vector3D& b) +{ + return sqrtf(GetRelativeDifferenceSqr(a, b)); +} + + +// a good measure of relative error between two TR matrices, perhaps with a reasonable scale +inline float GetRelativeDifference(const matrix3x4_t& a, const matrix3x4_t& b) +{ + return sqrtf(Max(Max(GetRelativeDifferenceSqr(a.GetColumn(X_AXIS), b.GetColumn(X_AXIS)), + GetRelativeDifferenceSqr(a.GetColumn(Y_AXIS), b.GetColumn(Y_AXIS))), + Max(GetRelativeDifferenceSqr(a.GetColumn(Z_AXIS), b.GetColumn(Z_AXIS)), + GetRelativeDifferenceSqr(a.GetOrigin(), b.GetOrigin())) + ) + ); +} + + + +inline float matrix3x4_t::GetSylvestersCriterion()const +{ + // 
http://en.wikipedia.org/wiki/Sylvester%27s_criterion
+    float flDet1 = m_flMatVal[0][0];
+    float flDet2 = m_flMatVal[0][0] * m_flMatVal[1][1] - m_flMatVal[1][0] * m_flMatVal[0][1];
+    float flDet3 = GetDeterminant();
+    return MIN(MIN(flDet1, flDet2), flDet3);
+}
+
+
+
+// Generate the corner points of a box:
+//      +y  _+z
+//      ^   /|
+//      |  /
+//      | 3---7
+//     /| /|
+//    / | / |
+//   2---6  |
+//   |  1|--5
+//   | /  | /
+//   |/   |/
+//   0---4  --> +x
+//
+void PointsFromBox(const Vector3D& mins, const Vector3D& maxs, Vector3D* points);
+void BuildTransformedBox(Vector3D* v2, Vector3D const& bbmin, Vector3D const& bbmax, const matrix3x4_t& m);
+
+
+
 #endif // MATH_BASE_H
diff --git a/r5dev/mathlib/mathlib_base.cpp b/r5dev/mathlib/mathlib_base.cpp
index 25bf1462..0d4ce678 100644
--- a/r5dev/mathlib/mathlib_base.cpp
+++ b/r5dev/mathlib/mathlib_base.cpp
@@ -1,4 +1,4 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
+//===== Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ======//
 //
 // Purpose: Math primitives.
// @@ -7,30 +7,38 @@ /// FIXME: As soon as all references to mathlib.c are gone, include it in here #include "core/stdafx.h" -#include -#include // Needed for FLT_EPSILON #include "tier0/basetypes.h" -#include +//#include #include "tier0/dbg.h" +#include "tier0/cpu.h" +//#include "tier0/vprof.h" //#define _VPROF_MATHLIB +#if !defined(__SPU__) #pragma warning(disable:4244) // "conversion from 'const int' to 'float', possible loss of data" #pragma warning(disable:4730) // "mixing _m64 and floating point expressions may result in incorrect code" +#endif -#include "mathlib/bits.h" -#include "mathlib/vplane.h" -#include "mathlib/Vector.h" -#include "mathlib/Vector2d.h" #include "mathlib/mathlib.h" +#include "mathlib/vector.h" +#include "mathlib/vplane.h" +#if !defined(__SPU__) +#include "mathlib/vmatrix.h" +#endif + +#if !defined( _X360 ) +//#include "sse.h" +#endif #include "mathlib/ssemath.h" -#include "mathlib/math_pfns.h" -#include +#include "mathlib/ssequaternion.h" + +// memdbgon must be the last include file in a .cpp file!!! 
+//#include "tier0/memdbgon.h" bool s_bMathlibInitialized = false; - #ifdef PARANOID // User must provide an implementation of Sys_Error() void Sys_Error(char* error, ...); @@ -38,9 +46,17 @@ void Sys_Error(char* error, ...); const Vector3D vec3_origin(0, 0, 0); const QAngle vec3_angle(0, 0, 0); +const Quaternion quat_identity(0, 0, 0, 1); const Vector3D vec3_invalid(FLT_MAX, FLT_MAX, FLT_MAX); const int nanmask = 255 << 23; +const matrix3x4a_t g_MatrixIdentity( + 1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0 +); + +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Standard C implementations of optimized routines: //----------------------------------------------------------------------------- @@ -57,10 +73,11 @@ float _rsqrtf(float x) return 1.f / _sqrtf(x); } -float FASTCALL _VectorNormalize(Vector3D& vec) +#ifndef PLATFORM_PPC +float VectorNormalize(Vector3D& vec) { #ifdef _VPROF_MATHLIB - VPROF_BUDGET("_Vector3Normalize", "Mathlib"); + VPROF_BUDGET("_VectorNormalize", "Mathlib"); #endif Assert(s_bMathlibInitialized); float radius = sqrtf(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z); @@ -74,6 +91,8 @@ float FASTCALL _VectorNormalize(Vector3D& vec) return radius; } +#endif + // TODO: Add fast C VectorNormalizeFast. // Perhaps use approximate rsqrt trick, if the accuracy isn't too bad. @@ -97,17 +116,11 @@ float _InvRSquared(const float* v) return r2 < 1.f ? 
1.f : 1 / r2; } +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Function pointers selecting the appropriate implementation //----------------------------------------------------------------------------- -float (*pfSqrt)(float x) = _sqrtf; -float (*pfRSqrt)(float x) = _rsqrtf; -float (*pfRSqrtFast)(float x) = _rsqrtf; -float (FASTCALL* pfVectorNormalize)(Vector3D& v) = _VectorNormalize; void (FASTCALL* pfVectorNormalizeFast)(Vector3D& v) = _VectorNormalizeFast; -float (*pfInvRSquared)(const float* v) = _InvRSquared; -void (*pfFastSinCos)(float x, float* s, float* c) = SinCos; -float (*pfFastCos)(float x) = cosf; float SinCosTable[SIN_TABLE_SIZE]; void InitSinCosTable() @@ -117,6 +130,8 @@ void InitSinCosTable() SinCosTable[i] = sin(i * 2.0 * M_PI / SIN_TABLE_SIZE); } } +#endif // !defined(__SPU__) + qboolean VectorsEqual(const float* v1, const float* v2) { @@ -125,11 +140,11 @@ qboolean VectorsEqual(const float* v1, const float* v2) (v1[1] == v2[1]) && (v1[2] == v2[2])); } - +#endif // #if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Generates Euler angles given a left-handed orientation matrix. The -// columns of the matrix contain the forward, left, and up Vector3s. +// columns of the matrix contain the forward, left, and up vectors. // Input : matrix - Left-handed orientation matrix. // angles[PITCH, YAW, ROLL]. Receives right-handed counterclockwise // rotations in degrees around Y, Z, and X respectively. @@ -210,8 +225,8 @@ void MatrixAngles(const matrix3x4_t& matrix, float* angles) float up[3]; // - // Extract the basis Vector3s from the matrix. Since we only need the Z - // component of the up Vector3, we don't get X and Y. + // Extract the basis vectors from the matrix. Since we only need the Z + // component of the up vector, we don't get X and Y. 
// forward[0] = matrix[0][0]; forward[1] = matrix[1][0]; @@ -248,15 +263,39 @@ void MatrixAngles(const matrix3x4_t& matrix, float* angles) } } +Vector3D MatrixNormalize(const matrix3x4_t& in, matrix3x4_t& out) +{ + Vector3D vScale; + vScale.x = sqrt(in[0][0] * in[0][0] + in[1][0] * in[1][0] + in[2][0] * in[2][0]); + vScale.y = sqrt(in[0][1] * in[0][1] + in[1][1] * in[1][1] + in[2][1] * in[2][1]); + vScale.z = sqrt(in[0][2] * in[0][2] + in[1][2] * in[1][2] + in[2][2] * in[2][2]); + matrix3x4_t norm; + float flInvScaleX = 1.0f / vScale.x; + float flInvScaleY = 1.0f / vScale.y; + float flInvScaleZ = 1.0f / vScale.z; + out[0][0] = in[0][0] * flInvScaleX; out[1][0] = in[1][0] * flInvScaleX; out[2][0] = in[2][0] * flInvScaleX; + out[0][1] = in[0][1] * flInvScaleY; out[1][1] = in[1][1] * flInvScaleY; out[2][1] = in[2][1] * flInvScaleY; + out[0][2] = in[0][2] * flInvScaleZ; out[1][2] = in[1][2] * flInvScaleZ; out[2][2] = in[2][2] * flInvScaleZ; + out[0][3] = in[0][3]; out[1][3] = in[1][3]; out[2][3] = in[2][3]; + + return vScale; +} + + + +#if !defined(__SPU__) // transform in1 by the matrix in2 -void VectorTransform(const float* in1, const matrix3x4_t& in2, float* out) +void VectorTransform(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out) { Assert(s_bMathlibInitialized); - Assert(in1 != out); - out[0] = DotProduct(in1, in2[0]) + in2[0][3]; - out[1] = DotProduct(in1, in2[1]) + in2[1][3]; - out[2] = DotProduct(in1, in2[2]) + in2[2][3]; + float x = DotProduct(in1, in2[0]) + in2[0][3]; + float y = DotProduct(in1, in2[1]) + in2[1][3]; + float z = DotProduct(in1, in2[2]) + in2[2][3]; + + out[0] = x; + out[1] = y; + out[2] = z; } @@ -270,23 +309,31 @@ void VectorITransform(const float* in1, const matrix3x4_t& in2, float* out) in1t[1] = in1[1] - in2[1][3]; in1t[2] = in1[2] - in2[2][3]; - out[0] = in1t[0] * in2[0][0] + in1t[1] * in2[1][0] + in1t[2] * in2[2][0]; - out[1] = in1t[0] * in2[0][1] + in1t[1] * in2[1][1] + in1t[2] * in2[2][1]; - out[2] = in1t[0] * 
in2[0][2] + in1t[1] * in2[1][2] + in1t[2] * in2[2][2]; + float x = in1t[0] * in2[0][0] + in1t[1] * in2[1][0] + in1t[2] * in2[2][0]; + float y = in1t[0] * in2[0][1] + in1t[1] * in2[1][1] + in1t[2] * in2[2][1]; + float z = in1t[0] * in2[0][2] + in1t[1] * in2[1][2] + in1t[2] * in2[2][2]; + + out[0] = x; + out[1] = y; + out[2] = z; } +#endif // #if !defined(__SPU__) - -// assume in2 is a rotation and rotate the input Vector3D -void VectorRotate(const float* in1, const matrix3x4_t& in2, float* out) +// assume in2 is a rotation and rotate the input vector +void VectorRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out) { Assert(s_bMathlibInitialized); - Assert(in1 != out); - out[0] = DotProduct(in1, in2[0]); - out[1] = DotProduct(in1, in2[1]); - out[2] = DotProduct(in1, in2[2]); + float x = DotProduct(in1, in2[0]); + float y = DotProduct(in1, in2[1]); + float z = DotProduct(in1, in2[2]); + + out[0] = x; + out[1] = y; + out[2] = z; } -// assume in2 is a rotation and rotate the input Vector3D +#if !defined(__SPU__) +// assume in2 is a rotation and rotate the input vector void VectorRotate(const Vector3D& in1, const QAngle& in2, Vector3D& out) { matrix3x4_t matRotate; @@ -294,17 +341,38 @@ void VectorRotate(const Vector3D& in1, const QAngle& in2, Vector3D& out) VectorRotate(in1, matRotate, out); } -// assume in2 is a rotation and rotate the input Vector3D +// assume in2 is a rotation and rotate the input vector void VectorRotate(const Vector3D& in1, const Quaternion& in2, Vector3D& out) { +#if WE_WANT_OUR_CODE_TO_BE_POINTLESSLY_SLOW matrix3x4_t matRotate; QuaternionMatrix(in2, matRotate); VectorRotate(in1, matRotate, out); +#else + // rotation is q * v * q^-1 + + Quaternion conjugate = in2.Conjugate(); + + + // do the rotation as unrolled flop code ( QuaternionMult is a function call, which murders instruction scheduling ) + // first q*v + Quaternion temp; + temp.x = in2.y * in1.z - in2.z * in1.y + in2.w * in1.x; + temp.y = -in2.x * in1.z + in2.z * 
in1.x + in2.w * in1.y; + temp.z = in2.x * in1.y - in2.y * in1.x + in2.w * in1.z; + temp.w = -in2.x * in1.x - in2.y * in1.y - in2.z * in1.z; + + // now (qv)(q*) + out.x = temp.x * conjugate.w + temp.y * conjugate.z - temp.z * conjugate.y + temp.w * conjugate.x; + out.y = -temp.x * conjugate.z + temp.y * conjugate.w + temp.z * conjugate.x + temp.w * conjugate.y; + out.z = temp.x * conjugate.y - temp.y * conjugate.x + temp.z * conjugate.w + temp.w * conjugate.z; + Assert(fabs(-temp.x * conjugate.x - temp.y * conjugate.y - temp.z * conjugate.z + temp.w * conjugate.w) < 0.0001); +#endif } // rotate by the inverse of the matrix -void VectorIRotate(const float* in1, const matrix3x4_t& in2, float* out) +void VectorIRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out) { Assert(s_bMathlibInitialized); Assert(in1 != out); @@ -313,7 +381,7 @@ void VectorIRotate(const float* in1, const matrix3x4_t& in2, float* out) out[2] = in1[0] * in2[0][2] + in1[1] * in2[1][2] + in1[2] * in2[2][2]; } -#ifndef Vector_NO_SLOW_OPERATIONS +#ifndef VECTOR_NO_SLOW_OPERATIONS // transform a set of angles in the output space of parentMatrix to the input space QAngle TransformAnglesToLocalSpace(const QAngle& angles, const matrix3x4_t& parentMatrix) { @@ -338,7 +406,7 @@ QAngle TransformAnglesToWorldSpace(const QAngle& angles, const matrix3x4_t& pare return out; } -#endif // Vector3D_NO_SLOW_OPERATIONS +#endif // VECTOR_NO_SLOW_OPERATIONS void MatrixInitialize(matrix3x4_t& mat, const Vector3D& vecOrigin, const Vector3D& vecXAxis, const Vector3D& vecYAxis, const Vector3D& vecZAxis) { @@ -369,6 +437,8 @@ bool MatricesAreEqual(const matrix3x4_t& src1, const matrix3x4_t& src2, float fl } return true; } +#endif // #if !defined(__SPU__) + // NOTE: This is just the transpose not a general inverse void MatrixInvert(const matrix3x4_t& in, matrix3x4_t& out) @@ -421,34 +491,7 @@ void MatrixSetColumn(const Vector3D& in, int column, matrix3x4_t& out) out[2][column] = in.z; } -void 
MatrixScaleBy(const float flScale, matrix3x4_t& out) -{ - out[0][0] *= flScale; - out[1][0] *= flScale; - out[2][0] *= flScale; - out[0][1] *= flScale; - out[1][1] *= flScale; - out[2][1] *= flScale; - out[0][2] *= flScale; - out[1][2] *= flScale; - out[2][2] *= flScale; -} - -void MatrixScaleByZero(matrix3x4_t& out) -{ - out[0][0] = 0.0f; - out[1][0] = 0.0f; - out[2][0] = 0.0f; - out[0][1] = 0.0f; - out[1][1] = 0.0f; - out[2][1] = 0.0f; - out[0][2] = 0.0f; - out[1][2] = 0.0f; - out[2][2] = 0.0f; -} - - - +#if !defined(__SPU__) int VectorCompare(const float* v1, const float* v2) { Assert(s_bMathlibInitialized); @@ -471,15 +514,28 @@ void CrossProduct(const float* v1, const float* v2, float* cross) cross[2] = v1[0] * v2[1] - v1[1] * v2[0]; } -int Q_log2(int val) +size_t Q_log2(unsigned int val) { +#ifdef _X360 // use hardware + // both zero and one return zero (per old implementation) + return (val == 0) ? 0 : 31 - _CountLeadingZeros(val); +#else // use N. Compoop's algorithm ( inherited from days of yore ) int answer = 0; while (val >>= 1) answer++; return answer; +#endif } -// Matrix is right-handed x=forward, y=left, z=up. We a left-handed convention for Vector3Ds in the game code (forward, right, up) +// Matrix is right-handed x=forward, y=left, z=up. We a left-handed convention for vectors in the game code (forward, right, up) +void MatrixVectorsFLU(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + MatrixGetColumn(matrix, FORWARD_AXIS, *pForward); + MatrixGetColumn(matrix, LEFT_AXIS, *pLeft); + MatrixGetColumn(matrix, UP_AXIS, *pUp); +} + +// Matrix is right-handed x=forward, y=left, z=up. 
We a left-handed convention for vectors in the game code (forward, right, up) void MatrixVectors(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pRight, Vector3D* pUp) { MatrixGetColumn(matrix, 0, *pForward); @@ -494,7 +550,7 @@ void VectorVectors(const Vector3D& forward, Vector3D& right, Vector3D& up) Assert(s_bMathlibInitialized); Vector3D tmp; - if (forward[0] == 0 && forward[1] == 0) + if (fabs(forward[0]) < 1e-6 && fabs(forward[1]) < 1e-6) { // pitch 90 degrees up/down from identity right[0] = 0; @@ -525,6 +581,62 @@ void VectorMatrix(const Vector3D& forward, matrix3x4_t& matrix) MatrixSetColumn(up, 2, matrix); } +void VectorPerpendicularToVector(Vector3D const& in, Vector3D* pvecOut) +{ + float flY = in.y * in.y; + pvecOut->x = RemapVal(flY, 0, 1, in.z, 1); + pvecOut->y = 0; + pvecOut->z = -in.x; + pvecOut->NormalizeInPlace(); + float flDot = DotProduct(*pvecOut, in); + *pvecOut -= flDot * in; + pvecOut->NormalizeInPlace(); +} + +//----------------------------------------------------------------------------- +// Euler QAngle -> Basis Vectors. 
Each vector is optional +//----------------------------------------------------------------------------- +void AngleVectorsFLU(const QAngle& angles, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + Assert(s_bMathlibInitialized); + + float sr, sp, sy, cr, cp, cy; + +#ifdef _X360 + fltx4 radians, scale, sine, cosine; + radians = LoadUnaligned3SIMD(angles.Base()); + scale = ReplicateX4(M_PI_F / 180.f); + radians = MulSIMD(radians, scale); + SinCos3SIMD(sine, cosine, radians); + sp = SubFloat(sine, 0); sy = SubFloat(sine, 1); sr = SubFloat(sine, 2); + cp = SubFloat(cosine, 0); cy = SubFloat(cosine, 1); cr = SubFloat(cosine, 2); +#else + SinCos(DEG2RAD(angles[YAW]), &sy, &cy); + SinCos(DEG2RAD(angles[PITCH]), &sp, &cp); + SinCos(DEG2RAD(angles[ROLL]), &sr, &cr); +#endif + + if (pForward) + { + (*pForward)[FORWARD_AXIS] = cp * cy; + (*pForward)[LEFT_AXIS] = cp * sy; + (*pForward)[UP_AXIS] = -sp; + } + + if (pLeft) + { + (*pLeft)[FORWARD_AXIS] = (sr * sp * cy + cr * -sy); + (*pLeft)[LEFT_AXIS] = (sr * sp * sy + cr * cy); + (*pLeft)[UP_AXIS] = sr * cp; + } + + if (pUp) + { + (*pUp)[FORWARD_AXIS] = (cr * sp * cy + -sr * -sy); + (*pUp)[LEFT_AXIS] = (cr * sp * sy + -sr * cy); + (*pUp)[UP_AXIS] = cr * cp; + } +} void VectorAngles(const float* forward, float* angles) { @@ -562,7 +674,7 @@ void VectorAngles(const float* forward, float* angles) R_ConcatRotations ================ */ -void ConcatRotations(const float in1[3][3], const float in2[3][3], float out[3][3]) +void ConcatRotations(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out) { Assert(s_bMathlibInitialized); Assert(in1 != out); @@ -586,12 +698,14 @@ void ConcatRotations(const float in1[3][3], const float in2[3][3], float out[3][ out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] + in1[2][2] * in2[2][2]; } +#endif // #if !defined(__SPU__) -void ConcatTransforms_Aligned(const matrix3x4_t& m0, const matrix3x4_t& m1, matrix3x4_t& out) + +void ConcatTransforms_Aligned(const matrix3x4a_t& m0, 
const matrix3x4a_t& m1, matrix3x4a_t& out) { - Assert((((size_t)&m0) % 16) == 0); - Assert((((size_t)&m1) % 16) == 0); - Assert((((size_t)&out) % 16) == 0); + //AssertAligned(&m0); + //AssertAligned(&m1); + //AssertAligned(&out); fltx4 lastMask = *(fltx4*)(&g_SIMD_ComponentMask[3]); fltx4 rowA0 = LoadAlignedSIMD(m0.m_flMatVal[0]); @@ -630,7 +744,7 @@ void ConcatTransforms_Aligned(const matrix3x4_t& m0, const matrix3x4_t& m1, matr fltx4 mul22 = MulSIMD(A2, rowB2); fltx4 out2 = AddSIMD(mul20, AddSIMD(mul21, mul22)); - // add in translation Vector3D + // add in translation vector A0 = AndSIMD(rowA0, lastMask); A1 = AndSIMD(rowA1, lastMask); A2 = AndSIMD(rowA2, lastMask); @@ -697,7 +811,7 @@ void ConcatTransforms(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_ fltx4 mul22 = MulSIMD(A2, rowB2); fltx4 out2 = AddSIMD(mul20, AddSIMD(mul21, mul22)); - // add in translation Vector3D + // add in translation vector A0 = AndSIMD(rowA0, lastMask); A1 = AndSIMD(rowA1, lastMask); A2 = AndSIMD(rowA2, lastMask); @@ -721,7 +835,7 @@ numer and denom, both of which should contain no fractional part. The quotient must fit in 32 bits. ==================== */ - +#if !defined(__SPU__) void FloorDivMod(double numer, double denom, int* quotient, int* rem) { @@ -889,7 +1003,7 @@ int __cdecl BoxOnPlaneSide(const float* emins, const float* emaxs, const cplane_ } //----------------------------------------------------------------------------- -// Euler QAngle -> Basis Vector3Ds +// Euler QAngle -> Basis Vectors //----------------------------------------------------------------------------- void AngleVectors(const QAngle& angles, Vector3D* forward) @@ -908,7 +1022,7 @@ void AngleVectors(const QAngle& angles, Vector3D* forward) } //----------------------------------------------------------------------------- -// Euler QAngle -> Basis Vector3Ds. Each Vector3D is optional +// Euler QAngle -> Basis Vectors. 
Each vector is optional //----------------------------------------------------------------------------- void AngleVectors(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up) { @@ -953,7 +1067,7 @@ void AngleVectors(const QAngle& angles, Vector3D* forward, Vector3D* right, Vect } //----------------------------------------------------------------------------- -// Euler QAngle -> Basis Vector3Ds transposed +// Euler QAngle -> Basis Vectors transposed //----------------------------------------------------------------------------- void AngleVectorsTranspose(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up) @@ -988,7 +1102,7 @@ void AngleVectorsTranspose(const QAngle& angles, Vector3D* forward, Vector3D* ri } //----------------------------------------------------------------------------- -// Forward direction Vector3D -> Euler angles +// Forward direction vector -> Euler angles //----------------------------------------------------------------------------- void VectorAngles(const Vector3D& forward, QAngle& angles) @@ -1022,7 +1136,7 @@ void VectorAngles(const Vector3D& forward, QAngle& angles) } //----------------------------------------------------------------------------- -// Forward direction Vector3D with a reference up Vector3D -> Euler angles +// Forward direction vector with a reference up vector -> Euler angles //----------------------------------------------------------------------------- void VectorAngles(const Vector3D& forward, const Vector3D& pseudoup, QAngle& angles) @@ -1067,6 +1181,8 @@ void VectorAngles(const Vector3D& forward, const Vector3D& pseudoup, QAngle& ang } } +#endif // #if !defined(__SPU__) + void SetIdentityMatrix(matrix3x4_t& matrix) { memset(matrix.Base(), 0, sizeof(float) * 3 * 4); @@ -1076,6 +1192,7 @@ void SetIdentityMatrix(matrix3x4_t& matrix) } +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Builds a scale matrix 
//----------------------------------------------------------------------------- @@ -1154,13 +1271,13 @@ void MatrixTranspose(const matrix3x4_t& src, matrix3x4_t& dst) dst[1][0] = src[0][1]; dst[1][1] = src[1][1]; dst[1][2] = src[2][1]; dst[1][3] = 0.0f; dst[2][0] = src[0][2]; dst[2][1] = src[1][2]; dst[2][2] = src[2][2]; dst[2][3] = 0.0f; } - +#endif // #if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: converts engine euler angles into a matrix // Input : vec3_t angles - PITCH, YAW, ROLL // Output : *matrix - left-handed column matrix -// the basis Vector3Ds for the rotations will be in the columns as follows: +// the basis vectors for the rotations will be in the columns as follows: // matrix[][0] is forward // matrix[][1] is left // matrix[][2] is up @@ -1214,16 +1331,12 @@ void AngleMatrix(const QAngle& angles, matrix3x4_t& matrix) matrix[1][0] = cp * sy; matrix[2][0] = -sp; - float crcy = cr * cy; - float crsy = cr * sy; - float srcy = sr * cy; - float srsy = sr * sy; - matrix[0][1] = sp * srcy - crsy; - matrix[1][1] = sp * srsy + crcy; + // NOTE: Do not optimize this to reduce multiplies! optimizer bug will screw this up. 
+ matrix[0][1] = sr * sp * cy + cr * -sy; + matrix[1][1] = sr * sp * sy + cr * cy; matrix[2][1] = sr * cp; - - matrix[0][2] = (sp * crcy + srsy); - matrix[1][2] = (sp * crsy - srcy); + matrix[0][2] = (cr * sp * cy + -sr * -sy); + matrix[1][2] = (cr * sp * sy + -sr * cy); matrix[2][2] = cr * cp; matrix[0][3] = 0.0f; @@ -1231,6 +1344,7 @@ void AngleMatrix(const QAngle& angles, matrix3x4_t& matrix) matrix[2][3] = 0.0f; } +#if !defined(__SPU__) void AngleIMatrix(const RadianEuler& angles, matrix3x4_t& matrix) { QAngle quakeEuler(RAD2DEG(angles.y), RAD2DEG(angles.z), RAD2DEG(angles.x)); @@ -1271,8 +1385,9 @@ void AngleIMatrix(const QAngle& angles, const Vector3D& position, matrix3x4_t& m vecTranslation *= -1.0f; MatrixSetColumn(vecTranslation, 3, mat); } +#endif // #if !defined(__SPU__) - +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Bounding box construction methods //----------------------------------------------------------------------------- @@ -1280,8 +1395,8 @@ void AngleIMatrix(const QAngle& angles, const Vector3D& position, matrix3x4_t& m void ClearBounds(Vector3D& mins, Vector3D& maxs) { Assert(s_bMathlibInitialized); - mins[0] = mins[1] = mins[2] = 99999; - maxs[0] = maxs[1] = maxs[2] = -99999; + mins[0] = mins[1] = mins[2] = FLT_MAX; + maxs[0] = maxs[1] = maxs[2] = -FLT_MAX; } void AddPointToBounds(const Vector3D& v, Vector3D& mins, Vector3D& maxs) @@ -1300,6 +1415,32 @@ void AddPointToBounds(const Vector3D& v, Vector3D& mins, Vector3D& maxs) } } +bool AreBoundsValid(const Vector3D& vMin, const Vector3D& vMax) +{ + for (int i = 0; i < 3; ++i) + { + if (vMin[i] > vMax[i]) + { + return false; + } + } + + return true; +} + +bool IsPointInBounds(const Vector3D& vPoint, const Vector3D& vMin, const Vector3D& vMax) +{ + for (int i = 0; i < 3; ++i) + { + if (vPoint[i] < vMin[i] || vPoint[i] > vMax[i]) + { + return false; + } + } + + return true; +} + // solve a x^2 + b x + c = 0 bool SolveQuadratic(float a, 
float b, float c, float& root1, float& root2) { @@ -1423,7 +1564,7 @@ bool SolveInverseReciprocalQuadratic(float x1, float y1, float x2, float y2, flo } -// Rotate a Vector3D around the Z axis (YAW) +// Rotate a vector around the Z axis (YAW) void VectorYawRotate(const Vector3D& in, float flYaw, Vector3D& out) { Assert(s_bMathlibInitialized); @@ -1455,9 +1596,7 @@ float Bias(float x, float biasAmt) { lastExponent = log(biasAmt) * -1.4427f; // (-1.4427 = 1 / log(0.5)) } - float fRet = pow(x, lastExponent); - Assert(!IS_NAN(fRet)); - return fRet; + return pow(x, lastExponent); } @@ -1473,9 +1612,7 @@ float Gain(float x, float biasAmt) float SmoothCurve(float x) { - // Actual smooth curve. Visualization: - // http://www.wolframalpha.com/input/?i=plot%5B+0.5+*+%281+-+cos%5B2+*+pi+*+x%5D%29+for+x+%3D+%280%2C+1%29+%5D - return 0.5f * (1 - cos(2.0f * M_PI * x)); + return (1 - cos(x * M_PI)) * 0.5f; } @@ -1496,118 +1633,7 @@ float SmoothCurve_Tweak(float x, float flPeakPos, float flPeakSharpness) return SmoothCurve(flSharpened); } -void QuaternionExp(const Quaternion& p, Quaternion& q) -{ - float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); - float et = exp(p[3]); - float s = r >= 0.00001f ? et * sin(r) / r : 0.f; - q.Init(s * p[0], s * p[1], s * p[2], et * cos(r)); -} - -void QuaternionLn(const Quaternion& p, Quaternion& q) -{ - float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); - float t = r > 0.00001f ? atan2(r, p[3]) / r : 0.f; - float norm = p[0] * p[0] + p[1] * p[1] + p[2] * p[2] + p[3] * p[3]; - q.Init(t * p[0], t * p[1], t * p[2], 0.5 * log(norm)); -} - -// Average using exponential method -// Qave = exp( 1 / n * log( Q1 ) + ... 
+ 1 / n * log( Qn ) ) where -// if pflWeights passed in 1/n is replaced by normalized weighting -void QuaternionAverageExponential(Quaternion& q, int nCount, const Quaternion* pQuaternions, const float* pflWeights /*=NULL*/) -{ - Assert(nCount >= 1); - Assert(pQuaternions); - - // Nothing to do if only one input quaternions - if (nCount == 1) - { - q = pQuaternions[0]; - return; - } - - float ooWeightSum = 1.0f; - float flWeightSum = 0.0f; - for (int i = 0; i < nCount; ++i) - { - if (pflWeights) - { - flWeightSum += pflWeights[i]; - } - else - { - flWeightSum += 1.0f; - } - } - - if (flWeightSum > 0.0f) - { - ooWeightSum = 1.0f / flWeightSum; - } - - Quaternion sum(0, 0, 0, 0); - // Now sum the ln of the quaternions - for (int i = 0; i < nCount; ++i) - { - float weight = ooWeightSum; - if (pflWeights) - { - weight *= pflWeights[i]; - } - - // Make sure all quaternions are aligned with the - // first to avoid blending the wrong direction. - Quaternion alignedQuat; - QuaternionAlign(pQuaternions[0], pQuaternions[i], alignedQuat); - - Quaternion qLn; - QuaternionLn(alignedQuat, qLn); - for (int j = 0; j < 4; ++j) - { - sum[j] += (qLn[j] * weight); - } - } - - // then exponentiate to get final value - QuaternionExp(sum, q); -} - -// Given a vector and a pseudo-up reference vector, create a quaternion which represents -// the orientation of the forward vector. 
Note, will be unstable if vecForward is close -// to referenceUp -void QuaternionLookAt(const Vector3D& vecForward, const Vector3D& referenceUp, Quaternion& q) -{ - Vector3D forward = vecForward; - forward.NormalizeInPlace(); - float ratio = DotProduct(forward, referenceUp); - Vector3D up = referenceUp - (forward * ratio); - up.NormalizeInPlace(); - - Vector3D right = forward.Cross(up); - right.NormalizeInPlace(); - - const Vector3D& x = right; - const Vector3D& y = forward; - const Vector3D& z = up; - - float tr = x.x + y.y + z.z; - q.Init(y.z - z.y, z.x - x.z, x.y - y.x, tr + 1.0f); - QuaternionNormalize(q); - - /* - Vector z = vecForward; - z.NormalizeInPlace(); - Vector x = referenceUp.Cross( z ); - x.NormalizeInPlace(); - Vector y = z.Cross( x ); - y.NormalizeInPlace(); - - float tr = x.x + y.y + z.z; - q.Init( y.z - z.y , z.x - x.z, x.y - y.x, tr + 1.0f ); - QuaternionNormalize( q ); - */ -} +#endif // !defined(__SPU__) //----------------------------------------------------------------------------- // make sure quaternions are within 180 degrees of one another, if not, reverse q @@ -1764,7 +1790,7 @@ void QuaternionSlerpNoAlign(const Quaternion& p, const Quaternion& q, float t, Q Assert(qt.IsValid()); } - +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Returns the angular delta between the two normalized quaternions in degrees. 
//----------------------------------------------------------------------------- @@ -1836,6 +1862,18 @@ void QuaternionInvert(const Quaternion& p, Quaternion& q) } } +void QuaternionMultiply(const Quaternion& q, const Vector3D& v, Vector3D& result) +{ + Vector3D t, t2; + CrossProduct(q.ImaginaryPart(), v, t); + t *= 2.0f; + VectorMA(v, q.RealPart(), t, result); + CrossProduct(q.ImaginaryPart(), t, t2); + result += t2; +} + +#endif // #if !defined(__SPU__) + //----------------------------------------------------------------------------- // Make sure the quaternion is of unit length //----------------------------------------------------------------------------- @@ -1882,7 +1920,7 @@ void QuaternionScale(const Quaternion& p, float t, Quaternion& q) // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. float sinom = sqrt(DotProduct(&p.x, &p.x)); - sinom = min(sinom, 1.f); + sinom = MIN(sinom, 1.f); float sinsom = sin(asin(sinom) * t); @@ -1965,12 +2003,126 @@ void QuaternionMult(const Quaternion& p, const Quaternion& q, Quaternion& qt) } +#if !defined(__SPU__) + +void QuaternionExp(const Quaternion& p, Quaternion& q) +{ + float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); + float et = exp(p[3]); + float s = r >= 0.00001f ? et * sin(r) / r : 0.f; + q.Init(s * p[0], s * p[1], s * p[2], et * cos(r)); +} + +void QuaternionLn(const Quaternion& p, Quaternion& q) +{ + float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); + float t = r > 0.00001f ? atan2(r, p[3]) / r : 0.f; + float norm = p[0] * p[0] + p[1] * p[1] + p[2] * p[2] + p[3] * p[3]; + q.Init(t * p[0], t * p[1], t * p[2], 0.5 * log(norm)); +} + +// Average using exponential method +// Qave = exp( 1 / n * log( Q1 ) + ... 
+ 1 / n * log( Qn ) ) where +// if pflWeights passed in 1/n is replaced by normalized weighting +void QuaternionAverageExponential(Quaternion& q, int nCount, const Quaternion* pQuaternions, const float* pflWeights /*=NULL*/) +{ + Assert(nCount >= 1); + Assert(pQuaternions); + + // Nothing to do if only one input quaternions + if (nCount == 1) + { + q = pQuaternions[0]; + return; + } + + float ooWeightSum = 1.0f; + float flWeightSum = 0.0f; + for (int i = 0; i < nCount; ++i) + { + if (pflWeights) + { + flWeightSum += pflWeights[i]; + } + else + { + flWeightSum += 1.0f; + } + } + + if (flWeightSum > 0.0f) + { + ooWeightSum = 1.0f / flWeightSum; + } + + Quaternion sum(0, 0, 0, 0); + // Now sum the ln of the quaternions + for (int i = 0; i < nCount; ++i) + { + float weight = ooWeightSum; + if (pflWeights) + { + weight *= pflWeights[i]; + } + + // Make sure all quaternions are aligned with the + // first to avoid blending the wrong direction. + Quaternion alignedQuat; + QuaternionAlign(pQuaternions[0], pQuaternions[i], alignedQuat); + + Quaternion qLn; + QuaternionLn(alignedQuat, qLn); + for (int j = 0; j < 4; ++j) + { + sum[j] += (qLn[j] * weight); + } + } + + // then exponentiate to get final value + QuaternionExp(sum, q); +} + +// Given a vector and a pseudo-up reference vector, create a quaternion which represents +// the orientation of the forward vector. 
Note, will be unstable if vecForward is close +// to referenceUp +void QuaternionLookAt(const Vector3D& vecForward, const Vector3D& referenceUp, Quaternion& q) +{ + Vector3D forward = vecForward; + forward.NormalizeInPlace(); + float ratio = DotProduct(forward, referenceUp); + Vector3D up = referenceUp - (forward * ratio); + up.NormalizeInPlace(); + + Vector3D right = forward.Cross(up); + right.NormalizeInPlace(); + + const Vector3D& x = right; + const Vector3D& y = forward; + const Vector3D& z = up; + + float tr = x.x + y.y + z.z; + q.Init(y.z - z.y, z.x - x.z, x.y - y.x, tr + 1.0f); + QuaternionNormalize(q); + + /* + Vector z = vecForward; + z.NormalizeInPlace(); + Vector x = referenceUp.Cross( z ); + x.NormalizeInPlace(); + Vector y = z.Cross( x ); + y.NormalizeInPlace(); + + float tr = x.x + y.y + z.z; + q.Init( y.z - z.y , z.x - x.z, x.y - y.x, tr + 1.0f ); + QuaternionNormalize( q ); + */ +} + +#endif // !defined(__SPU__) + void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, matrix3x4_t& matrix) { - if (!HushAsserts()) - { - Assert(pos.IsValid()); - } + Assert(pos.IsValid()); QuaternionMatrix(q, matrix); @@ -1979,13 +2131,25 @@ void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, matrix3x4_t& mat matrix[2][3] = pos.z; } +void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, const Vector3D& vScale, matrix3x4_t& mat) +{ + Assert(pos.IsValid()); + Assert(q.IsValid()); + Assert(vScale.IsValid()); + + QuaternionMatrix(q, mat); + + mat[0][0] *= vScale.x; mat[1][0] *= vScale.x; mat[2][0] *= vScale.x; + mat[0][1] *= vScale.y; mat[1][1] *= vScale.y; mat[2][1] *= vScale.y; + mat[0][2] *= vScale.z; mat[1][2] *= vScale.z; mat[2][2] *= vScale.z; + mat[0][3] = pos.x; mat[1][3] = pos.y; mat[2][3] = pos.z; +} + + void QuaternionMatrix(const Quaternion& q, matrix3x4_t& matrix) { Assert(s_bMathlibInitialized); - if (!HushAsserts()) - { - Assert(q.IsValid()); - } + Assert(q.IsValid()); #ifdef _VPROF_MATHLIB VPROF_BUDGET("QuaternionMatrix", 
"Mathlib"); @@ -2045,6 +2209,109 @@ void QuaternionMatrix(const Quaternion& q, matrix3x4_t& matrix) } +const Vector3D Quaternion::GetForward()const +{ + Vector3D vAxisX; + vAxisX.x = 1.0 - 2.0 * y * y - 2.0 * z * z; + vAxisX.y = 2.0 * x * y + 2.0 * w * z; + vAxisX.z = 2.0 * x * z - 2.0 * w * y; + return vAxisX; +} + + +const Vector3D Quaternion::GetLeft()const +{ + Vector3D vAxisY; + vAxisY.x = 2.0f * x * y - 2.0f * w * z; + vAxisY.y = 1.0f - 2.0f * x * x - 2.0f * z * z; + vAxisY.z = 2.0f * y * z + 2.0f * w * x; + return vAxisY; +} + + + +const Vector3D Quaternion::GetUp()const +{ + Vector3D vAxisZ; + vAxisZ.x = 2.0f * x * z + 2.0f * w * y; + vAxisZ.y = 2.0f * y * z - 2.0f * w * x; + vAxisZ.z = 1.0f - 2.0f * x * x - 2.0f * y * y; + return vAxisZ; +} + + + +const Quaternion RotateBetween(const Vector3D& v1, const Vector3D& v2) +{ + // Find quaternion that rotates v1 into v2 + Quaternion qOut; + + Vector3D vBisector = 0.5f * (v1 + v2); + if (vBisector.LengthSqr() > 1e-9f) + { + qOut.Init(CrossProduct(v1, vBisector), DotProduct(v1, vBisector)); + } + else + { + // Anti-parallel: Use a perpendicular vector + if (fabsf(v1.x) > 0.5f) + { + qOut.x = v1.y; + qOut.y = -v1.x; + qOut.z = 0.0f; + } + else + { + qOut.x = 0.0f; + qOut.y = v1.z; + qOut.z = -v1.y; + } + + qOut.w = 0.0f; + } + + // The algorithm is simplified and made more accurate by normalizing at the end + QuaternionNormalize(qOut); + + Assert((VectorTransform(v1, QuaternionMatrix(qOut)) - v2).Length() < 2e-3f); + + return qOut; +} + + +void UnitTestQuatExpLog() +{ + for (int i = 0; i < 300000; ++i) + { + Quaternion q = RandomQuaternion(); + Vector3D l = QuaternionLog(q); + Quaternion q2 = Exp(l); + Assert(QuaternionLength(q - q2) < 0.0001f); + } +} + + +void UnitTestRotateBetween() +{ + RandomSeed(1); + float flMaxError = 0; + int nMaxError; + for (int i = 0; i < 3000000; ++i) + { + Vector3D u = RandomVectorOnUnitSphere(), v = RandomVectorOnUnitSphere(); + Quaternion q = RotateBetween(u, v); + + float flError = 
(VectorTransform(u, QuaternionMatrix(q)) - v).Length(); + if (flMaxError < flError) + { + flMaxError = flError; + nMaxError = i; + } + } + Assert(flMaxError < 0.001f); +} + + //----------------------------------------------------------------------------- // Purpose: Converts a quaternion into engine angles // Input : *quaternion - q3 + q0.i + q1.j + q2.k @@ -2082,6 +2349,97 @@ void QuaternionAngles(const Quaternion& q, QAngle& angles) Assert(angles.IsValid()); } + +float QuaternionionGetYaw(const Quaternion& q) +{ + // FIXME: doing it this way calculates too much data, need to do an optimized version... + QAngle angles; + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + MatrixAngles(matrix, angles); + return angles[YAW]; +} + +float QuaternionionGetPitch(const Quaternion& q) +{ + // FIXME: doing it this way calculates too much data, need to do an optimized version... + QAngle angles; + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + MatrixAngles(matrix, angles); + return angles[PITCH]; +} + +float QuaternionionGetRoll(const Quaternion& q) +{ + // FIXME: doing it this way calculates too much data, need to do an optimized version... 
+ QAngle angles; + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + MatrixAngles(matrix, angles); + return angles[ROLL]; +} + + +//----------------------------------------------------------------------------- +// Purpose: Converts a quaternion into FLU vectors +// Input : *quaternion - q3 + q0.i + q1.j + q2.k +// basis vectors, each vector is optional +//----------------------------------------------------------------------------- +void QuaternionVectorsFLU(Quaternion const& q, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + Assert(s_bMathlibInitialized); + Assert(q.IsValid()); + +#ifdef _VPROF_MATHLIB + // @TODO: VPROF_BUDGET( "QuaternionVectorsFLU", "Mathlib" ); +#endif + + // Note: it's pretty much identical to just computing the quaternion matrix and assigning its columns to the vectors + * pForward = q.GetForward(); + *pLeft = q.GetLeft(); + *pUp = q.GetUp(); +#ifdef DBGFLAG_ASSERT + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + Vector3D forward, left, up; + MatrixVectorsFLU(matrix, &forward, &left, &up); + Assert((forward - *pForward).Length() + (left - *pLeft).Length() + (up - *pUp).Length() < 1e-4f); +#endif +} + +void QuaternionVectorsForward(const Quaternion& q, Vector3D* pForward) +{ + Assert(s_bMathlibInitialized); + Assert(q.IsValid()); + +#ifdef _VPROF_MATHLIB + // @TODO: VPROF_BUDGET( "QuaternionVectorsForward", "Mathlib" ); +#endif + + * pForward = q.GetForward(); +#ifdef DBGFLAG_ASSERT + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + Assert((MatrixGetColumn(matrix, FORWARD_AXIS) - *pForward).Length() < 1e-4f); +#endif +} + + +void UnitTestVectorFLU() +{ + for (int i = 0; i < 100000; ++i) + { + Quaternion q = RandomQuaternion(); + Vector3D forward, left, up; + QuaternionVectorsForward(q, &forward); + QuaternionVectorsFLU(q, &forward, &left, &up); + } +} + + + +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Converts a quaternion to an axis / angle in degrees 
// (exponential map) @@ -2113,7 +2471,7 @@ void AxisAngleQuaternion(const Vector3D& axis, float angle, Quaternion& q) q.z = axis.z * sa; q.w = ca; } - +#endif // #if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Converts radian-euler axis aligned angles to a quaternion @@ -2158,6 +2516,72 @@ void AngleQuaternion(const RadianEuler& angles, Quaternion& outQuat) outQuat.w = crXcp * cy + srXsp * sy; // W (real component) } +#ifdef _X360 +//----------------------------------------------------------------------------- +// Purpose: Converts radian-euler axis aligned angles to a quaternion, returning +// it on a vector register. +// Input : *vAngles - Right-handed Euler angles in radians (roll pitch yaw) +// +// Algorithm based on that found in the XDK (which really uses RPY order, as +// opposed to this which takes the parameters in RPY order but catenates them +// in PYR order). +//----------------------------------------------------------------------------- +fltx4 AngleQuaternionSIMD(FLTX4 vAngles) +{ + Assert(s_bMathlibInitialized); + // Assert( angles.IsValid() ); + +#ifdef _VPROF_MATHLIB + VPROF_BUDGET("AngleQuaternion", "Mathlib"); +#endif + + // we compute the sin and cos of half all the angles. + // in the comments I'll call these components + // sr = sin(r/2), cp = cos(p/2), sy = sin(y/2), etc. 
+ + fltx4 OneHalf = __vspltisw(1); + OneHalf = __vcfsx(OneHalf, 1); + + fltx4 HalfAngles = MulSIMD(vAngles, OneHalf); + fltx4 sine, cosine; + SinCos3SIMD(sine, cosine, HalfAngles); + + fltx4 SignMask = __vspltisw(-1); + fltx4 Zero = __vspltisw(0); + SignMask = __vslw(SignMask, SignMask); // shift left so 1 is only in the sign bit + SignMask = __vrlimi(SignMask, Zero, 0x5, 0); // { -1, 0, -1, 0 } + + fltx4 Rc, Pc, Yc, Rs, Ps, Ys, retsum, retval; + + Rc = __vspltw(cosine, 0); // cr cr cr cr + Pc = __vspltw(cosine, 1); // cp cp cp cp + Yc = __vspltw(cosine, 2); // cy cy cy cy + Rs = __vspltw(sine, 0); // sr sr sr sr + Ps = __vspltw(sine, 1); // sp sp sp sp + Ys = __vspltw(sine, 2); // sy sy sy sy + + Rc = __vrlimi(Rc, sine, 0x8, 0); // sr cr cr cr + Rs = __vrlimi(Rs, cosine, 0x8, 0); // cr sr sr sr + Pc = __vrlimi(Pc, sine, 0x4, 0); // cp sp cp cp + Ps = __vrlimi(Ps, cosine, 0x4, 0); // sp cp sp sp + Yc = __vrlimi(Yc, sine, 0x2, 0); // cy cy sy cy + Ys = __vrlimi(Ys, cosine, 0x2, 0); // sy sy cy sy + + retsum = __vxor(Rs, SignMask); // -cr sr -sr sr + retval = __vmulfp(Pc, Yc); // cp*cy sp*cy cp*sy cp*cy + retsum = __vmulfp(retsum, Ys); // -cr*sy sr*sy -sr*cy sr*sy + retval = __vmulfp(retval, Rc); // cp*cy*sr sp*cy*cr cp*sy*cr cp*cy*cr + retval = __vmaddfp(retsum, Ps, retval); // cp*cy*sr + -cr*sy*sp ... 
+ + return retval; +} + +inline fltx4 AngleQuaternionSIMD(const RadianEuler& angles) +{ + return AngleQuaternionSIMD(LoadUnaligned3SIMD(angles.Base())); +} +#endif + //----------------------------------------------------------------------------- // Purpose: Converts engine-format euler angles to a quaternion @@ -2202,7 +2626,7 @@ void AngleQuaternion(const QAngle& angles, Quaternion& outQuat) outQuat.w = crXcp * cy + srXsp * sy; // W (real component) } - +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Converts a basis to a quaternion //----------------------------------------------------------------------------- @@ -2288,7 +2712,116 @@ void MatrixQuaternion(const matrix3x4_t& mat, Quaternion& q) MatrixAngles(mat, angles); AngleQuaternion(angles, q); } +#endif // #if !defined(__SPU__) +void MatrixQuaternionFast(const matrix3x4_t& mat, Quaternion& q) +{ + float t; + if (mat[2][2] < 0) + { + if (mat[0][0] > mat[1][1]) + { + t = 1 + mat[0][0] - mat[1][1] - mat[2][2]; + q.Init(t, mat[0][1] + mat[1][0], mat[2][0] + mat[0][2], mat[2][1] - mat[1][2]); + } + else + { + t = 1 - mat[0][0] + mat[1][1] - mat[2][2]; + q.Init(mat[0][1] + mat[1][0], t, mat[1][2] + mat[2][1], mat[0][2] - mat[2][0]); + } + } + else + { + if (mat[0][0] < -mat[1][1]) + { + t = 1 - mat[0][0] - mat[1][1] + mat[2][2]; + q.Init(mat[2][0] + mat[0][2], mat[1][2] + mat[2][1], t, mat[1][0] - mat[0][1]); + } + else + { + t = 1 + mat[0][0] + mat[1][1] + mat[2][2]; + q.Init(mat[2][1] - mat[1][2], mat[0][2] - mat[2][0], mat[1][0] - mat[0][1], t); + } + } + q = q * (0.5f / sqrtf(t)); +} + + +float MatrixQuaternionTest(uint nCount) +{ + float flMaxError = 0, flSumError = 0; + for (uint i = 0; i < nCount; ++i) + { + Quaternion q = RandomQuaternion(), r; + Assert(fabsf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w - 1) < 1e-5f); + matrix3x4_t mat; + QuaternionMatrix(q, mat); + MatrixQuaternion(mat, r); + if (QuaternionDotProduct(q, r) < 0) + { + r = 
-r; + } + float flError = Sqr(q.x - r.x) + Sqr(q.y - r.y) + Sqr(q.z - r.z) + Sqr(q.w - r.w); + flSumError += flError; + if (flError > flMaxError) + { + flMaxError = flError; + } + } + NOTE_UNUSED(flMaxError); NOTE_UNUSED(flSumError); + return flSumError / nCount; +} + +float MatrixQuaternionFastTest(uint nCount) +{ + float flMaxError = 0, flSumError = 0; + for (uint i = 0; i < nCount; ++i) + { + Quaternion q = RandomQuaternion(), r; + Assert(fabsf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w - 1) < 1e-5f); + matrix3x4_t mat; + QuaternionMatrix(q, mat); + MatrixQuaternionFast(mat, r); + if (QuaternionDotProduct(q, r) < 0) + { + r = -r; + } + float flError = Sqr(q.x - r.x) + Sqr(q.y - r.y) + Sqr(q.z - r.z) + Sqr(q.w - r.w); + flSumError += flError; + if (flError > flMaxError) + { + flMaxError = flError; + } + } + NOTE_UNUSED(flMaxError); NOTE_UNUSED(flSumError); + return flSumError / nCount; +} + +// the same as MatrixQuaternionTest, but uses inline helper functions that return matrix and quaternion instead of using return-by-reference versions +// on MSVC10, this generates the same code as MatrixQuaternionTest, but it's easier to read, write and maintain code +float MatrixQuaternionTest2(uint nCount) +{ + float flMaxError = 0, flSumError = 0; + for (uint i = 0; i < nCount; ++i) + { + Quaternion q = RandomQuaternion(), r; + Assert(fabsf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w - 1) < 1e-5f); + matrix3x4_t mat = QuaternionMatrix(q); + r = MatrixQuaternion(mat); + if (QuaternionDotProduct(q, r) < 0) + { + r = -r; + } + float flError = Sqr(q.x - r.x) + Sqr(q.y - r.y) + Sqr(q.z - r.z) + Sqr(q.w - r.w); + flSumError += flError; + if (flError > flMaxError) + { + flMaxError = flError; + } + } + NOTE_UNUSED(flMaxError); NOTE_UNUSED(flSumError); + return flSumError / nCount; +} //----------------------------------------------------------------------------- // Purpose: Converts a quaternion into engine angles @@ -2308,6 +2841,7 @@ void QuaternionAngles(const Quaternion& 
q, RadianEuler& angles) Assert(angles.IsValid()); } +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: A helper function to normalize p2.x->p1.x and p3.x->p4.x to // be the same length as p2.x->p3.x @@ -2342,7 +2876,9 @@ void Spline_Normalize( } } } +#endif // #if !defined(__SPU__) +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: // Input : @@ -2546,6 +3082,7 @@ void Catmull_Rom_Spline_NormalizeX( Catmull_Rom_Spline(p1n, p2, p3, p4n, t, output); } +#endif // !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: basic hermite spline. t = 0 returns p1, t = 1 returns p2, @@ -2626,8 +3163,10 @@ void Hermite_SplineBasis(float t, float basis[4]) // Input : //----------------------------------------------------------------------------- -// BUG: the Vector3DSubtract()'s calls go away if the global optimizer is enabled +// BUG: the VectorSubtract()'s calls go away if the global optimizer is enabled +#if !defined(__SPU__) #pragma optimize( "g", off ) +#endif void Hermite_Spline(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, float t, Vector3D& output) { @@ -2637,7 +3176,9 @@ void Hermite_Spline(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, Hermite_Spline(p1, p2, e10, e21, t, output); } +#if !defined(__SPU__) #pragma optimize( "", on ) +#endif float Hermite_Spline(float p0, float p1, float p2, float t) { @@ -2662,6 +3203,8 @@ void Hermite_Spline(const Quaternion& q0, const Quaternion& q1, const Quaternion QuaternionNormalize(output); } + +#if !defined(__SPU__) // See http://en.wikipedia.org/wiki/Kochanek-Bartels_curves // // Tension: -1 = Round -> 1 = Tight @@ -2961,6 +3504,33 @@ void Parabolic_Spline_NormalizeX( Parabolic_Spline(p1n, p2, p3, p4n, t, output); } +//----------------------------------------------------------------------------- +// Cubic Bernstein basis 
functions +// http://mathworld.wolfram.com/BernsteinPolynomial.html +// +// Purpose: Evaluate the cubic Bernstein basis for the input parametric coordinate. +// Output is the coefficient for that basis polynomial. +//----------------------------------------------------------------------------- +float CubicBasis0(float t) +{ + float invT = 1.0f - t; + return invT * invT * invT; +} +float CubicBasis1(float t) +{ + float invT = 1.0f - t; + return 3.0f * t * invT * invT; +} +float CubicBasis2(float t) +{ + float invT = 1.0f - t; + return 3.0f * t * t * invT; +} +float CubicBasis3(float t) +{ + return t * t * t; +} + //----------------------------------------------------------------------------- // Purpose: Compress the input values for a ranged result such that from 75% to 200% smoothly of the range maps //----------------------------------------------------------------------------- @@ -3033,6 +3603,8 @@ void TransformAABB(const matrix3x4_t& transform, const Vector3D& vecMinsIn, cons VectorSubtract(worldCenter, worldExtents, vecMinsOut); VectorAdd(worldCenter, worldExtents, vecMaxsOut); + // sanity chec + Assert(vecMinsOut.LengthSqr() + vecMaxsOut.LengthSqr() < 1e+12); } @@ -3246,7 +3818,7 @@ void CalcClosestPointOnLineSegment(const Vector3D& P, const Vector3D& vLineA, co { Vector3D vDir; float t = CalcClosestPointToLineT(P, vLineA, vLineB, vDir); - t = clamp(t, 0.f, 1.f); + t = clamp(static_cast(t), 0, 1); if (outT) { *outT = t; @@ -3318,7 +3890,7 @@ void CalcClosestPointOnLineSegment2D(const Vector2D& P, const Vector2D& vLineA, { Vector2D vDir; float t = CalcClosestPointToLineT2D(P, vLineA, vLineB, vDir); - t = clamp(t, 0.f, 1.f); + t = clamp(static_cast(t), 0, 1); if (outT) { *outT = t; @@ -3393,12 +3965,15 @@ bool CalcLineToLineIntersectionSegment( *t1 = numer / denom; *t2 = (d1343 + d4321 * (*t1)) / d4343; - s1->x = p1.x + *t1 * p21.x; - s1->y = p1.y + *t1 * p21.y; - s1->z = p1.z + *t1 * p21.z; - s2->x = p3.x + *t2 * p43.x; - s2->y = p3.y + *t2 * p43.y; - s2->z = 
p3.z + *t2 * p43.z; + if (s1 != NULL && s2 != NULL) + { + s1->x = p1.x + *t1 * p21.x; + s1->y = p1.y + *t1 * p21.y; + s1->z = p1.z + *t1 * p21.z; + s2->x = p3.x + *t2 * p43.x; + s2->y = p3.y + *t2 * p43.y; + s2->z = p3.z + *t2 * p43.z; + } return true; } @@ -3411,132 +3986,67 @@ bool CalcLineToLineIntersectionSegment( #pragma optimize( "", on ) -static bool s_b3DNowEnabled = false; -static bool s_bMMXEnabled = false; -static bool s_bSSEEnabled = false; -static bool s_bSSE2Enabled = false; + +#ifndef NDEBUG +volatile static char const* pDebugString; +#endif void MathLib_Init(float gamma, float texGamma, float brightness, int overbright, bool bAllow3DNow, bool bAllowSSE, bool bAllowSSE2, bool bAllowMMX) { if (s_bMathlibInitialized) return; +#ifdef _WIN32 + Assert(_rotl(0xC7654321, 1) == 0x8ECA8643); + Assert(_rotl64(0xC7654321ABCDEF00ull, 1) == 0x8ECA8643579BDE01ull); +#endif +#ifndef NDEBUG + pDebugString = "mathlib.lib built debug!"; +#endif - // FIXME: Hook SSE into Vector3DAligned + Vector3D4DAligned + // FIXME: Hook SSE into VectorAligned + Vector4DAligned -#if !defined( _X360 ) +#if !defined( _GAMECONSOLE ) // Grab the processor information: const CPUInformation& pi = GetCPUInformation(); - // Select the default generic routines. 
- pfSqrt = _sqrtf; - pfRSqrt = _rsqrtf; - pfRSqrtFast = _rsqrtf; - pfVectorNormalize = _VectorNormalize; - pfVectorNormalizeFast = _VectorNormalizeFast; - pfInvRSquared = _InvRSquared; - pfFastSinCos = SinCos; - pfFastCos = cosf; + if (!(pi.m_bSSE && pi.m_bSSE2)) + { + Assert(0); + if (MessageBoxA(NULL, "SSE and SSE2 are required.", "Unsupported CPU", MB_ICONERROR | MB_OK)) + { + TerminateProcess(GetCurrentProcess(), 0xBAD0C0DE); + } + } +#endif //!360 - if (bAllowMMX && pi.m_bMMX) - { - // Select the MMX specific routines if available - // (MMX routines were used by SW span fillers - not currently used for HW) - s_bMMXEnabled = true; - } - else - { - s_bMMXEnabled = false; - } - - // SSE Generally performs better than 3DNow when present, so this is placed - // first to allow SSE to override these settings. -#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX) - if (bAllow3DNow && pi.m_b3DNow) - { - s_b3DNowEnabled = true; - - // Select the 3DNow specific routines if available; - pfVector3DNormalize = _3DNow_Vector3DNormalize; - pfVector3DNormalizeFast = _3DNow_Vector3DNormalizeFast; - pfInvRSquared = _3DNow_InvRSquared; - pfSqrt = _3DNow_Sqrt; - pfRSqrt = _3DNow_RSqrt; - pfRSqrtFast = _3DNow_RSqrt; - } - else -#endif - { - s_b3DNowEnabled = false; - } - - if (bAllowSSE && pi.m_bSSE) - { - s_bSSEEnabled = true; - -#ifndef PLATFORM_WINDOWS_PC64 - // These are not yet available. 
- // Select the SSE specific routines if available - pfVector3DNormalize = _Vector3DNormalize; - pfVector3DNormalizeFast = _SSE_Vector3DNormalizeFast; - pfInvRSquared = _SSE_InvRSquared; - pfSqrt = _SSE_Sqrt; - pfRSqrt = _SSE_RSqrtAccurate; - pfRSqrtFast = _SSE_RSqrtFast; -#endif -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE_SinCos; - pfFastCos = _SSE_cos; -#endif - } - else - { - s_bSSEEnabled = false; - } - - if (bAllowSSE2 && pi.m_bSSE2) - { - s_bSSE2Enabled = true; -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE2_SinCos; - pfFastCos = _SSE2_cos; -#endif - } - else - { - s_bSSE2Enabled = false; - } -#endif // !_X360 s_bMathlibInitialized = true; InitSinCosTable(); BuildGammaTable(gamma, texGamma, brightness, overbright); + SeedRandSIMD(0x31415926); } -bool MathLib_3DNowEnabled(void) -{ - Assert(s_bMathlibInitialized); - return s_b3DNowEnabled; -} bool MathLib_MMXEnabled(void) { Assert(s_bMathlibInitialized); - return s_bMMXEnabled; + return true; } bool MathLib_SSEEnabled(void) { Assert(s_bMathlibInitialized); - return s_bSSEEnabled; + return true; } bool MathLib_SSE2Enabled(void) { Assert(s_bMathlibInitialized); - return s_bSSE2Enabled; + return true; } + // BUGBUG: Why doesn't this call angle diff?!?!? 
float ApproachAngle(float target, float value, float speed) { @@ -3662,6 +4172,34 @@ void RotationDelta(const QAngle& srcAngles, const QAngle& destAngles, QAngle* ou } } +void ClipLineSegmentToPlane(const Vector3D& vNormal, const Vector3D& vPlanePoint, Vector3D* p1, Vector3D* p2, float flBias) +{ + float flDot1, flDot2; + flDot1 = (*p1 - vPlanePoint).Dot(vNormal) + flBias; + flDot2 = (*p2 - vPlanePoint).Dot(vNormal) + flBias; + + if (flDot1 >= 0 && flDot2 >= 0) + { + return; + } + + if (flDot1 >= 0) + { + Vector3D vRay = *p2 - *p1; + *p2 = *p1 + vRay * flDot1 / (flDot1 - flDot2); + } + else if (flDot2 >= 0) + { + Vector3D vRay = *p1 - *p2; + *p1 = *p2 + vRay * flDot2 / (flDot2 - flDot1); + } + else + { + *p1 = vec3_invalid; + *p2 = vec3_invalid; + } +} + //----------------------------------------------------------------------------- // Purpose: Computes a triangle normal //----------------------------------------------------------------------------- @@ -3675,14 +4213,49 @@ void ComputeTrianglePlane(const Vector3D& v1, const Vector3D& v2, const Vector3D intercept = DotProduct(normal, v1); } +//----------------------------------------------------------------------------- +// Purpose: Calculate the volume of a tetrahedron with these vertices +// Input : p0 - points of tetrahedron +// p1 - +// p2 - +// p3 - +// Output : float (volume in units^3) +//----------------------------------------------------------------------------- +float TetrahedronVolume(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3) +{ + Vector3D a, b, c, cross; + float volume = 1.0f / 6.0f; + + a = p1 - p0; + b = p2 - p0; + c = p3 - p0; + cross = CrossProduct(b, c); + + volume *= DotProduct(a, cross); + if (volume < 0) + return -volume; + return volume; +} + + +// computes the area of a triangle given three verts +float TriangleArea(const Vector3D& v0, const Vector3D& v1, const Vector3D& v2) +{ + Vector3D vecEdge0, vecEdge1, vecCross; + VectorSubtract(v1, v0, vecEdge0); + 
VectorSubtract(v2, v0, vecEdge1); + CrossProduct(vecEdge0, vecEdge1, vecCross); + return (VectorLength(vecCross) * 0.5f); +} + //----------------------------------------------------------------------------- // Purpose: This is a clone of BaseWindingForPlane() -// Input : *outVerts - an array of preallocated verts to build the polygon in +// Input : *pOutVerts - an array of preallocated verts to build the polygon in // normal - the plane normal // dist - the plane constant // Output : int - vert count (always 4) //----------------------------------------------------------------------------- -int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float fHalfScale) +int PolyFromPlane(Vector3D* pOutVerts, const Vector3D& normal, float dist, float fHalfScale) { int i, x; vec_t max, v; @@ -3705,7 +4278,7 @@ int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float if (x == -1) return 0; - // Build a unit Vector3D along something other than the major axis + // Build a unit vector along something other than the major axis VectorCopy(vec3_origin, vup); switch (x) { @@ -3718,7 +4291,7 @@ int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float break; } - // Remove the component of this Vector3D along the normal + // Remove the component of this vector along the normal v = DotProduct(vup, normal); VectorMA(vup, -v, normal, vup); // Make it a unit (perpendicular) @@ -3726,30 +4299,80 @@ int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float // Center of the poly is at normal * dist VectorScale(normal, dist, org); - // Calculate the third orthonormal basis Vector3D for our plane space (this one and vup are in the plane) + // Calculate the third orthonormal basis vector for our plane space (this one and vup are in the plane) CrossProduct(vup, normal, vright); - // Make the plane's basis Vector3Ds big (these are the half-sides of the polygon we're making) + // Make the plane's basis vectors 
big (these are the half-sides of the polygon we're making) VectorScale(vup, fHalfScale, vup); VectorScale(vright, fHalfScale, vright); // Move diagonally away from org to create the corner verts - VectorSubtract(org, vright, outVerts[0]); // left - VectorAdd(outVerts[0], vup, outVerts[0]); // up + VectorSubtract(org, vright, pOutVerts[0]); // left + VectorAdd(pOutVerts[0], vup, pOutVerts[0]); // up - VectorAdd(org, vright, outVerts[1]); // right - VectorAdd(outVerts[1], vup, outVerts[1]); // up + VectorAdd(org, vright, pOutVerts[1]); // right + VectorAdd(pOutVerts[1], vup, pOutVerts[1]); // up - VectorAdd(org, vright, outVerts[2]); // right - VectorSubtract(outVerts[2], vup, outVerts[2]); // down + VectorAdd(org, vright, pOutVerts[2]); // right + VectorSubtract(pOutVerts[2], vup, pOutVerts[2]); // down - VectorSubtract(org, vright, outVerts[3]); // left - VectorSubtract(outVerts[3], vup, outVerts[3]); // down + VectorSubtract(org, vright, pOutVerts[3]); // left + VectorSubtract(pOutVerts[3], vup, pOutVerts[3]); // down // The four corners form a planar quadrilateral normal to "normal" return 4; } +// Returns void as it was impossible for the function to returns anything other than 4. +// Any absolute of a floating value will always return a number greater than -16384. That test seemed bogus. +void PolyFromPlane_SIMD(fltx4* pOutVerts, const fltx4& plane, float fHalfScale) +{ + // So we need to find the biggest component of all three, + // And depending of the value, we need to build a unit vector along something that is not the major axis. + + fltx4 f4Abs = AbsSIMD(plane); + fltx4 x = SplatXSIMD(f4Abs); + fltx4 y = SplatYSIMD(f4Abs); + fltx4 z = SplatZSIMD(f4Abs); + fltx4 max = MaxSIMD(x, y); + max = MaxSIMD(max, z); + + // Simplify the code, if Z is the biggest component, we will use 1 0 0. + // If X or Y are the biggest, we will use 0 0 1. 
+ bi32x4 fIsMax = CmpEqSIMD(max, f4Abs); // isMax will be set for the components that are the max + fltx4 fIsZMax = SplatZSIMD((fltx4)fIsMax); // 0 if Z is not the max, 0xffffffff is Z is the max + // And depending if Z is max or not, we are going to select one unit vector or the other + fltx4 vup = MaskedAssign((bi32x4)fIsZMax, g_SIMD_Identity[0], g_SIMD_Identity[2]); + + fltx4 normal = SetWToZeroSIMD(plane); + fltx4 dist = SplatWSIMD(plane); + + // Remove the component of this vector along the normal + fltx4 v = Dot3SIMD(vup, normal); + vup = MaddSIMD(-v, normal, vup); + // Make it a unit (perpendicular) + vup = Normalized3SIMD(vup); + + // Center of the poly is at normal * dist + fltx4 org = MulSIMD(dist, normal); + // Calculate the third orthonormal basis vector for our plane space (this one and vup are in the plane) + fltx4 vright = CrossProductSIMD(vup, normal); + + // Make the plane's basis vectors big (these are the half-sides of the polygon we're making) + fltx4 f4HalfScale = ReplicateX4(fHalfScale); + vup = MulSIMD(f4HalfScale, vup); + vright = MulSIMD(f4HalfScale, vright); + + // Move diagonally away from org to create the corner verts + fltx4 vleft = SubSIMD(org, vright); + vright = AddSIMD(org, vright); + + pOutVerts[0] = AddSIMD(vleft, vup); // left + up + pOutVerts[1] = AddSIMD(vright, vup); // right + up + pOutVerts[2] = SubSIMD(vright, vup); // right + down + pOutVerts[3] = SubSIMD(vleft, vup); // left + down +} + //----------------------------------------------------------------------------- // Purpose: clip a poly to the plane and return the poly on the front side of the plane // Input : *inVerts - input polygon @@ -3849,6 +4472,119 @@ int ClipPolyToPlane(Vector3D* inVerts, int vertCount, Vector3D* outVerts, const return outCount; } +int ClipPolyToPlane_SIMD(fltx4* pInVerts, int nVertCount, fltx4* pOutVerts, const fltx4& plane, float fOnPlaneEpsilon) +{ + vec_t* dists = (vec_t*)stackalloc(sizeof(vec_t) * nVertCount * 4); //4* nVertCount should 
cover all cases + uint8* sides = (uint8*)stackalloc(sizeof(uint8) * nVertCount * 4); + int i; + + /* + * It seems something could be done here... Especially in relation with the code below i, i + 1, etc... + fltx4 f4OnPlaneEpsilonP = ReplicateX4( fOnPlaneEpsilon ); + fltx4 f4OnPlaneEpsilonM = -f4OnPlaneEpsilonP; + Also we could store the full fltx4 instead of a single float. It would avoid doing a SubFloat() here, + and a ReplicateX4() later. Trading off potential LHS against L2 cache misses? + */ + // determine sides for each point + int nAllSides = 0; + fltx4 f4Dist = SplatWSIMD(plane); + for (i = 0; i < nVertCount; i++) + { + // dot = DotProduct( pInVerts[i], normal) - dist; + fltx4 dot = Dot3SIMD(pInVerts[i], plane); + dot = SubSIMD(dot, f4Dist); + float fDot = SubFloat(dot, 0); + dists[i] = fDot; + // Look how to update sides with a branch-less version + int nSide = OR_SIDE_ON; + if (fDot > fOnPlaneEpsilon) + { + nSide = OR_SIDE_FRONT; + } + else if (fDot < -fOnPlaneEpsilon) + { + nSide = OR_SIDE_BACK; + } + sides[i] = nSide; + nAllSides |= nSide; + } + sides[i] = sides[0]; + dists[i] = dists[0]; + + // Shortcuts (either completely clipped or not clipped at all) + if ((nAllSides & OR_SIDE_FRONT) == 0) + { + return 0; // Completely clipped + } + + if ((nAllSides & OR_SIDE_BACK) == 0) + { + // Not clipped at all, copy to output verts + Assert(i == nVertCount); + int nIndex = 0; + while (i >= 4) + { + pOutVerts[nIndex] = pInVerts[nIndex]; + pOutVerts[nIndex + 1] = pInVerts[nIndex + 1]; + pOutVerts[nIndex + 2] = pInVerts[nIndex + 2]; + pOutVerts[nIndex + 3] = pInVerts[nIndex + 3]; + nIndex += 4; + i -= 4; + } + while (i > 0) + { + pOutVerts[nIndex] = pInVerts[nIndex]; + ++nIndex; + --i; + } + return nVertCount; + } + + fltx4 f4one = Four_Ones; + fltx4 f4MOne = -f4one; + + fltx4 f4OneMask = (fltx4)CmpEqSIMD(plane, f4one); + fltx4 f4mOneMask = (fltx4)CmpEqSIMD(plane, f4MOne); + fltx4 f4AllMask = OrSIMD(f4OneMask, f4mOneMask); // 0xffffffff where normal was 1 or -1, 
0 otherwise + f4OneMask = AndSIMD(f4OneMask, f4Dist); // Dist where normal.* was 1 + f4mOneMask = AndSIMD(f4mOneMask, -f4Dist); // -Dist where normal.* was -1 + fltx4 f4AllValue = OrSIMD(f4OneMask, f4mOneMask); // Dist and -Dist where normal.* was 1 and -1 + // f4AllMask and f4AllValue will be used together (to override the default calculation). + + int nOutCount = 0; + for (i = 0; i < nVertCount; i++) + { + const fltx4& p1 = pInVerts[i]; + + if (sides[i] == OR_SIDE_ON) + { + pOutVerts[nOutCount++] = p1; + continue; + } + + if (sides[i] == OR_SIDE_FRONT) + { + pOutVerts[nOutCount++] = p1; + } + + if (sides[i + 1] == OR_SIDE_ON || sides[i + 1] == sides[i]) + continue; + + // generate a split point + fltx4& p2 = pInVerts[(i + 1) % nVertCount]; + + float fDot = dists[i] / (dists[i] - dists[i + 1]); + fltx4 f4Dot = ReplicateX4(fDot); + + // mid[j] = v1[j] + dot*(v2[j]-v1[j]); - For j=0...2 + fltx4 f4Result = MaddSIMD(f4Dot, SubSIMD(p2, p1), p1); + // If normal.* is 1, it should be dist, if -1, it should be -dist, otherwise it should be mid[j] = v1[j] + dot*(v2[j]-v1[j]); + fltx4 mid = MaskedAssign((bi32x4)f4AllMask, f4AllValue, f4Result); + pOutVerts[nOutCount++] = mid; + } + + return nOutCount; +} int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, const double* normal, double dist, double fOnPlaneEpsilon) { @@ -3857,7 +4593,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co int counts[3]; double dot; int i, j; - //Vector3D mid = vec3_origin; + //Vector mid = vec3_origin; double mid[3]; mid[0] = 0.0; mid[1] = 0.0; @@ -3898,7 +4634,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co //for ( i = 0; i < vertCount; i++ ) for (i = 0; i < vertCount * 3; i++) { - //Vector3DCopy( inVerts[i], outVerts[i] ); + //VectorCopy( inVerts[i], outVerts[i] ); outVerts[i] = inVerts[i]; } return vertCount; @@ -3907,7 +4643,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* 
outVerts, co outCount = 0; for (i = 0; i < vertCount; i++) { - //Vector3D& p1 = inVerts[i]; + //Vector& p1 = inVerts[i]; double* p1 = &inVerts[i * 3]; //p1[0] = inVerts[i*3 + 0]; //p1[1] = inVerts[i*3 + 1]; @@ -3915,7 +4651,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co if (sides[i] == SIDE_ON) { - //Vector3DCopy( p1, outVerts[outCount]); + //VectorCopy( p1, outVerts[outCount]); outVerts[outCount * 3 + 0] = p1[0]; outVerts[outCount * 3 + 1] = p1[1]; outVerts[outCount * 3 + 2] = p1[2]; @@ -3925,7 +4661,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co if (sides[i] == SIDE_FRONT) { - //Vector3DCopy( p1, outVerts[outCount]); + //VectorCopy( p1, outVerts[outCount]); outVerts[outCount * 3 + 0] = p1[0]; outVerts[outCount * 3 + 1] = p1[1]; outVerts[outCount * 3 + 2] = p1[2]; @@ -3936,7 +4672,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co continue; // generate a split point - //Vector3D& p2 = inVerts[(i+1)%vertCount]; + //Vector& p2 = inVerts[(i+1)%vertCount]; int wrappedindex = (i + 1) % vertCount; double* p2 = &inVerts[wrappedindex * 3]; //p2[0] = inVerts[wrappedindex*3 + 0]; @@ -3949,7 +4685,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co mid[j] = (double)p1[j] + dot * ((double)p2[j] - (double)p1[j]); } - //Vector3DCopy (mid, outVerts[outCount]); + //VectorCopy (mid, outVerts[outCount]); outVerts[outCount * 3 + 0] = mid[0]; outVerts[outCount * 3 + 1] = mid[1]; outVerts[outCount * 3 + 2] = mid[2]; @@ -4009,6 +4745,9 @@ float CalcFovX(float flFovY, float flAspect) return RAD2DEG(atan(tan(DEG2RAD(flFovY) * 0.5f) * flAspect)) * 2.0f; } +#endif // !defined(__SPU__) + +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Generate a frustum based on perspective view parameters //----------------------------------------------------------------------------- @@ -4071,36 +4810,533 
@@ void GenerateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const pPlanesOut[FRUSTUM_TOP].Init(-up, -flTop - flIntercept); } +//----------------------------------------------------------------------------- +// Version that accepts angles instead of vectors +//----------------------------------------------------------------------------- +void GeneratePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t& frustum) +{ + VPlane planes[FRUSTUM_NUMPLANES]; + Vector3D vecForward, vecRight, vecUp; + AngleVectors(angles, &vecForward, &vecRight, &vecUp); + float flFovY = CalcFovY(flFovX, flAspectRatio); + GeneratePerspectiveFrustum(origin, vecForward, vecRight, vecUp, flZNear, flZFar, flFovX, flFovY, planes); + frustum.SetPlanes(planes); +} + +void fourplanes_t::ComputeSignbits() +{ + xSign = CmpLtSIMD(nX, Four_Zeros); + ySign = CmpLtSIMD(nY, Four_Zeros); + zSign = CmpLtSIMD(nZ, Four_Zeros); + nXAbs = fabs(nX); + nYAbs = fabs(nY); + nZAbs = fabs(nZ); +} + +void fourplanes_t::GetPlane(int index, Vector3D* pNormalOut, float* pDistOut) const +{ + pNormalOut->x = SubFloat(nX, index); + pNormalOut->y = SubFloat(nY, index); + pNormalOut->z = SubFloat(nZ, index); + *pDistOut = SubFloat(dist, index); +} +void fourplanes_t::SetPlane(int index, const Vector3D& vecNormal, float planeDist) +{ + SubFloat(nX, index) = vecNormal.x; + SubFloat(nY, index) = vecNormal.y; + SubFloat(nZ, index) = vecNormal.z; + SubFloat(dist, index) = planeDist; + ComputeSignbits(); +} + +void fourplanes_t::Set4Planes(const VPlane* pPlanes) +{ + nX = LoadUnalignedSIMD(&pPlanes[0].m_Normal.x); + nY = LoadUnalignedSIMD(&pPlanes[1].m_Normal.x); + nZ = LoadUnalignedSIMD(&pPlanes[2].m_Normal.x); + dist = LoadUnalignedSIMD(&pPlanes[3].m_Normal.x); + TransposeSIMD(nX, nY, nZ, dist); + ComputeSignbits(); +} + +void fourplanes_t::Set2Planes(const VPlane* pPlanes) +{ + nX = LoadUnalignedSIMD(&pPlanes[0].m_Normal.x); + 
nY = LoadUnalignedSIMD(&pPlanes[1].m_Normal.x); + nZ = Four_Zeros; + dist = Four_Zeros; + TransposeSIMD(nX, nY, nZ, dist); + ComputeSignbits(); +} + +void fourplanes_t::Get4Planes(VPlane* pPlanesOut) const +{ + fltx4 p0 = nX; + fltx4 p1 = nY; + fltx4 p2 = nZ; + fltx4 p3 = dist; + TransposeSIMD(p0, p1, p2, p3); + StoreUnalignedSIMD(&pPlanesOut[0].m_Normal.x, p0); + StoreUnalignedSIMD(&pPlanesOut[1].m_Normal.x, p1); + StoreUnalignedSIMD(&pPlanesOut[2].m_Normal.x, p2); + StoreUnalignedSIMD(&pPlanesOut[3].m_Normal.x, p3); +} + +void fourplanes_t::Get2Planes(VPlane* pPlanesOut) const +{ + fltx4 p0 = nX; + fltx4 p1 = nY; + fltx4 p2 = nZ; + fltx4 p3 = dist; + TransposeSIMD(p0, p1, p2, p3); + StoreUnalignedSIMD(&pPlanesOut[0].m_Normal.x, p0); + StoreUnalignedSIMD(&pPlanesOut[1].m_Normal.x, p1); +} + + +Frustum_t::Frustum_t() +{ + memset(this, 0, sizeof(*this)); +} + +void Frustum_t::SetPlane(int i, const Vector3D& vecNormal, float dist) +{ + if (i < 4) + { + planes[0].SetPlane(i, vecNormal, dist); + } + else + { + planes[1].SetPlane(i - 4, vecNormal, dist); + } +} + +void Frustum_t::GetPlane(int i, Vector3D* pNormalOut, float* pDistOut) const +{ + if (i < 4) + { + planes[0].GetPlane(i, pNormalOut, pDistOut); + } + else + { + planes[1].GetPlane(i - 4, pNormalOut, pDistOut); + } +} + +void Frustum_t::SetPlanes(const VPlane* pPlanes) +{ + planes[0].Set4Planes(pPlanes); + planes[1].Set2Planes(pPlanes + 4); +} + +void Frustum_t::GetPlanes(VPlane* pPlanesOut) const +{ + planes[0].Get4Planes(pPlanesOut); + planes[1].Get2Planes(pPlanesOut + 4); +} + + +bool Frustum_t::CullBox(const Vector3D& mins, const Vector3D& maxs) const +{ + fltx4 mins4 = LoadUnalignedSIMD(&mins.x); + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxs4 = LoadUnalignedSIMD(&maxs.x); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest 
corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + +bool Frustum_t::CullBox(const fltx4& mins4, const fltx4& maxs4) const +{ + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? 
mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + +bool Frustum_t::CullBoxCenterExtents(const Vector3D& center, const Vector3D& extents) const +{ + fltx4 center4 = LoadUnalignedSIMD(¢er.x); + fltx4 centerx = SplatXSIMD(center4); + fltx4 centery = SplatYSIMD(center4); + fltx4 centerz = SplatZSIMD(center4); + fltx4 extents4 = LoadUnalignedSIMD(&extents.x); + fltx4 extx = SplatXSIMD(extents4); + fltx4 exty = SplatYSIMD(extents4); + fltx4 extz = SplatZSIMD(extents4); + + // compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + + +bool Frustum_t::CullBoxCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const +{ + fltx4 centerx = SplatXSIMD(fl4Center); + fltx4 centery = SplatYSIMD(fl4Center); + fltx4 centerz = SplatZSIMD(fl4Center); + fltx4 extx = SplatXSIMD(fl4Extents); + fltx4 exty = SplatYSIMD(fl4Extents); + fltx4 extz = SplatZSIMD(fl4Extents); + + // 
compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + +// Return true if this bounding volume is contained in the frustum, false if it is not +// TODO SIMDIFY +bool Frustum_t::Contains(const Vector3D& mins, const Vector3D& maxs) const +{ + // Get box corners + Vector3D vCorners[8]; + vCorners[0] = mins; + vCorners[1] = Vector3D(mins.x, mins.y, maxs.z); + vCorners[2] = Vector3D(mins.x, maxs.y, mins.z); + vCorners[3] = Vector3D(mins.x, maxs.y, maxs.z); + + vCorners[4] = Vector3D(maxs.x, mins.y, mins.z); + vCorners[5] = Vector3D(maxs.x, mins.y, maxs.z); + vCorners[6] = Vector3D(maxs.x, maxs.y, mins.z); + vCorners[7] = maxs; + + + // if we are in with all points, then we are fully in + for (int j = 0; j < FRUSTUM_NUMPLANES; ++j) + { + for (int i = 0; i < 8; ++i) + { + // compute the dot product of the normal and the corner + Vector3D vNormal; + float dist; + GetPlane(i, &vNormal, &dist); + if (DotProduct(vCorners[j], vNormal) <= 0) + { + return false; + } + } + } + + return true; // all pts were inside +} + +// Brute force SAT frustum intersection between two frustums +bool Frustum_t::Intersects(Frustum_t& otherFrustum) const +{ + Vector3D pPointsA[8]; + bool bResult = false; + bResult = GetCorners(pPointsA); + Assert(bResult); + VPlane pPlanesA[FRUSTUM_NUMPLANES]; + GetPlanes(pPlanesA); + + Vector3D pPointsB[8]; + bResult = otherFrustum.GetCorners(pPointsB); + Assert(bResult); + VPlane 
pPlanesB[FRUSTUM_NUMPLANES]; + otherFrustum.GetPlanes(pPlanesB); + + // See if all points in B are on one side of any plane in A + for (int p = 0; p < 6; ++p) + { + bool bPointsOnOutside = true; + for (int i = 0; i < 8; ++i) + { + float flDist = pPlanesA[p].DistTo(pPointsB[i]); + + // If dist is pos, we are not on the outside + if (flDist > 0) + { + bPointsOnOutside = false; + break; + } + } + + // We never hit a negative case, we have a separating axis + if (bPointsOnOutside) + { + return false; + } + } + + // See if all points in A are on one side of any plane in B + for (int p = 0; p < 6; ++p) + { + bool bPointsOnOutside = true; + for (int i = 0; i < 8; ++i) + { + float flDist = pPlanesB[p].DistTo(pPointsA[i]); + + // If dist is pos, we are not on the outside + if (flDist > 0) + { + bPointsOnOutside = false; + break; + } + } + + // We never hit a negative case, we have a separating axis + if (bPointsOnOutside) + { + return false; + } + } + + // They intersect + return true; +} + +// Return true if this bounding volume intersects the frustum, false if it is outside +bool Frustum_t::Intersects(const Vector3D& mins, const Vector3D& maxs) const +{ + fltx4 mins4 = LoadUnalignedSIMD(&mins.x); + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxs4 = LoadUnalignedSIMD(&maxs.x); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? 
mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector3GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + +bool Frustum_t::Intersects(const fltx4& mins4, const fltx4& maxs4) const +{ + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? 
mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector4GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + +bool Frustum_t::IntersectsCenterExtents(const Vector3D& center, const Vector3D& extents) const +{ + fltx4 center4 = LoadUnalignedSIMD(¢er.x); + fltx4 centerx = SplatXSIMD(center4); + fltx4 centery = SplatYSIMD(center4); + fltx4 centerz = SplatZSIMD(center4); + fltx4 extents4 = LoadUnalignedSIMD(&extents.x); + fltx4 extx = SplatXSIMD(extents4); + fltx4 exty = SplatYSIMD(extents4); + fltx4 extz = SplatZSIMD(extents4); + + // compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector4GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) 
+ return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + + +bool Frustum_t::IntersectsCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const +{ + fltx4 centerx = SplatXSIMD(fl4Center); + fltx4 centery = SplatYSIMD(fl4Center); + fltx4 centerz = SplatZSIMD(fl4Center); + fltx4 extx = SplatXSIMD(fl4Extents); + fltx4 exty = SplatYSIMD(fl4Extents); + fltx4 extz = SplatZSIMD(fl4Extents); + + // compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector3GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + +//----------------------------------------------------------------------------- +// Generate a frustum based on orthographic parameters +//----------------------------------------------------------------------------- +void GenerateOrthoFrustumFLU(const Vector3D& origin, const Vector3D& forward, const Vector3D& vLeft, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar, VPlane* pPlanesOut) +{ + // YUP_ACTIVE: FIXME : This is actually producing incorrect planes (see the VectorMA below) + Vector3D vRight = vLeft; + vRight *= -1.0f; + + float 
flIntercept = DotProduct(origin, forward); + + pPlanesOut[FRUSTUM_NEARZ].Init(forward, flZNear + flIntercept); + pPlanesOut[FRUSTUM_FARZ].Init(-forward, -flZFar - flIntercept); + + flIntercept = DotProduct(origin, vRight); + + pPlanesOut[FRUSTUM_RIGHT].Init(-vRight, -flRight - flIntercept); + pPlanesOut[FRUSTUM_LEFT].Init(vRight, flLeft + flIntercept); + + flIntercept = DotProduct(origin, up); + + pPlanesOut[FRUSTUM_BOTTOM].Init(up, flBottom + flIntercept); + pPlanesOut[FRUSTUM_TOP].Init(-up, -flTop - flIntercept); +} + //----------------------------------------------------------------------------- // Generate a frustum based on perspective view parameters //----------------------------------------------------------------------------- -void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, - const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, - float flFovX, float flFovY, Frustum_t& frustum) +void GeneratePerspectiveFrustumFLU(const Vector3D& origin, const Vector3D& forward, + const Vector3D& vLeft, const Vector3D& up, float flZNear, float flZFar, + float flFovX, float flAspect, VPlane* pPlanesOut) { + // YUP_ACTIVE: FIXME : This is actually producing incorrect planes (see the VectorMA below) + Vector3D vRight = vLeft; + vRight *= -1.0f; + float flIntercept = DotProduct(origin, forward); // Setup the near and far planes. 
- frustum.SetPlane(FRUSTUM_FARZ, PLANE_ANYZ, -forward, -flZFar - flIntercept); - frustum.SetPlane(FRUSTUM_NEARZ, PLANE_ANYZ, forward, flZNear + flIntercept); + pPlanesOut[FRUSTUM_FARZ].Init(-forward, -flZFar - flIntercept); + pPlanesOut[FRUSTUM_NEARZ].Init(forward, flZNear + flIntercept); flFovX *= 0.5f; - flFovY *= 0.5f; float flTanX = tan(DEG2RAD(flFovX)); - float flTanY = tan(DEG2RAD(flFovY)); + float flTanY = flTanX / flAspect; // OPTIMIZE: Normalizing these planes is not necessary for culling Vector3D normalPos, normalNeg; - VectorMA(right, flTanX, forward, normalPos); - VectorMA(normalPos, -2.0f, right, normalNeg); + // NOTE: This should be using left and not right to produce correct planes, not changing it quite yet + // because I'm not able to test whether fixing this breaks anything. + VectorMA(vRight, flTanX, forward, normalPos); + VectorMA(normalPos, -2.0f, vRight, normalNeg); VectorNormalize(normalPos); VectorNormalize(normalNeg); - frustum.SetPlane(FRUSTUM_LEFT, PLANE_ANYZ, normalPos, normalPos.Dot(origin)); - frustum.SetPlane(FRUSTUM_RIGHT, PLANE_ANYZ, normalNeg, normalNeg.Dot(origin)); + pPlanesOut[FRUSTUM_LEFT].Init(normalPos, normalPos.Dot(origin)); + pPlanesOut[FRUSTUM_RIGHT].Init(normalNeg, normalNeg.Dot(origin)); VectorMA(up, flTanY, forward, normalPos); VectorMA(normalPos, -2.0f, up, normalNeg); @@ -4108,44 +5344,109 @@ void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, VectorNormalize(normalPos); VectorNormalize(normalNeg); - frustum.SetPlane(FRUSTUM_BOTTOM, PLANE_ANYZ, normalPos, normalPos.Dot(origin)); - frustum.SetPlane(FRUSTUM_TOP, PLANE_ANYZ, normalNeg, normalNeg.Dot(origin)); + pPlanesOut[FRUSTUM_BOTTOM].Init(normalPos, normalPos.Dot(origin)); + pPlanesOut[FRUSTUM_TOP].Init(normalNeg, normalNeg.Dot(origin)); } - -//----------------------------------------------------------------------------- -// Version that accepts angles instead of Vector3Ds 
-//----------------------------------------------------------------------------- -void GeneratePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t& frustum) +// Generate a frustum based on perspective view parameters +void Frustum_t::CreatePerspectiveFrustumFLU(const Vector3D& vOrigin, const Vector3D& vForward, + const Vector3D& vLeft, const Vector3D& vUp, float flZNear, float flZFar, + float flFovX, float flAspect) { - Vector3D vecForward, vecRight, vecUp; - AngleVectors(angles, &vecForward, &vecRight, &vecUp); - float flFovY = CalcFovY(flFovX, flAspectRatio); - GeneratePerspectiveFrustum(origin, vecForward, vecRight, vecUp, flZNear, flZFar, flFovX, flFovY, frustum); + VPlane planes[FRUSTUM_NUMPLANES]; + GeneratePerspectiveFrustumFLU(vOrigin, vForward, vLeft, vUp, flZNear, flZFar, flFovX, flAspect, planes); + SetPlanes(planes); } -bool R_CullBox(const Vector3D& mins, const Vector3D& maxs, const Frustum_t& frustum) +//#ifndef YUP_ACTIVE +void Frustum_t::CreatePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, + const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, + float flFovX, float flAspect) { - return ((BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_LEFT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_TOP)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_NEARZ)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_FARZ)) == 2)); + Vector3D vLeft = right; + vLeft *= -1.0f; + CreatePerspectiveFrustumFLU(origin, forward, vLeft, up, flZNear, flZFar, flFovX, flAspect); } +//#endif -bool R_CullBoxSkipNear(const Vector3D& mins, const Vector3D& maxs, const Frustum_t& frustum) +// Version that accepts angles instead of vectors +void 
Frustum_t::CreatePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio) { - return ((BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_LEFT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_TOP)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_FARZ)) == 2)); + VPlane planes[FRUSTUM_NUMPLANES]; + Vector3D vecForward, vecLeft, vecUp; + AngleVectorsFLU(angles, &vecForward, &vecLeft, &vecUp); + GeneratePerspectiveFrustumFLU(origin, vecForward, vecLeft, vecUp, flZNear, flZFar, flFovX, flAspectRatio, planes); + SetPlanes(planes); } +// Generate a frustum based on orthographic parameters +void Frustum_t::CreateOrthoFrustumFLU(const Vector3D& origin, const Vector3D& forward, const Vector3D& vLeft, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar) +{ + VPlane planes[FRUSTUM_NUMPLANES]; + GenerateOrthoFrustumFLU(origin, forward, vLeft, up, flLeft, flRight, flBottom, flTop, flZNear, flZFar, planes); + SetPlanes(planes); +} + +//#ifndef YUP_ACTIVE +void Frustum_t::CreateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar) +{ + Vector3D vLeft = right; + vLeft *= -1.0f; + CreateOrthoFrustumFLU(origin, forward, vLeft, up, flLeft, flRight, flBottom, flTop, flZNear, flZFar); +} + +// The points returned correspond to the corners of the frustum faces +// Points 0 to 3 correspond to the near face +// Points 4 to 7 correspond to the far face +// Returns points in a face in this order: +// 2--3 +// | | +// 0--1 +bool Frustum_t::GetCorners(Vector3D* pPoints) const +{ + VPlane planes[FRUSTUM_NUMPLANES]; + GetPlanes(planes); + + // Near face + // 
Bottom Left + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_BOTTOM], pPoints[0])) + return false; + + // Bottom right + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_BOTTOM], pPoints[1])) + return false; + + // Upper Left + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_TOP], pPoints[2])) + return false; + + // Upper right + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_TOP], pPoints[3])) + return false; + + // Far face + // Bottom Left + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_BOTTOM], pPoints[4])) + return false; + + // Bottom right + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_BOTTOM], pPoints[5])) + return false; + + // Upper Left + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_TOP], pPoints[6])) + return false; + + // Upper right + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_TOP], pPoints[7])) + return false; + + + return true; +} // NOTE: This routine was taken (and modified) from NVidia's BlinnReflection demo -// Creates basis Vector3Ds, based on a vertex and index list. +// Creates basis vectors, based on a vertex and index list. 
// See the NVidia white paper 'GDC2K PerPixel Lighting' for a description // of how this computation works #define SMALL_FLOAT 1e-12 @@ -4203,10 +5504,10 @@ void CalcTriangleTangentSpace(const Vector3D& p0, const Vector3D& p1, const Vect //----------------------------------------------------------------------------- void RGBtoHSV(const Vector3D& rgb, Vector3D& hsv) { - float flMax = max(rgb.x, rgb.y); - flMax = max(flMax, rgb.z); - float flMin = min(rgb.x, rgb.y); - flMin = min(flMin, rgb.z); + float flMax = MAX(rgb.x, rgb.y); + flMax = MAX(flMax, rgb.z); + float flMin = MIN(rgb.x, rgb.y); + flMin = MIN(flMin, rgb.z); // hsv.z is the value hsv.z = flMax; @@ -4267,7 +5568,7 @@ void HSVtoRGB(const Vector3D& hsv, Vector3D& rgb) hue = 0.0F; } hue /= 60.0F; - int i = hue; // integer part + int i = Float2Int(hue); // integer part float32 f = hue - i; // fractional part float32 p = hsv.z * (1.0F - hsv.y); float32 q = hsv.z * (1.0F - hsv.y * f); @@ -4354,7 +5655,37 @@ void GetInterpolationData(float const* pKnotPositions, return; } -float RandomVector3DInUnitSphere(Vector3D* pVector3D) + +static Vector3D RandomVectorOnUnitSphere(float u, float v) +{ + float flPhi = acos(1 - 2 * u); + float flTheta = 2 * M_PI * v; + + float flSinPhi, flCosPhi; + float flSinTheta, flCosTheta; + SinCos(flPhi, &flSinPhi, &flCosPhi); + SinCos(flTheta, &flSinTheta, &flCosTheta); + + return Vector3D(flSinPhi * flCosTheta, flSinPhi * flSinTheta, flCosPhi); +} + + +Vector3D RandomVectorOnUnitSphere() +{ + // Guarantee uniform random distribution on a sphere + // Graphics gems III contains this algorithm ("Nonuniform random point sets via warping") + float u = RandomFloat(0., 1.); + float v = RandomFloat(0., 1.); + return RandomVectorOnUnitSphere(u, v); +} + + +Vector3D RandomVectorOnUnitSphere(IUniformRandomStream* pRnd) +{ + return RandomVectorOnUnitSphere(pRnd->RandomFloat(), pRnd->RandomFloat()); +} + +float RandomVectorInUnitSphere(Vector3D* pVector) { // Guarantee uniform random distribution 
within a sphere // Graphics gems III contains this algorithm ("Nonuniform random point sets via warping") @@ -4371,13 +5702,34 @@ float RandomVector3DInUnitSphere(Vector3D* pVector3D) SinCos(flPhi, &flSinPhi, &flCosPhi); SinCos(flTheta, &flSinTheta, &flCosTheta); - pVector3D->x = flRadius * flSinPhi * flCosTheta; - pVector3D->y = flRadius * flSinPhi * flSinTheta; - pVector3D->z = flRadius * flCosPhi; + pVector->x = flRadius * flSinPhi * flCosTheta; + pVector->y = flRadius * flSinPhi * flSinTheta; + pVector->z = flRadius * flCosPhi; return flRadius; } -float RandomVector3DInUnitCircle(Vector2D* pVector3D) + +Vector3D RandomVectorInUnitSphere() +{ + Vector3D vOut; + RandomVectorInUnitSphere(&vOut); + return vOut; +} + +Vector3D RandomVectorInUnitSphere(IUniformRandomStream* pRnd) +{ + float w = pRnd->RandomFloat(); + float flRadius = powf(w, 1.0f / 3.0f); + + Vector3D v = RandomVectorOnUnitSphere(pRnd) * flRadius; + + return v; +} + + + + +float RandomVectorInUnitCircle(Vector2D* pVector) { // Guarantee uniform random distribution within a sphere // Graphics gems III contains this algorithm ("Nonuniform random point sets via warping") @@ -4390,68 +5742,96 @@ float RandomVector3DInUnitCircle(Vector2D* pVector3D) float flSinTheta, flCosTheta; SinCos(flTheta, &flSinTheta, &flCosTheta); - pVector3D->x = flRadius * flCosTheta; - pVector3D->y = flRadius * flSinTheta; + pVector->x = flRadius * flCosTheta; + pVector->y = flRadius * flSinTheta; return flRadius; } -#ifdef FP_EXCEPTIONS_ENABLED -#include // For _clearfp and _controlfp_s -#endif -// FPExceptionDisable and FPExceptionEnabler taken from my blog post -// at http://www.altdevblogaday.com/2012/04/20/exceptional-floating-point/ -#ifdef FP_EXCEPTIONS_ENABLED -// These functions are all inlined NOPs if FP_EXCEPTIONS_ENABLED is not defined. -FPExceptionDisabler::FPExceptionDisabler() +const Quaternion RandomQuaternion() { - // Retrieve the current state of the exception flags. This - // must be done before changing them. 
_MCW_EM is a bit - // mask representing all available exception masks. - _controlfp_s(&mOldValues, 0, 0); - // Set all of the exception flags, which suppresses FP - // exceptions on the x87 and SSE units. - _controlfp_s(0, _MCW_EM, _MCW_EM); + // Guarantee uniform distribution within S^3. Found on the internet, looked through the proof very briefly, looks sound enough to tentatively trust it before testing or checking the proof for real. + // http://mathproofs.blogspot.com/2005/05/uniformly-distributed-random-unit.html + float u = RandomFloat(0, 2 * M_PI), flSinU = sinf(u); + float v = acosf(RandomFloat(-1, 1)), flSinV = sinf(v); + float w = 0.5f * (RandomFloat(0, M_PI) + acosf(RandomFloat(0, 1)) + M_PI / 2), flSinW = sinf(w); + return Quaternion(cosf(u), flSinU * cosf(v), flSinU * flSinV * cosf(w), flSinU * flSinV * flSinW); } -FPExceptionDisabler::~FPExceptionDisabler() +const Quaternion RandomQuaternion(IUniformRandomStream* pRnd) { - // Clear any pending FP exceptions. This must be done - // prior to enabling FP exceptions since otherwise there - // may be a 'deferred crash' as soon the exceptions are - // enabled. - _clearfp(); - - // Reset (possibly enabling) the exception status. - _controlfp_s(0, mOldValues, _MCW_EM); + // Guarantee uniform distribution within S^3. Found on the internet, looked through the proof very briefly, looks sound enough to tentatively trust it before testing or checking the proof for real. + // http://mathproofs.blogspot.com/2005/05/uniformly-distributed-random-unit.html + float u = pRnd->RandomFloat(0, 2 * M_PI), flSinU = sinf(u); + float v = acosf(pRnd->RandomFloat(-1, 1)), flSinV = sinf(v); + float w = 0.5f * (pRnd->RandomFloat(0, M_PI) + acosf(pRnd->RandomFloat(0, 1)) + M_PI / 2), flSinW = sinf(w); + return Quaternion(cosf(u), flSinU * cosf(v), flSinU * flSinV * cosf(w), flSinU * flSinV * flSinW); } -// Overflow, divide-by-zero, and invalid-operation are the FP -// exceptions most frequently associated with bugs. 
-FPExceptionEnabler::FPExceptionEnabler(unsigned int enableBits /*= _EM_OVERFLOW | _EM_ZERODIVIDE | _EM_INVALID*/) +// Originally from hammer_mathlib.cpp +// +// Generate the corner points of a box: +// +y _+z +// ^ /| +// | / +// | 3---7 +// /| /| +// / | / | +// 2---6 | +// | 1|--5 +// | / | / +// |/ |/ +// 0---4 --> +x +// +void PointsFromBox(const Vector3D& mins, const Vector3D& maxs, Vector3D* points) { - // Retrieve the current state of the exception flags. This - // must be done before changing them. _MCW_EM is a bit - // mask representing all available exception masks. - _controlfp_s(&mOldValues, 0, 0); + points[0][0] = mins[0]; + points[0][1] = mins[1]; + points[0][2] = mins[2]; - // Make sure no non-exception flags have been specified, - // to avoid accidental changing of rounding modes, etc. - enableBits &= _MCW_EM; + points[1][0] = mins[0]; + points[1][1] = mins[1]; + points[1][2] = maxs[2]; - // Clear any pending FP exceptions. This must be done - // prior to enabling FP exceptions since otherwise there - // may be a 'deferred crash' as soon the exceptions are - // enabled. - _clearfp(); + points[2][0] = mins[0]; + points[2][1] = maxs[1]; + points[2][2] = mins[2]; - // Zero out the specified bits, leaving other bits alone. - _controlfp_s(0, ~enableBits, enableBits); + points[3][0] = mins[0]; + points[3][1] = maxs[1]; + points[3][2] = maxs[2]; + + points[4][0] = maxs[0]; + points[4][1] = mins[1]; + points[4][2] = mins[2]; + + points[5][0] = maxs[0]; + points[5][1] = mins[1]; + points[5][2] = maxs[2]; + + points[6][0] = maxs[0]; + points[6][1] = maxs[1]; + points[6][2] = mins[2]; + + points[7][0] = maxs[0]; + points[7][1] = maxs[1]; + points[7][2] = maxs[2]; } -FPExceptionEnabler::~FPExceptionEnabler() +void BuildTransformedBox(Vector3D* v2, Vector3D const& bbmin, Vector3D const& bbmax, const matrix3x4_t& m) { - // Reset the exception state. 
- _controlfp_s(0, mOldValues, _MCW_EM); + Vector3D v[8]; + PointsFromBox(bbmin, bbmax, v); + + VectorTransform(v[0], m, v2[0]); + VectorTransform(v[1], m, v2[1]); + VectorTransform(v[2], m, v2[2]); + VectorTransform(v[3], m, v2[3]); + VectorTransform(v[4], m, v2[4]); + VectorTransform(v[5], m, v2[5]); + VectorTransform(v[6], m, v2[6]); + VectorTransform(v[7], m, v2[7]); } -#endif + + +#endif // !defined(__SPU__) diff --git a/r5dev/mathlib/noisedata.h b/r5dev/mathlib/noisedata.h index d93b10c1..e1bb0ba6 100644 --- a/r5dev/mathlib/noisedata.h +++ b/r5dev/mathlib/noisedata.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//====== Copyright � 1996-2006, Valve Corporation, All rights reserved. =======// // // Purpose: static data for noise() primitives. // @@ -178,3 +178,132 @@ static float impulse_zcoords[] = { 0.796078,0.615686,0.878431,0.921569,0.631373,0.200000,0.403922,0.462745 }; +static float s_randomGradients[] = { + -0.460087, -0.887463, -0.058594 ,-0.458151, 0.861646, -0.430176 , + -0.930437, 0.316048, -0.195496 ,-0.883558, -0.393287, -0.276550 , + 0.171025, -0.983455, -0.329712 ,-0.033573, -0.941867, -0.994995 , + -0.476492, 0.014764, 0.879150 ,0.834786, -0.454571, 0.348755 ,-0.585801, + -0.782531, -0.338745 ,0.973990, -0.023774, 0.225403 ,-0.989659, + -0.011313, -0.143005 ,0.507109, -0.838016, -0.369141 ,-0.609995, + -0.766277, 0.314087 ,0.429987, 0.599850, -0.843323 ,0.089587, + -0.904071, -0.977783 ,-0.306997, -0.901432, 0.705078 ,0.031606, + 0.994782, -0.950806 ,0.797663, -0.161508, -0.588806 ,0.811569, + -0.505360, 0.339783 ,0.936130, -0.114223, 0.334778 ,0.217280, + -0.970264, 0.440674 ,0.600976, -0.712375, -0.516418 ,0.197935, + 0.979260, 0.213501 ,0.002956, 0.999995, -0.268127 ,-0.912763, 0.084651, + -0.401062 ,-0.193271, -0.945607, -0.804382 ,0.662480, 0.640156, + -0.506348 ,0.363459, -0.884439, 0.627197 ,-0.433415, 0.685363, + 0.803589 ,-0.721652, 0.416952, -0.607971 ,0.647676, 0.296700, + 0.734863 
,0.723040, -0.444294, 0.590454 ,-0.716318, -0.420435, + -0.613770 ,-0.039076, -0.996459, 0.885437 ,0.175225, -0.969092, + 0.703918 ,0.116952, -0.991832, -0.399048 ,-0.504674, -0.013997, + 0.863281 ,-0.436364, -0.817916, 0.651733 ,0.098030, -0.995090, + 0.137573 ,0.637157, -0.766031, -0.132263 ,-0.594718, 0.583153, + -0.681213 ,-0.625632, 0.419913, -0.724426 ,-0.607341, -0.394521, + 0.750427 ,-0.312161, 0.698925, 0.899719 ,0.101228, -0.927363, + -0.962708 ,-0.934241, 0.041214, -0.354553 ,-0.826005, -0.284775, + -0.507446 ,-0.363751, -0.929287, -0.173584 ,-0.141266, 0.983869, + -0.613525 ,-0.436139, -0.074329, 0.899292 ,-0.875355, -0.480839, + 0.057556 ,0.250714, 0.071270, 0.967896 ,0.182131, 0.811467, 0.950195 , + -0.687696, -0.668570, -0.380554 ,0.785175, -0.540171, -0.359863 , + 0.399774, 0.848526, 0.655151 ,-0.412243, -0.004602, 0.911072 ,-0.132187, + -0.990485, 0.278198 ,0.212421, 0.764179, 0.944214 ,-0.694878, 0.234042, + -0.699402 ,0.404273, 0.904644, -0.316406 ,0.358393, 0.087135, + 0.933044 ,-0.473398, 0.820774, -0.559692 ,0.044667, -0.997938, + 0.718201 ,0.603896, -0.046386, 0.796570 ,-0.968822, 0.180966, + 0.172058 ,-0.458206, 0.886932, -0.126221 ,-0.656709, -0.410319, + 0.693848 ,0.999495, -0.018023, 0.026184 ,-0.486069, -0.740178, + -0.690979 ,0.942399, -0.333819, 0.022461 ,-0.294545, 0.867619, + 0.805664 ,0.886791, -0.416081, -0.221252 ,-0.797187, 0.587661, + -0.171021 ,-0.617708, -0.762817, -0.295654 ,0.449351, -0.853660, + -0.505615 ,0.065153, -0.995535, 0.723572 ,0.996518, 0.000000, + 0.083374 ,0.263346, 0.088663, -0.964417 ,-0.221316, -0.970864, + 0.383423 ,-0.512560, 0.718804, 0.675598 ,0.588859, 0.406293, + -0.764648 ,-0.803841, -0.592769, -0.061646 ,0.860199, 0.492898, + -0.150330 ,-0.351871, 0.858024, 0.728455 ,0.515724, -0.815149, + 0.455322 ,-0.122322, -0.960484, 0.898254 ,-0.529020, 0.844443, + -0.156799 ,0.530671, -0.725304, 0.637024 ,-0.748915, -0.248928, + -0.634094 ,-0.188099, 0.584087, 0.972778 ,0.974165, 0.222094, + -0.041992 ,0.595326, 
-0.701663, -0.549438 ,-0.060279, -0.998047, + -0.262451 ,-0.191682, -0.782292, -0.951477 ,0.528851, -0.596315, + 0.752319 ,0.612134, 0.639567, -0.604919 ,0.882803, 0.200541, 0.433594 , + -0.936278, -0.039490, 0.349304 ,0.940848, -0.121649, 0.318604 , + -0.115022, 0.048685, -0.993347 ,-0.324162, -0.935726, -0.394226 , + -0.937457, -0.294685, 0.193909 ,0.894463, -0.437237, 0.104065 , + -0.861852, -0.165102, -0.486206 ,-0.980480, -0.139899, 0.139526 , + -0.024496, 0.960750, -0.996094 ,-0.699760, 0.714256, -0.018860 , + 0.538575, -0.792107, 0.470581 ,0.309926, -0.943720, 0.349182 ,0.525671, + -0.772280, 0.561523 ,-0.793079, 0.268745, 0.567505 ,0.697504, + -0.421131, 0.639221 ,-0.737871, 0.672553, -0.076660 ,-0.390769, + -0.894942, -0.482666 ,-0.593469, 0.191892, 0.796448 ,0.439379, + -0.896646, 0.123108 ,0.337698, -0.703709, -0.879822 ,-0.654687, + 0.749517, 0.148071 ,-0.482070, -0.700569, 0.737305 ,0.626971, 0.761948, + -0.250610 ,0.616585, 0.015339, -0.787231 ,-0.175877, -0.982000, + 0.364624 ,0.891483, -0.324585, -0.334167 ,0.858029, 0.438272, + -0.297913 ,0.949369, 0.258757, 0.184448 ,0.105948, -0.901183, + 0.969666 ,-0.261581, 0.943276, -0.615845 ,-0.682063, -0.528339, + -0.595520 ,-0.810856, 0.514103, -0.326050 ,-0.163757, 0.986118, + 0.165527 ,-0.595927, -0.221907, 0.791504 ,-0.160374, -0.977354, + 0.652405 ,-0.428837, 0.641628, -0.829102 ,-0.634149, -0.486378, + -0.687927 ,-0.093271, -0.995222, -0.295654 ,0.988659, -0.150144, + -0.003357 ,0.730821, -0.497396, -0.538818 ,-0.781913, -0.621260, + -0.065674 ,-0.655884, -0.753313, -0.073486 ,0.845542, -0.409094, + 0.375977 ,-0.630041, -0.514925, -0.678101 ,0.205571, 0.978634, + -0.019531 ,0.582841, 0.763684, -0.430054 ,0.685084, -0.728464, + 0.000000 ,-0.241437, -0.958430, -0.532898 ,0.741884, 0.020899, + -0.670349 ,0.740273, -0.318412, 0.624634 ,-0.738068, -0.539041, + 0.481812 ,-0.965798, -0.034508, -0.257141 ,0.495184, 0.805372, + 0.549683 ,-0.572524, 0.809558, -0.221008 ,-0.537181, 0.834652, + 0.220825 
,-0.899741, 0.097826, -0.427368 ,-0.370148, 0.494066, + 0.904846 ,0.711387, 0.577688, 0.490356 ,0.183324, -0.722791, + -0.964172 ,0.552815, -0.807753, -0.347351 ,-0.096050, 0.994565, + -0.386047 ,-0.884907, 0.369536, 0.305115 ,-0.832976, -0.551898, + 0.047363 ,0.338883, 0.641922, 0.897034 ,0.805354, 0.506187, 0.357727 , + -0.040128, 0.998805, -0.570923 ,0.466918, -0.602455, 0.811035 ,0.139166, + -0.983697, 0.633362 ,-0.253765, -0.340498, -0.962891 ,-0.448806, + 0.843929, 0.547791 ,-0.859087, -0.434649, -0.300110 ,0.287570, + 0.957661, 0.047729 ,0.379100, 0.795023, 0.780640 ,0.154245, -0.987903, + -0.103088 ,-0.538067, 0.794791, -0.462524 ,-0.466455, -0.180966, + 0.880371 ,-0.175736, -0.983766, 0.202576 ,-0.891655, 0.192080, + -0.417725 ,-0.688716, -0.619004, 0.480652 ,0.120790, -0.987844, + -0.629456 ,-0.075080, 0.983385, 0.910461 ,0.147032, -0.960431, + -0.849304 ,0.732309, 0.671559, 0.152283 ,0.804657, 0.273913, + -0.547729 ,0.391462, -0.913976, 0.263184 ,-0.567300, 0.783128, + 0.409607 ,0.214917, 0.167182, -0.975952 ,0.367428, -0.789995, + -0.800537 ,-0.320112, 0.912727, -0.621399 ,0.659247, -0.647346, + -0.501892 ,0.222842, -0.696452, -0.950562 ,-0.697513, -0.576278, + 0.521118 ,0.602260, -0.756081, 0.391418 ,-0.116043, 0.992942, + 0.206665 ,0.220693, -0.968855, -0.453552 ,0.737991, 0.670137, + 0.106812 ,0.198419, -0.696590, 0.960999 ,-0.391866, -0.883543, + 0.547668 ,0.082067, -0.996213, 0.330200 ,-0.806059, 0.491897, + -0.377991 ,-0.992265, 0.120698, 0.029236 ,0.406622, -0.867524, + 0.575928 ,0.789945, 0.608406, 0.096191 ,-0.531904, -0.004218, + -0.846802 ,0.558298, -0.089427, 0.828125 ,-0.783155, 0.363828, + -0.541382 ,0.981706, -0.183228, 0.052673 ,-0.388642, 0.920618, + -0.096497 ,-0.506403, -0.044662, -0.862000 ,-0.512421, -0.852059, + -0.204163 ,0.559542, 0.339777, 0.803772 ,0.527502, -0.846389, + 0.137573 ,-0.184315, -0.952725, 0.794983 ,0.125024, -0.977110, + -0.809082 ,-0.643507, 0.678632, 0.482056 ,-0.277474, 0.954056, + 0.377380 ,-0.622333, 
-0.717603, 0.448914 ,0.366846, -0.110794, + -0.929382 ,0.120402, 0.992596, 0.131653 ,-0.982921, 0.103550, + -0.152954 ,-0.058333, -0.997913, -0.428894 ,0.132631, 0.979299, + 0.755432 ,0.326398, 0.937806, 0.340637 ,0.211720, 0.976659, 0.168640 , + 0.957557, -0.019174, -0.287659 ,-0.016554, 0.999650, 0.780090 , + -0.271222, 0.827292, -0.875732 ,0.850790, -0.448069, 0.307129 ,0.115949, + 0.600003, -0.989441 ,0.285877, -0.940896, -0.536255 ,-0.321317, + -0.278336, -0.942383 ,-0.422133, 0.754447, 0.765747 ,0.669674, + -0.741852, -0.051514 ,0.213604, -0.949888, 0.730103 ,0.619681, + -0.751798, -0.341797 ,-0.223762, 0.438616, -0.968506 ,-0.302925, + -0.945732, 0.361877 ,0.121093, -0.977151, -0.821838 ,0.127125, + 0.758710, -0.980774 ,0.691682, 0.695626, 0.270203 ,0.241114, 0.967463, + -0.303040 ,-0.829705, 0.422869, 0.402100 ,-0.484170, -0.741723, + 0.692017 ,-0.431259, -0.777492, -0.727844 ,0.835756, -0.211986, + 0.518311 ,0.297724, 0.932993, 0.561829 ,0.633475, -0.764920, + -0.181091 ,-0.833849, -0.453546, -0.353027 ,-0.369433, 0.839581, + -0.733154 ,0.555847, 0.392934, -0.796631 ,-0.856065, 0.028375, + 0.516296 ,0.067161, 0.997565, 0.269409 ,-0.962279, -0.051749, + 0.267456 ,-0.738893, 0.080065, -0.671204 ,-0.764325, 0.462240, + 0.507019 ,0.148758, 0.751545, 0.974243 ,-0.153430, -0.318230, + 0.986816 ,-0.439372, 0.776405, 0.716919 +}; + diff --git a/r5dev/mathlib/powsse.cpp b/r5dev/mathlib/powsse.cpp index 2144f549..3c217c6e 100644 --- a/r5dev/mathlib/powsse.cpp +++ b/r5dev/mathlib/powsse.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. 
============// // // Purpose: // @@ -45,6 +45,7 @@ fltx4 Pow_FixedPoint_Exponent_SIMD(const fltx4& x, int exponent) +#ifndef _PS3 // these aren't fast (or correct) on the PS3 /* * (c) Ian Stephenson * @@ -94,4 +95,7 @@ float FastPow10(float i) { return FastPow2(i * 3.321928f); } +#else +#pragma message("TODO: revisit fast logs on all PPC hardware") +#endif diff --git a/r5dev/mathlib/randsse.cpp b/r5dev/mathlib/randsse.cpp index 85199d58..5469e32b 100644 --- a/r5dev/mathlib/randsse.cpp +++ b/r5dev/mathlib/randsse.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2006, Valve Corporation, All rights reserved. ============// // // Purpose: generates 4 randum numbers in the range 0..1 quickly, using SIMD // @@ -6,7 +6,7 @@ #include "core/stdafx.h" #include "tier0/dbg.h" -#include "tier0/basetypes.h" +#include "tier0/threadtools.h" #include "mathlib/mathlib.h" #include "mathlib/vector.h" #include "mathlib/ssemath.h" @@ -43,7 +43,7 @@ public: fltx4 retval = AddSIMD(*m_pRand_K, *m_pRand_J); // if ( ret>=1.0) ret-=1.0 - fltx4 overflow_mask = CmpGeSIMD(retval, Four_Ones); + bi32x4 overflow_mask = CmpGeSIMD(retval, Four_Ones); retval = SubSIMD(retval, AndSIMD(Four_Ones, overflow_mask)); *m_pRand_K = retval; @@ -86,6 +86,7 @@ int GetSIMDRandContext(void) // try to take it! if (ThreadInterlockedAssignIf(&(s_nRandContextsInUse[i]), 1, 0)) { + ThreadMemoryBarrier(); return i; // done! } } @@ -97,6 +98,7 @@ int GetSIMDRandContext(void) void ReleaseSIMDRandContext(int nContext) { + ThreadMemoryBarrier(); s_nRandContextsInUse[nContext] = 0; } diff --git a/r5dev/mathlib/sseconst.cpp b/r5dev/mathlib/sseconst.cpp index 9305eefa..c3b2b006 100644 --- a/r5dev/mathlib/sseconst.cpp +++ b/r5dev/mathlib/sseconst.cpp @@ -1,13 +1,27 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//===== Copyright � 1996-2005, Valve Corporation, All rights reserved. 
======// // // Purpose: // //===========================================================================// +#if defined(__SPU__) +#include "platform.h" +#include "basetypes.h" +#include "mathlib/mathlib.h" +#include "mathlib/math_pfns.h" +// #include "mathlib/fltx4.h" +#include "ps3/spu_job_shared.h" +#endif + #include "core/stdafx.h" #include "mathlib/ssemath.h" #include "mathlib/ssequaternion.h" +//#include "mathlib/compressed_vector.h" +// NOTE: This has to be the last file included! +//#include "tier0/memdbgon.h" + +#if !defined(__SPU__) const fltx4 Four_PointFives = { 0.5,0.5,0.5,0.5 }; #ifndef _X360 const fltx4 Four_Zeros = { 0.0,0.0,0.0,0.0 }; @@ -23,14 +37,27 @@ const fltx4 Four_2ToThe21s = { (float)(1 << 21), (float)(1 << 21), (float)(1 << const fltx4 Four_2ToThe22s = { (float)(1 << 22), (float)(1 << 22), (float)(1 << 22), (float)(1 << 22) }; const fltx4 Four_2ToThe23s = { (float)(1 << 23), (float)(1 << 23), (float)(1 << 23), (float)(1 << 23) }; const fltx4 Four_2ToThe24s = { (float)(1 << 24), (float)(1 << 24), (float)(1 << 24), (float)(1 << 24) }; - +const fltx4 Four_Thirds = { 0.33333333, 0.33333333, 0.33333333, 0.33333333 }; +const fltx4 Four_TwoThirds = { 0.66666666, 0.66666666, 0.66666666, 0.66666666 }; const fltx4 Four_Point225s = { .225, .225, .225, .225 }; const fltx4 Four_Epsilons = { FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON }; +const fltx4 Four_DegToRad = { ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)) }; const fltx4 Four_FLT_MAX = { FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX }; const fltx4 Four_Negative_FLT_MAX = { -FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX }; const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. 
}; +const fltx4 Four_LinearToGammaCoefficients_A = { -3.7295, -3.7295, -3.7295, -3.7295 }; +const fltx4 Four_LinearToGammaCoefficients_B = { 8.9635, 8.9635, 8.9635, 8.9635 }; +const fltx4 Four_LinearToGammaCoefficients_C = { -7.7397, -7.7397, -7.7397, -7.7397 }; +const fltx4 Four_LinearToGammaCoefficients_D = { 3.443, 3.443, 3.443, 3.443 }; +const fltx4 Four_LinearToGammaCoefficients_E = { 0.048, 0.048, 0.048, 0.048 }; + +const fltx4 Four_GammaToLinearCoefficients_A = { .1731, .1731, .1731, .1731 }; +const fltx4 Four_GammaToLinearCoefficients_B = { .8717, .8717, .8717, .8717 }; +const fltx4 Four_GammaToLinearCoefficients_C = { -.0452, -.0452, -.0452, -.0452 }; +const fltx4 Four_GammaToLinearCoefficients_D = { .0012, .0012, .0012, .0012 }; + const fltx4 g_QuatMultRowSign[4] = { { 1.0f, 1.0f, -1.0f, 1.0f }, @@ -38,20 +65,28 @@ const fltx4 g_QuatMultRowSign[4] = { 1.0f, -1.0f, 1.0f, 1.0f }, { -1.0f, -1.0f, -1.0f, 1.0f } }; +#endif -const uint32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = { 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff }; -const uint32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; -const uint32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }; -const uint32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; -const uint32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0 -const uint32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4 -const uint32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST = +const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = { 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff }; +const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }; 
+const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; +const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0 +const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4 + + +const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST = { { 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF } }; -const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = +const fltx4 g_SIMD_Identity[4] = +{ + { 1.0, 0, 0, 0 }, { 0, 1.0, 0, 0 }, { 0, 0, 1.0, 0 }, { 0, 0, 0, 1.0 } +}; + +const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = { { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 }, @@ -59,6 +94,114 @@ const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 }, }; +const int32 ALIGN16 g_SIMD_EveryOtherMask[4] = { 0, ~0, 0, ~0 }; + + + +#ifdef PLATFORM_PPC + +/// Passed as a parameter to vslh, shuffles the z component of a quat48 stored in the zw words left by one bit. +const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[] = { + 0x00, 0x00, // x word + 0x00, 0x00, // y word + 0x00, 0x01, // z word + 0x00, 0x00 }; // w word + +// this permutes uint16's x,y,z packed in the most significant four halfwords of a fltx4 +// so that each gets its own word in the output. expected use is // __vperm( XX, Four_Threes, permute ) +// -- that way each int is represented as 3.0 + n * 2^-22 , which we can pull into the +// appropriate range with a single madd! +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16] = +{ + 16, 17, 0, 1, // word one: 00XX + 16, 17, 2, 3, // word two: 00YY + 16, 17, 4, 5, // word three: 00ZZ + 16, 17, 6, 7 // word four: 00WW +}; + +// the other permutes are a little trickier. note: I'm defining them out of order. 
+// 2 and 5 blend together prior results, rather than a source with 3.0f + +// out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16] = +{ + 16, 17, 6, 7, // word one: 00XX + 16, 17, 8, 9, // word two: 00YY + 16, 17, 10, 11, // word three: 00ZZ + 16, 17, 12, 13 // word four: 00WW +}; + +// out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2 +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16] = +{ + 16, 17, 2, 3, + 16, 17, 4, 5, + 16, 17, 6, 7, + 16, 17, 0, 1 +}; + +// out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5 +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16] = +{ + 16, 17, 8, 9, + 16, 17, 10, 11, + 16, 17, 12, 13, + 16, 17, 14, 15 +}; + +// out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16] = +{ + 16, 17, 4, 5, // word one + 16, 17, 6, 7, // word two + 16, 17, 8, 9, // word three + 16, 17, 10, 11 // word four (garbage) +}; + +// out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16] = +{ + 16, 17, 10, 11, // word one + 16, 17, 12, 13, // word two + 16, 17, 14, 15, // word three + 16, 17, 16, 17 // word four (garbage) +}; + +// these last two are tricky because we mix old output with source input. we get the 3.0f +// from the old output. 
+// out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16] = +{ + 16, 17, 12, 13, // 3.x2 + 16, 17, 14, 15, // 3.y2 + 16, 17, 30, 31, // 3.z2 (from out2) + 16, 17, 16, 17 +}; + +// out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute5) ) // __x5__y5__z5____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16] = +{ + 16, 17, 30, 31, // 3.x5 (from out5) + 16, 17, 0, 1, // 3.y5 + 16, 17, 2, 3, // 3.z5 + 16, 17, 16, 17 // garbage +}; + + +// magic constants that we use to convert the unpacked q48 components from 2 + n * 2^-22 (where n = 0 .. 65535) +// to -1.0 .. 1 +#define UnpackMul16s ( (1 << 22) / 32767.5 ) +#define UnpackAdd16s ( ( -UnpackMul16s * 3.0 ) - 1 ) +// we put the constants all into one word to save a little memory bandwidth +// but otherwise it would look like this: +// static const fltx4 vUpkMul = { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s }; +// static const fltx4 vUpkAdd = { UnpackAdd16s , UnpackAdd16s , UnpackAdd16s , UnpackAdd16s }; +const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants = { UnpackMul16s , UnpackAdd16s, 0, 0 }; +#undef UnpackMul16s +#undef UnpackAdd16s + +#endif + // FUNCTIONS // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE @@ -82,7 +225,7 @@ const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = // function is more than one screen long, yours is probably not one // of those occasions. - +#if !defined(__SPU__) /// You can use this to rotate a long array of FourVectors all by the same /// matrix. The first parameter is the head of the array. The second is the @@ -122,7 +265,7 @@ void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numV matSplat22 = SplatZSIMD(matCol2); } -#ifdef _X360 +#if defined(_X360) || defined(_PS3) // Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies // and simplify prefetching. 
Named variables are deliberately used instead of arrays to
 // ensure that the variables live on the registers instead of the stack (stack load/store
@@ -216,6 +359,172 @@ void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numV
 #endif
 }
 
+// Get the closest point from P to the (infinite) line through vLineA and vLineB and
+// calculate the shortest distance from P to the line.
+// If you pass in a value for t, it will tell you the t for (A + (B-A)t) to get the closest point.
+// If the closest point lies on the segment between A and B, then 0 <= t <= 1.
+void FourVectors::CalcClosestPointOnLineSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vClosest, fltx4* outT)
+{
+	FourVectors vDir;
+	fltx4 t = CalcClosestPointToLineTSIMD(P, vLineA, vLineB, vDir);
+	if (outT) *outT = t;
+	vClosest = vDir;
+	vClosest *= t;
+	vClosest += vLineA;
+}
+
+fltx4 FourVectors::CalcClosestPointToLineTSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vDir)
+{
+	Assert(s_bMathlibInitialized);
+	vDir = vLineB;
+	vDir -= vLineA;
+
+	fltx4 div = vDir * vDir;
+	bi32x4 Mask;
+	fltx4 Compare = ReplicateX4(0.00001f);
+	fltx4 result;
+	Mask = CmpLtSIMD(div, Compare);
+
+	result = DivSIMD(SubSIMD(vDir * P, vDir * vLineA), div);
+
+	result = MaskedAssign(Mask, Four_Zeros, result);
+	return result;
+}
+
+void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut)
+{
+	Assert(numVectors > 0);
+	if (numVectors == 0)
+		return;
+
+	// Splat out each of the entries in the matrix to a fltx4. Do this
+	// in the order that we will need them, to hide latency. I'm
+	// avoiding making an array of them, so that they'll remain in
+	// registers.
+	fltx4 matSplat00, matSplat01, matSplat02,
+		matSplat10, matSplat11, matSplat12,
+		matSplat20, matSplat21, matSplat22;
+
+	{
+		// Load the matrix into local vectors.
Sadly, matrix3x4_ts are
+		// often unaligned. The w components will be the transpose row of
+		// the matrix, but we don't really care about that.
+		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+		matSplat00 = SplatXSIMD(matCol0);
+		matSplat01 = SplatYSIMD(matCol0);
+		matSplat02 = SplatZSIMD(matCol0);
+
+		matSplat10 = SplatXSIMD(matCol1);
+		matSplat11 = SplatYSIMD(matCol1);
+		matSplat12 = SplatZSIMD(matCol1);
+
+		matSplat20 = SplatXSIMD(matCol2);
+		matSplat21 = SplatYSIMD(matCol2);
+		matSplat22 = SplatZSIMD(matCol2);
+	}
+
+#if defined(_X360) || defined(_PS3)
+	// Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
+	// and simplify prefetching. Named variables are deliberately used instead of arrays to
+	// ensure that the variables live on the registers instead of the stack (stack load/store
+	// is a serious penalty on 360). Nb: for prefetching to be most efficient here, the
+	// loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is
+	// 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line.
+	// That way you can fetch the next 3 cache lines while you work on these three.
+	// If you do go this route, be sure to disassemble and make sure it doesn't spill
+	// registers to stack as you do this; the cost of that will be excessive. Unroll the loop
+	// a little and just live with the fact that you'll be doing a couple of redundant dcbts
+	// (they don't cost you anything). Be aware that all three cores share L2 and it can only
+	// have eight cache lines fetching at a time.
+	fltx4 outX0, outY0, outZ0; // bank one of outputs
+	fltx4 outX1, outY1, outZ1; // bank two of outputs
+
+
+	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
+	// rather than madds. (Empirically determined by timing.)
+ const FourVectors* stop = pVectors + numVectors; + FourVectors* RESTRICT pVectNext; + FourVectors* RESTRICT pOutNext; + // prime the pump. + if (numVectors & 0x01) + { + // odd number of vectors to process + // prime the 1 group of registers + pVectNext = pVectors++; + pOutNext = pOut++; + outX1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01)), MulSIMD(pVectNext->z, matSplat02)); + outY1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11)), MulSIMD(pVectNext->z, matSplat12)); + outZ1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21)), MulSIMD(pVectNext->z, matSplat22)); + } + else + { + // even number of total vectors to process; + // prime the zero group and jump into the middle of the loop + outX0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)), MulSIMD(pVectors->z, matSplat02)); + outY0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)), MulSIMD(pVectors->z, matSplat12)); + outZ0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)), MulSIMD(pVectors->z, matSplat22)); + goto EVEN_CASE; + } + + // perform an even number of iterations through this loop. 
+ while (pVectors < stop) + { + outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01))); + outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11))); + outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21))); + + pOutNext->x = outX1; + pOutNext->y = outY1; + pOutNext->z = outZ1; + + EVEN_CASE: + pVectNext = pVectors + 1; + pOutNext = pOut + 1; + + outX1 = MaddSIMD(pVectNext->z, matSplat02, AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01))); + outY1 = MaddSIMD(pVectNext->z, matSplat12, AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11))); + outZ1 = MaddSIMD(pVectNext->z, matSplat22, AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21))); + + pOut->x = outX0; + pOut->y = outY0; + pOut->z = outZ0; + + pVectors += 2; + pOut += 2; + } + + // flush the last round of output + pVectNext->x = outX1; + pVectNext->y = outY1; + pVectNext->z = outZ1; +#else + // PC does not benefit from the unroll/scheduling above + fltx4 outX0, outY0, outZ0; // bank one of outputs + + + // Because of instruction latencies and scheduling, it's actually faster to use adds and muls + // rather than madds. (Empirically determined by timing.) + const FourVectors* stop = pVectors + numVectors; + + // perform an even number of iterations through this loop. 
+ while (pVectors < stop) + { + outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01))); + outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11))); + outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21))); + + pOut->x = outX0; + pOut->y = outY0; + pOut->z = outZ0; + pVectors++; + pOut++; + } +#endif +} + #ifdef _X360 // Loop-scheduled code to process FourVectors in groups of eight quite efficiently. void FourVectors_TransformManyGroupsOfEightBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut) @@ -1162,4 +1471,9 @@ void TransformManyPointsBy(VectorAligned* RESTRICT pVectors, unsigned int numVec } + +#endif // #if !defined(__SPU__) + + + #endif diff --git a/r5dev/mathlib/ssemath.h b/r5dev/mathlib/ssemath.h index aa4186bd..5d27f38e 100644 --- a/r5dev/mathlib/ssemath.h +++ b/r5dev/mathlib/ssemath.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======// // // Purpose: - defines SIMD "structure of arrays" classes and functions. 
// @@ -8,60 +8,24 @@ #if defined( _X360 ) #include +#elif defined ( _PS3 ) +#include +#include #else #include +#ifndef _LINUX +#include +#endif #endif -#include -#include - -#if defined(GNUC) -#define USE_STDC_FOR_SIMD 0 +#ifndef SPU +#include "mathlib/vector.h" +#include "mathlib/mathlib.h" #else -#define USE_STDC_FOR_SIMD 0 +#include "mathlib/math_pfns.h" #endif -#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0)) -#define _SSE1 1 -#endif - -// I thought about defining a class/union for the SIMD packed floats instead of using fltx4, -// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur -// the relationship between packed floats and packed integer types and (b) not sure that the -// compiler would handle generating good code for the intrinsics. - -#if USE_STDC_FOR_SIMD - -typedef union -{ - float m128_f32[4]; - uint32 m128_u32[4]; -} fltx4; - -typedef fltx4 i32x4; -typedef fltx4 u32x4; - -#elif ( defined( _X360 ) ) - -typedef union -{ - // This union allows float/int access (which generally shouldn't be done in inner loops) - __vector4 vmx; - float m128_f32[4]; - uint32 m128_u32[4]; -} fltx4_union; - -typedef __vector4 fltx4; -typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops. -typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops. - -#else - -typedef __m128 fltx4; -typedef __m128 i32x4; -typedef __m128 u32x4; - -#endif +#include "mathlib/fltx4.h" // The FLTX4 type is a fltx4 used as a parameter to a function. // On the 360, the best way to do this is pass-by-copy on the registers. @@ -71,6 +35,8 @@ typedef __m128 u32x4; // explicitly use a FLTX4 as the parameter type. 
#ifdef _X360 typedef __vector4 FLTX4; +#elif defined( _PS3 ) +typedef vec_float4 FLTX4; #else typedef const fltx4& FLTX4; #endif @@ -101,7 +67,7 @@ struct ALIGN16 intx4 return m_i32; } - inline const bool operator==(const intx4& other) const + inline bool operator==(const intx4& other) const { return m_i32[0] == other.m_i32[0] && m_i32[1] == other.m_i32[1] && @@ -134,7 +100,33 @@ FORCEINLINE void TestVPUFlags() {} // but are manufactured directly in one or two // instructions, saving a load and possible L2 // miss.) -#ifndef _X360 + +#ifdef _X360 +// Shouldn't the PS3 have something similar? +#define Four_Zeros XMVectorZero() // 0 0 0 0 +#define Four_Ones XMVectorSplatOne() // 1 1 1 1 +extern const fltx4 Four_Twos; // 2 2 2 2 +extern const fltx4 Four_Threes; // 3 3 3 3 +extern const fltx4 Four_Fours; // guess. +extern const fltx4 Four_Point225s; // .225 .225 .225 .225 +extern const fltx4 Four_PointFives; // .5 .5 .5 .5 +extern const fltx4 Four_Thirds; // 1/3 +extern const fltx4 Four_TwoThirds; // 2/3 +extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four +#elif defined(SPU) +#define Four_Zeros spu_splats( 0.0f ) // 0 0 0 0 +#define Four_Ones spu_splats( 1.0f ) // 1 1 1 1 +#define Four_Twos spu_splats( 2.0f ) // 2 2 2 2 +#define Four_Threes spu_splats( 3.0f ) // 3 3 3 3 +#define Four_Fours spu_splats( 4.0f ) // guess. 
+#define Four_Point225s spu_splats( 0.225f ) // .225 .225 .225 .225 +#define Four_PointFives spu_splats( 0.5f ) // .5 .5 .5 .5 +#define Four_Thirds spu_splats( 0.33333333 ); // 1/3 +#define Four_TwoThirds spu_splats( 0.66666666 ); // 2/3 +#define Four_NegativeOnes spu_splats( -1.0f ) // -1 -1 -1 -1 +#define Four_DegToRad spu_splats((float)(M_PI_F / 180.f)) +#else extern const fltx4 Four_Zeros; // 0 0 0 0 extern const fltx4 Four_Ones; // 1 1 1 1 extern const fltx4 Four_Twos; // 2 2 2 2 @@ -142,46 +134,56 @@ extern const fltx4 Four_Threes; // 3 3 3 3 extern const fltx4 Four_Fours; // guess. extern const fltx4 Four_Point225s; // .225 .225 .225 .225 extern const fltx4 Four_PointFives; // .5 .5 .5 .5 -extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON -extern const fltx4 Four_2ToThe21s; // (1<<21).. -extern const fltx4 Four_2ToThe22s; // (1<<22).. -extern const fltx4 Four_2ToThe23s; // (1<<23).. -extern const fltx4 Four_2ToThe24s; // (1<<24).. -extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) -extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 -#else -#define Four_Zeros XMVectorZero() // 0 0 0 0 -#define Four_Ones XMVectorSplatOne() // 1 1 1 1 -extern const fltx4 Four_Twos; // 2 2 2 2 -extern const fltx4 Four_Threes; // 3 3 3 3 -extern const fltx4 Four_Fours; // guess. -extern const fltx4 Four_Point225s; // .225 .225 .225 .225 -extern const fltx4 Four_PointFives; // .5 .5 .5 .5 -extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON -extern const fltx4 Four_2ToThe21s; // (1<<21).. -extern const fltx4 Four_2ToThe22s; // (1<<22).. -extern const fltx4 Four_2ToThe23s; // (1<<23).. -extern const fltx4 Four_2ToThe24s; // (1<<24).. 
-extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) +extern const fltx4 Four_Thirds; // 1/3 +extern const fltx4 Four_TwoThirds; // 2/3 extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four #endif +extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON +extern const fltx4 Four_2ToThe21s; // (1<<21).. +extern const fltx4 Four_2ToThe22s; // (1<<22).. +extern const fltx4 Four_2ToThe23s; // (1<<23).. +extern const fltx4 Four_2ToThe24s; // (1<<24).. +extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX extern const fltx4 g_SIMD_0123; // 0 1 2 3 as float + +// coefficients for polynomial approximation of srgb conversions + +// 4th order polynomial for x^(1/2.2), x in 0..1 +extern const fltx4 Four_LinearToGammaCoefficients_A; // *x^4 +extern const fltx4 Four_LinearToGammaCoefficients_B; // *x^3 +extern const fltx4 Four_LinearToGammaCoefficients_C; // *x^2 +extern const fltx4 Four_LinearToGammaCoefficients_D; // *x^1 +extern const fltx4 Four_LinearToGammaCoefficients_E; // *x^0 + +// 3rd order polynomial for x^2.2 x in 0..1 +extern const fltx4 Four_GammaToLinearCoefficients_A; // *x^3 +extern const fltx4 Four_GammaToLinearCoefficients_B; // *x^2 +extern const fltx4 Four_GammaToLinearCoefficients_C; // *x^1 +extern const fltx4 Four_GammaToLinearCoefficients_D; // *x^0 + + // external aligned integer constants -extern const ALIGN16 uint32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4 -extern const ALIGN16 uint32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4 -extern const ALIGN16 uint32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4 -extern const ALIGN16 uint32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0 -extern const ALIGN16 uint32 
g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF] -extern const ALIGN16 uint32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0 -extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 +#ifndef ALIGN16_POST +#define ALIGN16_POST +#endif +extern const ALIGN16 int32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4 +extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4 +extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4 +extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0 +extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF] +extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0 +extern const fltx4 g_SIMD_Identity[4]; // [1 0 0 0], [0 1 0 0], [0 0 1 0], [0 0 0 1] +extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 // this mask is used for skipping the tail of things. If you have N elements in an array, and wish // to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you want to use for the last iteration. -extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; +extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; +extern const int32 ALIGN16 g_SIMD_EveryOtherMask[]; // 0, ~0, 0, ~0 // Define prefetch macros. // The characteristics of cache and prefetch are completely // different between the different platforms, so you DO NOT @@ -191,12 +193,62 @@ extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; // a higher level code change. // On the other hand, I'm tired of typing #ifdef _X360 // all over the place, so this is just a nop on Intel, PS3. 
-#ifdef _X360 +#ifdef PLATFORM_PPC +#if defined(_X360) #define PREFETCH360(address, offset) __dcbt(offset,address) +#elif defined(_PS3) +#define PREFETCH360(address, offset) __dcbt( reinterpret_cast< const char * >(address) + offset ) +#else +#error Prefetch not defined for this platform! +#endif #else #define PREFETCH360(x,y) // nothing #endif +// Here's a handy function to align a pointer to the next +// sixteen byte boundary -- it'll round it up to the nearest +// multiple of 16. This is useful if you're subdividing +// big swaths of allocated memory, but in that case, remember +// to leave yourself the necessary slack in the allocation. +template +inline T* AlignPointer(void* ptr) +{ +#if defined( __clang__ ) + uintp temp = (uintp)ptr; +#else + unsigned temp = ptr; +#endif + temp = ALIGN_VALUE(temp, sizeof(T)); + return (T*)temp; +} + +#ifdef _PS3 + +// Note that similar defines exist in math_pfns.h +// Maybe we should consolidate in one place for all platforms. + +#define _VEC_CLEAR_SIGNMASK (__vector unsigned int) {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff} +#define _VEC_SIGNMASK (__vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } +#define _VEC_LSBMASK (__vector unsigned int) { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe } +#define _VEC_CLEAR_WMASK (__vector unsigned int) {0xffffffff, 0xffffffff, 0xffffffff, 0} +#define _VEC_COMPONENT_MASK_0 (__vector unsigned int) {0xffffffff, 0, 0, 0} +#define _VEC_COMPONENT_MASK_1 (__vector unsigned int) {0, 0xffffffff, 0, 0} +#define _VEC_COMPONENT_MASK_2 (__vector unsigned int) {0, 0, 0xffffffff, 0} +#define _VEC_COMPONENT_MASK_3 (__vector unsigned int) {0, 0, 0, 0xffffffff} + +#define _VEC_SWIZZLE_WZYX (__vector unsigned char) { 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b, 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03 } +#define _VEC_SWIZZLE_ZWXY (__vector unsigned char) { 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07 } +#define _VEC_SWIZZLE_YXWZ (__vector 
unsigned char) { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b } + +#define _VEC_ZERO (__vector unsigned int) {0,0,0,0} + +#define _VEC_FLTMAX (__vector float) {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX} +#define _VEC_FLTMIN (__vector float) {FLT_MIN,FLT_MIN,FLT_MIN,FLT_MIN} + +#define _VEC_ORIGIN (__vector unsigned int) { 0x00000000, 0x00000000, 0x00000000, 0xffffffff } + +#endif + #if USE_STDC_FOR_SIMD //--------------------------------------------------------------------- @@ -310,6 +362,7 @@ FORCEINLINE fltx4 SetComponentSIMD(const fltx4& a, int nComponent, float flValue return result; } + // a b c d -> b c d a FORCEINLINE fltx4 RotateLeft(const fltx4& a) { @@ -368,6 +421,10 @@ FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b BINOP(/ ); } +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // a/b +{ + BINOP(/ ); +} FORCEINLINE fltx4 MaddSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // a*b + c { @@ -528,6 +585,15 @@ FORCEINLINE bool IsAllEqual(const fltx4& a, const fltx4& b) SubFloat(a, 3) == SubFloat(b, 3); } +// For branching if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w +FORCEINLINE bool IsAnyEqual(const fltx4& a, const fltx4& b) +{ + return SubFloat(a, 0) == SubFloat(b, 0) || + SubFloat(a, 1) == SubFloat(b, 1) || + SubFloat(a, 2) == SubFloat(b, 2) || + SubFloat(a, 3) == SubFloat(b, 3); +} + FORCEINLINE int TestSignSIMD(const fltx4& a) // mask of which floats have the high bit set { int nRet = 0; @@ -545,6 +611,11 @@ FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) | return (0 != TestSignSIMD(a)); } +FORCEINLINE bool IsAnyTrue(const fltx4& a) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD(a)); +} + FORCEINLINE fltx4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? 
~0:0 { fltx4 retVal; @@ -806,6 +877,14 @@ FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) return *(reinterpret_cast (pSIMD)); } +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + fltx4 retval; + SubFloat(retval, 0) = *pFlt; + return retval; +} + FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) { return *(reinterpret_cast (pSIMD)); @@ -820,6 +899,14 @@ FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned& pSIMD) return retval; } + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + fltx4 retval = { x, y, z, w }; + return retval; +} + FORCEINLINE void StoreAlignedSIMD(float* pSIMD, const fltx4& a) { *(reinterpret_cast (pSIMD)) = a; @@ -830,6 +917,11 @@ FORCEINLINE void StoreUnalignedSIMD(float* pSIMD, const fltx4& a) *(reinterpret_cast (pSIMD)) = a; } +FORCEINLINE void StoreUnalignedFloat(float* pSingleFloat, const fltx4& a) +{ + *pSingleFloat = SubFloat(a, 0); +} + FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) { *pSIMD = SubFloat(a, 0); @@ -837,12 +929,41 @@ FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) *(pSIMD + 2) = SubFloat(a, 2); } + // strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) { StoreAlignedSIMD(pSIMD->Base(), a); } +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. 
+FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (though obviously pDestination+1 is not). +FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + + FORCEINLINE void TransposeSIMD(fltx4& x, fltx4& y, fltx4& z, fltx4& w) { #define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; } @@ -924,6 +1045,19 @@ FORCEINLINE void StoreUnalignedIntSIMD(int32* pSIMD, const fltx4& a) *(reinterpret_cast<i32x4*> (pSIMD)) = a; } +// Load four consecutive uint16's, and turn them into floating point numbers. +// This function isn't especially fast and could be made faster if anyone is +// using it heavily. +FORCEINLINE fltx4 LoadAndConvertUint16SIMD(const uint16* pInts) +{ + fltx4 retval; + SubFloat(retval, 0) = pInts[0]; + SubFloat(retval, 1) = pInts[1]; + SubFloat(retval, 2) = pInts[2]; + SubFloat(retval, 3) = pInts[3]; + return retval; +} + + // Take a fltx4 containing fixed-point uints and // return them as single precision floats. No // fixed point conversion is done.
@@ -931,10 +1065,10 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const u32x4& vSrcA) { Assert(0); /* pc has no such operation */ fltx4 retval; - SubFloat(retval, 0) = ((float)SubInt(retval, 0)); - SubFloat(retval, 1) = ((float)SubInt(retval, 1)); - SubFloat(retval, 2) = ((float)SubInt(retval, 2)); - SubFloat(retval, 3) = ((float)SubInt(retval, 3)); + SubFloat(retval, 0) = ((float)SubInt(vSrcA, 0)); + SubFloat(retval, 1) = ((float)SubInt(vSrcA, 1)); + SubFloat(retval, 2) = ((float)SubInt(vSrcA, 2)); + SubFloat(retval, 3) = ((float)SubInt(vSrcA, 3)); return retval; } @@ -974,14 +1108,1449 @@ FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4& vSrcA, const i32x4& vSrcB) return retval; } + #endif +#elif ( defined( _PS3 ) ) +#define SN_IMPROVED_INTRINSICS ( (( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )) ||\ + (defined(__SN_VER__) && (__SN_VER__ > 25002)) ) + +//--------------------------------------------------------------------- +// PS3 implementation +//--------------------------------------------------------------------- + +FORCEINLINE float FloatSIMD(fltx4& a, int idx) +{ +#if SN_IMPROVED_INTRINSICS + return vec_extract(a, idx); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + return a_union.m128_f32[idx]; +#endif +} + +FORCEINLINE unsigned int UIntSIMD(u32x4& a, int idx) +{ +#if SN_IMPROVED_INTRINSICS + return vec_extract(a, idx); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxui); + return a_union.m128_u32[idx]; +#endif +} + +FORCEINLINE fltx4 AddSIMD(const fltx4& a, const fltx4& b) +{ + return vec_add(a, b); +} + +FORCEINLINE fltx4 SubSIMD(const fltx4& a, const fltx4& b) // a-b +{ + return vec_sub(a, b); +} + +FORCEINLINE fltx4 MulSIMD(const fltx4& a, const fltx4& b) // a*b +{ + return vec_madd(a, b, _VEC_ZEROF); +} + +FORCEINLINE fltx4 MaddSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // a*b + c +{ + return vec_madd(a, b, c); +} + +FORCEINLINE fltx4 MsubSIMD(const fltx4& a, const fltx4& b, const 
fltx4& c) // c - a*b +{ + return vec_nmsub(a, b, c); +}; + +FORCEINLINE fltx4 Dot3SIMD(const fltx4& a, const fltx4& b) +{ + // oliviern: it seems that this code could be optimized + // (or maybe the latency will slow down if there is nothing to put in between) + // Something like that (to verify on PS3 and SPU): + // result2 = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3 + // result = vec_add(vec_sld(result2, result2, 4), result2); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0) + // result = vec_add(vec_sld(result2, result2, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2), (a1 * b1) + (a2 * b2) + (a3 * b3), (a2 * b2) + (a3 * b3) + (a0 * b0), (a3 * b3) + (a0 * b0) + ... + // result = vec_splat(result, 0); // DotProduct3... + // 6 SIMD instructions instead of 8 (but again with potentially one more latency - it depends if other stuff can be interleaved in between). + // It may still be a bit faster in the worst case. + + fltx4 result; + + result = vec_madd(a, b, _VEC_ZEROF); + result = vec_madd(vec_sld(a, a, 4), vec_sld(b, b, 4), result); + result = vec_madd(vec_sld(a, a, 8), vec_sld(b, b, 8), result); + + // replicate across all + result = vec_splat(result, 0); + + return result; +} + +FORCEINLINE fltx4 Dot4SIMD(const fltx4& a, const fltx4& b) +{ + // See comment in Dot3SIMD, we could reduce to 6 SIMD instructions instead of 7 (but again with potentially one more latency). + // result = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3 + // result = vec_add(vec_sld(result, result, 4), result); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0) + // result = vec_add(vec_sld(result, result, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2) + (a3 * b3), ... + // result = vec_splat(result, 0); // DotProduct3... 
+ // 6 SIMD instructions instead of 7 (but again with potentially one more latency - it depends if other stuff can be interleaved in between). + // It may be a wash in the worst case. + + fltx4 result; + + result = vec_madd(a, b, _VEC_ZEROF); + result = vec_madd(vec_sld(a, a, 4), vec_sld(b, b, 4), result); + result = vec_add(vec_sld(result, result, 8), result); + + // replicate across all + result = vec_splat(result, 0); + + return result; +} + +FORCEINLINE fltx4 SinSIMD(const fltx4& radians) +{ + return sinf4(radians); +} + +FORCEINLINE void SinCos3SIMD(fltx4& sine, fltx4& cosine, const fltx4& radians) +{ + sincosf4(radians, &sine, &cosine); +} + +FORCEINLINE void SinCosSIMD(fltx4& sine, fltx4& cosine, const fltx4& radians) // a*b + c +{ + sincosf4(radians, &sine, &cosine); +} + +FORCEINLINE fltx4 ArcCosSIMD(const fltx4& cs) +{ + return acosf4(cs); +} + +FORCEINLINE fltx4 ArcTan2SIMD(const fltx4& a, const fltx4& b) +{ + return atan2f4(a, b); +} + +FORCEINLINE fltx4 ArcSinSIMD(const fltx4& sine) +{ + return asinf4(sine); +} + +// DivSIMD defined further down, since it uses ReciprocalSIMD + +FORCEINLINE fltx4 MaxSIMD(const fltx4& a, const fltx4& b) // max(a,b) +{ + return vec_max(a, b); +} +FORCEINLINE fltx4 MinSIMD(const fltx4& a, const fltx4& b) // min(a,b) +{ + return vec_min(a, b); +} + +FORCEINLINE fltx4 AndSIMD(const fltx4& a, const fltx4& b) // a & b +{ + return vec_and(a, b); +} +FORCEINLINE fltx4 AndSIMD(const bi32x4& a, const fltx4& b) // a & b +{ + return vec_and((fltx4)a, b); +} +FORCEINLINE fltx4 AndSIMD(const fltx4& a, const bi32x4& b) // a & b +{ + return vec_and(a, (fltx4)b); +} +FORCEINLINE bi32x4 AndSIMD(const bi32x4& a, const bi32x4& b) // a & b +{ + return vec_and(a, b); +} + +#if 0 +FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const fltx4& b) // ~a & b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return vec_andc(b, a); +} +FORCEINLINE fltx4 AndNotSIMD(const bi32x4& a, const fltx4& b) // ~a & 
b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return vec_andc(b, (fltx4)a); +} +FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const bi32x4& b) // ~a & b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return (fltx4)vec_andc(b, (bi32x4)a); +} +FORCEINLINE bi32x4 AndNotSIMD(const bi32x4& a, const bi32x4& b) // ~a & b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return vec_andc(b, a); +} +#else +template< typename T, typename U > +FORCEINLINE T AndNotSIMD(const T& a, const U& b) // ~a & b +{ + return vec_andc(b, (T)a); +} + +// specialize for the case of bi, flt +FORCEINLINE fltx4 AndNotSIMD(const bi32x4& a, const fltx4& b) // ~a & b +{ + return vec_andc(b, (fltx4)a); +} +#endif + +FORCEINLINE fltx4 XorSIMD(const fltx4& a, const fltx4& b) // a ^ b +{ + return vec_xor(a, b); +} +FORCEINLINE fltx4 XorSIMD(const bi32x4& a, const fltx4& b) // a ^ b +{ + return vec_xor((fltx4)a, b); +} +FORCEINLINE fltx4 XorSIMD(const fltx4& a, const bi32x4& b) // a ^ b +{ + return vec_xor(a, (fltx4)b); +} +FORCEINLINE bi32x4 XorSIMD(const bi32x4& a, const bi32x4& b) // a ^ b +{ + return vec_xor(a, b); +} + +FORCEINLINE fltx4 OrSIMD(const fltx4& a, const fltx4& b) // a | b +{ + return vec_or(a, b); +} +FORCEINLINE fltx4 OrSIMD(const bi32x4& a, const fltx4& b) // a | b +{ + return vec_or((fltx4)a, b); +} +FORCEINLINE fltx4 OrSIMD(const fltx4& a, const bi32x4& b) // a | b +{ + return vec_or(a, (fltx4)b); +} +FORCEINLINE i32x4 OrSIMD(const i32x4& a, const i32x4& b) // a | b +{ + return vec_or(a, b); +} +FORCEINLINE u32x4 OrSIMD(const u32x4& a, const u32x4& b) // a | b +{ + return vec_or(a, b); +} + +#if !defined(__SPU__) // bi32x4 typedef to same as u32x4 on SPU +FORCEINLINE bi32x4 OrSIMD(const bi32x4& a, const bi32x4& b) // a | b +{ + return vec_or(a, b); +} +#endif + +FORCEINLINE fltx4 NegSIMD(const fltx4& a) // negate: -a +{ + 
return(SubSIMD(_VEC_ZEROF, a)); + + // untested + // vec_float4 signMask; + // vec_float4 result; + // signMask = vec_splat_s32(-1); + // signMask = vec_sll(signMask, signMask); + // result = vec_xor(a, signMask); + // return result; +} + +FORCEINLINE bool IsAnyZeros(const fltx4& a) // any floats are zero? +{ + return vec_any_eq(a, _VEC_ZEROF); +} + +FORCEINLINE bool IsAnyZeros(const bi32x4& a) // any floats are zero? +{ + return vec_any_eq((u32x4)a, _VEC_ZERO); +} + +FORCEINLINE bool IsAllZeros(const bi32x4& a) // all floats of a zero? +{ + return vec_all_eq((u32x4)a, _VEC_ZERO); +} + +FORCEINLINE bool IsAnyXYZZero(const fltx4& a) // are any of x,y,z zero? +{ +#if SN_IMPROVED_INTRINSICS + + // push 1.0 into w (NON-ZERO) + fltx4 b = vec_insert(1.0f, a, 3); + + return vec_any_eq(b, _VEC_ZEROF); +#else + fltx4 b = vec_perm(a, _VEC_ONEF, _VEC_PERMUTE_XYZ0W1); + return vec_any_eq(b, _VEC_ZEROF); +#endif +} + +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAllGreaterThan(const fltx4& a, const fltx4& b) +{ + return vec_all_gt(a, b); +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAllGreaterThanOrEq(const fltx4& a, const fltx4& b) +{ + return vec_all_ge(a, b); +} + +FORCEINLINE bool IsAllEqual(const fltx4& a, const fltx4& b) +{ + return vec_all_eq(a, b); +} + + +FORCEINLINE int TestSignSIMD(const fltx4& a) // mask of which floats have the high bit set +{ + // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though) + int nRet = 0; + + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + + nRet |= (a_union.m128_u32[0] & 0x80000000) >> 31; // sign(x) -> bit 0 + nRet |= (a_union.m128_u32[1] & 0x80000000) >> 30; // sign(y) -> bit 1 + nRet |= (a_union.m128_u32[2] & 0x80000000) >> 29; // sign(z) -> bit 2 + nRet |= (a_union.m128_u32[3] & 0x80000000) >> 28; // sign(w) -> bit 3 + + return nRet; +} +FORCEINLINE int TestSignSIMD(const bi32x4& a) // mask of which floats have the high bit set +{ + // NOTE: this maps to 
SSE way better than it does to VMX (most code uses IsAnyNegative(), though) + int nRet = 0; + + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxbi); + + nRet |= (a_union.m128_u32[0] & 0x80000000) >> 31; // sign(x) -> bit 0 + nRet |= (a_union.m128_u32[1] & 0x80000000) >> 30; // sign(y) -> bit 1 + nRet |= (a_union.m128_u32[2] & 0x80000000) >> 29; // sign(z) -> bit 2 + nRet |= (a_union.m128_u32[3] & 0x80000000) >> 28; // sign(w) -> bit 3 + + return nRet; +} + +FORCEINLINE bool IsAnyNegative(const bi32x4& a) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD(a)); +} + +// Squelch the w component of a vector to +0.0. +// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) +FORCEINLINE fltx4 SetWToZeroSIMD(const fltx4& a) +{ + return (fltx4)vec_and((u32x4)a, _VEC_CLEAR_WMASK); +} +FORCEINLINE bi32x4 SetWToZeroSIMD(const bi32x4& a) +{ + return (bi32x4)vec_and((u32x4)a, _VEC_CLEAR_WMASK); +} + +FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + // NOTE: this tests the top bits of each vector element using integer math + // (so it ignores NaNs - it will return true for "-NaN") + return vec_any_lt(a, _VEC_ZEROF); +} + +FORCEINLINE bool IsAnyTrue(const fltx4& a) +{ + return vec_any_ne(a, _VEC_ZEROF); +} + +#ifdef DIFFERENT_NATIVE_VECTOR_TYPES + +FORCEINLINE bool IsAnyTrue(const bi32x4& a) +{ + return vec_any_ne((vector unsigned int) a, _VEC_0L); +} + +#endif + +FORCEINLINE bi32x4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? ~0:0 +{ + return (bi32x4)vec_cmpeq(a, b); +} +FORCEINLINE bi32x4 CmpEqSIMD(const i32x4& a, const i32x4& b) // (a==b) ? ~0:0 +{ + return (bi32x4)vec_cmpeq(a, b); +} +FORCEINLINE bi32x4 CmpEqSIMD(const u32x4& a, const u32x4& b) // (a==b) ? ~0:0 +{ + return (bi32x4)vec_cmpeq(a, b); +} + +FORCEINLINE bi32x4 CmpGtSIMD(const fltx4& a, const fltx4& b) // (a>b) ? 
~0:0 +{ + return (bi32x4)vec_cmpgt(a, b); +} +FORCEINLINE bi32x4 CmpGtSIMD(const i32x4& a, const i32x4& b) // (a>b) ? ~0:0 +{ + return (bi32x4)vec_cmpgt(a, b); +} +FORCEINLINE bi32x4 CmpGtSIMD(const u32x4& a, const u32x4& b) // (a>b) ? ~0:0 +{ + return (bi32x4)vec_cmpgt(a, b); +} + +FORCEINLINE bi32x4 CmpGeSIMD(const fltx4& a, const fltx4& b) // (a>=b) ? ~0:0 +{ + return (bi32x4)vec_cmpge(a, b); +} + + +FORCEINLINE bi32x4 CmpLtSIMD(const fltx4& a, const fltx4& b) // (a<b) ? ~0:0 +{ + return (bi32x4)vec_cmplt(a, b); +} + +FORCEINLINE bi32x4 CmpLeSIMD(const fltx4& a, const fltx4& b) // (a<=b) ? ~0:0 +{ + return (bi32x4)vec_cmple(a, b); +} + +FORCEINLINE bi32x4 CmpInBoundsSIMD(const fltx4& a, const fltx4& b) // (a <= b && a >= -b) ? ~0 : 0 +{ + i32x4 control; + control = vec_cmpb(a, b); + return (bi32x4)vec_cmpeq((u32x4)control, _VEC_ZERO); +} + +FORCEINLINE int CmpAnyLeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_le(a, b); +} + +FORCEINLINE int CmpAnyGeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_ge(a, b); +} + +FORCEINLINE int CmpAnyLtSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_lt(a, b); +} +FORCEINLINE int CmpAnyLtSIMD(const bi32x4& a, const i32x4& b) +{ + return vec_any_lt((i32x4)a, b); +} + +FORCEINLINE int CmpAnyGtSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_gt(a, b); +} + +FORCEINLINE int CmpAnyNeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_ne(a, b); +} +FORCEINLINE int CmpAnyNeSIMD(const bi32x4& a, const bi32x4& b) +{ + return vec_any_ne(a, b); +} +FORCEINLINE int CmpAnyNeSIMD(const bi32x4& a, const i32x4& b) +{ + return vec_any_ne(a, (bi32x4)b); +} + +FORCEINLINE int CmpAllLeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_all_le(a, b); +} + +FORCEINLINE fltx4 MaskedAssign(const bi32x4& ReplacementMask, const fltx4& NewValue, const fltx4& OldValue) +{ + return vec_sel(OldValue, NewValue, ReplacementMask); +} + +FORCEINLINE fltx4 MaskedAssign(const fltx4& ReplacementMask, const fltx4& NewValue, const fltx4& OldValue) +{ + return vec_sel(OldValue, NewValue, (const bi32x4)ReplacementMask); +} + +FORCEINLINE vector signed short MaskedAssign(const vector unsigned short& ReplacementMask, const vector signed short& NewValue, const vector signed short&
OldValue) +{ + return vec_sel(OldValue, NewValue, ReplacementMask); +} + +// AKA "Broadcast", "Splat" +FORCEINLINE fltx4 ReplicateX4(float flValue) // a,a,a,a +{ +#if SN_IMPROVED_INTRINSICS + return vec_splats(flValue); +#else + // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) + float* pValue = &flValue; + Assert(pValue); + Assert(((unsigned int)pValue & 3) == 0); + + fltx4 result; + + result = vec_ld(0, pValue); + result = vec_splat(vec_perm(result, result, vec_lvsl(0, pValue)), 0); + + return result; +#endif +} + +FORCEINLINE fltx4 ReplicateX4(const float* pValue) // a,a,a,a +{ +#if SN_IMPROVED_INTRINSICS + return vec_splats(*pValue); +#else + Assert(pValue); + fltx4 result; + + result = vec_ld(0, pValue); + result = vec_splat(vec_perm(result, result, vec_lvsl(0, pValue)), 0); + + return result; +#endif +} + +/// replicate a single 32 bit integer value to all 4 components of an m128 +FORCEINLINE i32x4 ReplicateIX4(int nValue) +{ +#if SN_IMPROVED_INTRINSICS + return vec_splats(nValue); +#else + // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!) 
+ int* pValue = &nValue; + Assert(pValue); + Assert(((unsigned int)pValue & 3) == 0); + i32x4 result; + + result = vec_ld(0, pValue); + result = vec_splat(vec_perm(result, result, vec_lvsl(0, pValue)), 0); + + return result; +#endif +} + +FORCEINLINE fltx4 SqrtSIMD(const fltx4& a) // sqrt(a) +{ + return sqrtf4(a); +} + +FORCEINLINE fltx4 SqrtEstSIMD(const fltx4& a) // sqrt(a), more or less +{ +#if defined( _PS3 ) && !defined( SPU ) + // This is exactly what the Xbox 360 does in XMVectorSqrtEst + fltx4 vRecipSquareRoot = vec_rsqrte(a); + i32x4 vOne = vec_splat_s32(1); + i32x4 vAllOnes = vec_splat_s32(-1); + i32x4 vShiftLeft24 = vec_splat_s32(-8); // -8 is the same bit pattern as 24 with a 5-bit mask + fltx4 vZero = (fltx4)vec_splat_s32(0); + u32x4 vInputShifted = vec_sl((u32x4)a, (u32x4)vOne); + u32x4 vInfinityShifted = vec_sl((u32x4)vAllOnes, (u32x4)vShiftLeft24); + bi32x4 vEqualsZero = vec_vcmpeqfp(a, vZero); + bi32x4 vEqualsInfinity = vec_vcmpequw(vInputShifted, vInfinityShifted); + fltx4 vSquareRoot = vec_madd(a, vRecipSquareRoot, _VEC_ZEROF); + bi32x4 vResultMask = vec_vcmpequw((u32x4)vEqualsInfinity, (u32x4)vEqualsZero); // mask has 1s wherever the square root is valid + fltx4 vCorrectedSquareRoot = vec_sel(a, vSquareRoot, vResultMask); + + return vCorrectedSquareRoot; +#else + return SqrtSIMD(a); +#endif +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSIMD(const fltx4& a) // 1/sqrt(a), more or less +{ + return vec_rsqrte(a); +} + +FORCEINLINE fltx4 ReciprocalSqrtSIMD(const fltx4& a) // 1/sqrt(a) +{ + // This matches standard library function rsqrtf4 + fltx4 result; + vmathV4RsqrtPerElem((VmathVector4*)&result, (const VmathVector4*)&a); + + return result; +} + +FORCEINLINE fltx4 ReciprocalEstSIMD(const fltx4& a) // 1/a, more or less +{ + return vec_re(a); +} + +/// 1/x for all 4 values, more or less +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalEstSaturateSIMD(const fltx4& a) +{ + bi32x4 zero_mask = CmpEqSIMD(a, Four_Zeros); + 
fltx4 ret = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask)); + ret = ReciprocalEstSIMD(ret); + return ret; +} + + +/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. +/// No error checking! +FORCEINLINE fltx4 ReciprocalSIMD(const fltx4& a) // 1/a +{ + // This matches standard library function recipf4 + fltx4 result; + vmathV4RecipPerElem((VmathVector4*)&result, (const VmathVector4*)&a); + + return result; +} + +FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b +{ + return MulSIMD(ReciprocalSIMD(b), a); +} + +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // Est(a/b) +{ + return MulSIMD(ReciprocalEstSIMD(b), a); +} + +/// 1/x for all 4 values. +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalSaturateSIMD(const fltx4& a) +{ + // Convert zeros to epsilons + bi32x4 zero_mask = CmpEqSIMD(a, _VEC_ZEROF); + fltx4 a_safe = OrSIMD(a, AndSIMD(_VEC_EPSILONF, zero_mask)); + return ReciprocalSIMD(a_safe); + + // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does) + // fltx4 zeroMask = CmpEqSIMD( gFour_Zeros, a ); + // fltx4 a_safe = XMVectorSelect( a, gFour_Epsilons, zeroMask ); + // return ReciprocalSIMD( a_safe ); +} + + +// CHRISG: is it worth doing integer bitfiddling for this? +// 2^x for all values (the antilog) +FORCEINLINE fltx4 ExpSIMD(const fltx4& toPower) +{ + return exp2f4(toPower); +} + +// a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point", +// which is accurate to four bits of mantissa. +FORCEINLINE fltx4 Exp2EstSIMD(const fltx4& f) +{ + return exp2f4fast(f); +} + + +// Clamps the components of a vector to a specified minimum and maximum range. 
+FORCEINLINE fltx4 ClampVectorSIMD(FLTX4 in, FLTX4 min, FLTX4 max) +{ + fltx4 result = vec_max(min, in); + return vec_min(max, result); +} + + +FORCEINLINE fltx4 LoadUnalignedSIMD(const void* pSIMD) +{ +#if SN_IMPROVED_INTRINSICS + + fltx4 v0, v1; + + Assert(pSIMD); + + + v0 = (fltx4)vec_lvlx(0, (float*)pSIMD); + v1 = (fltx4)vec_lvrx(16, (float*)pSIMD); + return vec_or(v0, v1); + +#else + + fltx4 v0, v1; + vector unsigned char permMask; + + Assert(pSIMD); + + v0 = vec_ld(0, pSIMD); + permMask = vec_lvsl(0, pSIMD); + v1 = vec_ld(15, pSIMD); + + return vec_perm(v0, v1, permMask); + +#endif +} + +FORCEINLINE fltx4 LoadUnsignedByte4SIMD(unsigned char* pBytes) // unpack contiguous 4 bytes into vec float 4 +{ + +#if SN_IMPROVED_INTRINSICS + + __vector unsigned char res_uc; + __vector unsigned short res_us; + + __vector unsigned char vZero8 = (__vector unsigned char)vec_splat_u8(0); + __vector unsigned short vZero16 = (__vector unsigned short)vec_splat_u16(0); + + res_uc = (__vector unsigned char)vec_lvlx(0, pBytes); + res_uc = vec_mergeh(vZero8, res_uc); + res_us = vec_mergeh(vZero16, (__vector unsigned short)res_uc); + return vec_ctf((__vector unsigned int)res_us, 0); + +#else + + vector unsigned char v0, v1; + vector bool char res_uc; + vector unsigned char permMask; + vector bool short res_us; + + vector bool char vZero8 = (vector bool char)vec_splat_u8(0); + vector bool short vZero16 = (vector bool short)vec_splat_u16(0); + + v0 = vec_ld(0, pBytes); + permMask = vec_lvsl(0, pBytes); + v1 = vec_ld(3, pBytes); + res_uc = (vector bool char)vec_perm(v0, v1, permMask); + res_uc = vec_mergeh(vZero8, res_uc); + res_us = vec_mergeh(vZero16, (vector bool short)res_uc); + return vec_ctf((vector unsigned int)res_us, 0); + +#endif + +} + +FORCEINLINE fltx4 LoadSignedByte4SIMD(signed char* pBytes) // unpack contiguous 4 bytes into vec float 4 +{ + +#if SN_IMPROVED_INTRINSICS + + vector signed char res_uc; + vector signed short res_us; + vector signed int res_ui; + + res_uc = 
(vector signed char)vec_lvlx(0, pBytes); + res_us = vec_unpackh(res_uc); + res_ui = vec_unpackh(res_us); + return vec_ctf(res_ui, 0); + +#else + + vector signed char v0, v1, res_uc; + vector unsigned char permMask; + vector signed short res_us; + vector signed int res_ui; + + v0 = vec_ld(0, pBytes); + permMask = vec_lvsl(0, pBytes); + v1 = vec_ld(3, pBytes); + res_uc = vec_perm(v0, v1, permMask); + res_us = vec_unpackh(res_uc); + res_ui = vec_unpackh(res_us); + return vec_ctf(res_ui, 0); + +#endif + +} + + +// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec). +FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) +{ + Assert(pSIMD); + + fltx4 v0 = vec_ld(0, (float*)(pSIMD)); + vector unsigned char permMask = vec_lvsl(0, (float*)(pSIMD)); + fltx4 v1 = vec_ld(11, (float*)(pSIMD)); + + return vec_perm(v0, v1, permMask); +} + + +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + fltx4 v0 = vec_lde(0, const_cast(pFlt)); + vector unsigned char permMask = vec_lvsl(0, const_cast(pFlt)); + return vec_perm(v0, v0, permMask); +} + + +FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) +{ + return vec_ld(0, (float*)pSIMD); +} + +#ifndef SPU +// No reason to support VectorAligned on SPU. 
+ +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned& pSIMD) +{ + fltx4 out; + out = vec_ld(0, pSIMD.Base()); + + // squelch w + return (fltx4)vec_and((u32x4)out, _VEC_CLEAR_WMASK); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned* RESTRICT pSIMD) +{ + fltx4 out; + out = vec_ld(0, pSIMD->Base()); + + // squelch w + return (fltx4)vec_and((u32x4)out, _VEC_CLEAR_WMASK); +} + + +// strongly typed -- for typechecking as we transition to SIMD +FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) +{ + vec_st(a, 0, pSIMD->Base()); +} +#endif + +FORCEINLINE void StoreAlignedSIMD(float* pSIMD, const fltx4& a) +{ + vec_st(a, 0, pSIMD); +} + +FORCEINLINE void StoreUnalignedSIMD(float* pSIMD, const fltx4& a) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + vec_stvlx(a, 0, pSIMD); + vec_stvrx(a, 16, pSIMD); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + pSIMD[0] = a_union.m128_f32[0]; + pSIMD[1] = a_union.m128_f32[1]; + pSIMD[2] = a_union.m128_f32[2]; + pSIMD[3] = a_union.m128_f32[3]; +#endif + +} + +FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) +{ + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + pSIMD[0] = a_union.m128_f32[0]; + pSIMD[1] = a_union.m128_f32[1]; + pSIMD[2] = a_union.m128_f32[2]; +}; + + + +#ifndef SPU +// No reason to support unaligned Vectors on SPU + + +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d); +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ +#if USING_POINTLESSLY_SLOW_SONY_CODE + return vmathV4MakeFromElems_V(x, y, z, w).vec128; +#else + // load the float 
into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = vec_lvlx(0, &x); + fltx4 vy = vec_lvlx(0, &y); + fltx4 vz = vec_lvlx(0, &z); + fltx4 vw = vec_lvlx(0, &w); + return Compress4SIMD(vx, vy, vz, vw); +#endif +} + + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. +FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (thhough obviously pDestination+1 is not). +FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} +#endif + +// Fixed-point conversion and save as SIGNED INTS. +// pDest->x = Int (vSrc.x) +// note: some architectures have means of doing +// fixed point conversion when the fix depth is +// specified as an immediate.. but there is no way +// to guarantee an immediate as a parameter to function +// like this. 
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4* RESTRICT pDest, const fltx4& vSrc) +{ + i32x4 asInt = vec_cts(vSrc, 0); + vec_st(asInt, 0, pDest->Base()); +} + +FORCEINLINE void TransposeSIMD(fltx4& x, fltx4& y, fltx4& z, fltx4& w) +{ + fltx4 p0, p1, p2, p3; + + p0 = vec_mergeh(x, z); + p1 = vec_mergeh(y, w); + p2 = vec_mergel(x, z); + p3 = vec_mergel(y, w); + + x = vec_mergeh(p0, p1); + y = vec_mergel(p0, p1); + z = vec_mergeh(p2, p3); + w = vec_mergel(p2, p3); +} + +// Return one in the fastest way -- faster even than loading. +FORCEINLINE fltx4 LoadZeroSIMD(void) +{ + return _VEC_ZEROF; +} +FORCEINLINE i32x4 LoadZeroISIMD(void) +{ + return vec_splat_s32(0); +} + + +// Return one in the fastest way -- faster even than loading. +FORCEINLINE fltx4 LoadOneSIMD(void) +{ + return _VEC_ONEF; +} +FORCEINLINE i32x4 LoadOneISIMD(void) +{ + return vec_splat_s32(1); +} + +FORCEINLINE fltx4 SplatXSIMD(fltx4 a) +{ + return vec_splat(a, 0); +} +FORCEINLINE fltx4 SplatYSIMD(fltx4 a) +{ + return vec_splat(a, 1); +} +FORCEINLINE fltx4 SplatZSIMD(fltx4 a) +{ + return vec_splat(a, 2); +} +FORCEINLINE fltx4 SplatWSIMD(fltx4 a) +{ + return vec_splat(a, 3); +} + +FORCEINLINE bi32x4 SplatXSIMD(bi32x4 a) +{ + return vec_splat(a, 0); +} +FORCEINLINE bi32x4 SplatYSIMD(bi32x4 a) +{ + return vec_splat(a, 1); +} +FORCEINLINE bi32x4 SplatZSIMD(bi32x4 a) +{ + return vec_splat(a, 2); +} +FORCEINLINE bi32x4 SplatWSIMD(bi32x4 a) +{ + return vec_splat(a, 3); +} + +FORCEINLINE fltx4 SetXSIMD(const fltx4& a, const fltx4& x) +{ + return vec_sel(a, x, _VEC_COMPONENT_MASK_0); +} + +FORCEINLINE fltx4 SetYSIMD(const fltx4& a, const fltx4& y) +{ + return vec_sel(a, y, _VEC_COMPONENT_MASK_1); +} + +FORCEINLINE fltx4 SetZSIMD(const fltx4& a, const fltx4& z) +{ + return vec_sel(a, z, _VEC_COMPONENT_MASK_2); +} + +FORCEINLINE fltx4 SetWSIMD(const fltx4& a, const fltx4& w) +{ + return vec_sel(a, w, _VEC_COMPONENT_MASK_3); +} + +FORCEINLINE fltx4 SetComponentSIMD(const fltx4& a, int nComponent, float flValue) 
+{ +#if SN_IMPROVED_INTRINSICS + return vec_insert(flValue, a, nComponent); +#else + fltx4_union a_union; + a_union.vmxf = vec_ld(0, &a); + a_union.m128_f32[nComponent] = flValue; + return a_union.vmxf; +#endif +} + +FORCEINLINE float GetComponentSIMD(const fltx4& a, int nComponent) +{ +#if SN_IMPROVED_INTRINSICS + return vec_extract(a, nComponent); +#else + fltx4_union a_union; + a_union.vmxf = vec_ld(0, &a); + return a_union.m128_f32[nComponent]; +#endif +} + + +FORCEINLINE fltx4 RotateLeft(const fltx4& a) +{ + return vec_sld(a, a, 4); +} + +FORCEINLINE fltx4 RotateLeft2(const fltx4& a) +{ + return vec_sld(a, a, 8); +} + +FORCEINLINE fltx4 RotateRight(const fltx4& a) +{ + return vec_sld(a, a, 12); +} + +FORCEINLINE fltx4 RotateRight2(const fltx4& a) +{ + return vec_sld(a, a, 8); +} + +// rotate a vector left by an arbitrary number of +// bits known at compile time. The bit parameter +// is template because it's actually used as an +// immediate field in an instruction, eg it absolutely +// must be known at compile time. nBits>127 leads +// to doom. +// zeroes are shifted in from the right +template < uint nBits, typename T > +FORCEINLINE T ShiftLeftByBits(const T& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! + { + return (T)LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + T t = vec_sld(a, ((T)LoadZeroSIMD()), (nBits >> 3)); // rotated left by octets + return ShiftLeftByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! 
+ vector unsigned int shifter = (vector unsigned int) (vec_splat_s8(((signed char)(nBits & 0x7)))); + return (T)vec_sll((vector signed int) a, shifter); + } +} + +// as above, but shift right +template < uint nBits, typename T > +FORCEINLINE T ShiftRightByBits(const T& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! + { + return (T)LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + T t = vec_sld(((T)LoadZeroSIMD()), a, 16 - (nBits >> 3)); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen. + return ShiftRightByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! + vector unsigned int shifter = (vector unsigned int) (vec_splat_s8(((signed char)(nBits & 0x7)))); + return (T)vec_srl((vector unsigned int) a, shifter); + } +} + + +/**** an example of ShiftLeftByBits: +fltx4 ShiftByTwentyOne( fltx4 foo ) +{ + return ShiftLeftByBits<21>(foo); +} + +compiles to: + + ShiftByTwentyOne(float __vector): + 0x000059FC: 0x1060038C vspltisw v3,0 PIPE + 0x00005A00: 0x1085030C vspltisb v4,5 + 0x00005A04: 0x104218AC vsldoi v2,v2,v3,2 02 (000059FC) REG PIPE + 0x00005A08: 0x104221C4 vsl v2,v2,v4 03 (00005A04) REG + 0x00005A0C: 0x4E800020 blr +*****/ + + + +// find the lowest component of a.x, a.y, a.z, +// and replicate it to the whole return value. +// ignores a.w. +// Forcing this inline should hopefully help with scheduling. +FORCEINLINE fltx4 FindLowestSIMD3(const fltx4& a) +{ + fltx4 result; + fltx4 x = vec_splat(a, 0); + fltx4 y = vec_splat(a, 1); + fltx4 z = vec_splat(a, 2); + + if (vec_any_nan(a)) + { + x = vec_all_nan(x) ? 
_VEC_FLTMAX : x; + y = vec_all_nan(y) ? _VEC_FLTMAX : y; + z = vec_all_nan(z) ? _VEC_FLTMAX : z; + } + + result = vec_min(y, x); + result = vec_min(z, result); + + return result; + +} + +// find the highest component of a.x, a.y, a.z, +// and replicate it to the whole return value. +// ignores a.w. +// Though this is only five instructions long, +// they are all dependent, making this stall city. +// Forcing this inline should hopefully help with scheduling. +FORCEINLINE fltx4 FindHighestSIMD3(const fltx4& a) +{ + fltx4 result; + fltx4 x = vec_splat(a, 0); + fltx4 y = vec_splat(a, 1); + fltx4 z = vec_splat(a, 2); + + if (vec_any_nan(a)) + { + x = vec_all_nan(x) ? _VEC_FLTMIN : x; + y = vec_all_nan(y) ? _VEC_FLTMIN : y; + z = vec_all_nan(z) ? _VEC_FLTMIN : z; + } + + result = vec_max(y, x); + result = vec_max(z, result); + + return result; +} + + +// ------------------------------------ +// INTEGER SIMD OPERATIONS. +// ------------------------------------ + +// Load 4 aligned words into a SIMD register +FORCEINLINE i32x4 LoadAlignedIntSIMD(const int32* RESTRICT pSIMD) +{ + return vec_ld(0, const_cast(pSIMD)); +} + +// Load 4 unaligned words into a SIMD register +FORCEINLINE i32x4 LoadUnalignedIntSIMD(const int32* RESTRICT pSIMD) +{ + i32x4 v0, v1; + vector unsigned char permMask; + + Assert(pSIMD); + + v0 = vec_ld(0, const_cast(pSIMD)); + permMask = vec_lvsl(0, const_cast(pSIMD)); + v1 = vec_ld(15, const_cast(pSIMD)); + + return vec_perm(v0, v1, permMask); + +} + +// save into four words, 16-byte aligned +FORCEINLINE void StoreAlignedIntSIMD(int32* pSIMD, const i32x4& a) +{ + vec_st(a, 0, pSIMD); +} + +FORCEINLINE void StoreAlignedIntSIMD(int32* pSIMD, const fltx4& a) +{ + vec_st((i32x4)a, 0, pSIMD); +} + +FORCEINLINE void StoreAlignedIntSIMD(intx4& pSIMD, const i32x4& a) +{ + vec_st(a, 0, pSIMD.Base()); +} + +FORCEINLINE void StoreUnalignedIntSIMD(int32* pSIMD, const i32x4& a) +{ +#if SN_IMPROVED_INTRINSICS + + // NOTE : NOT TESTED + vec_stvlx(a, 0, pSIMD); + 
vec_stvrx(a, 16, pSIMD); + +#else + + fltx4_union tmp; + vec_st(a, 0, &tmp.vmxi); + + pSIMD[0] = tmp.m128_u32[0]; + pSIMD[1] = tmp.m128_u32[1]; + pSIMD[2] = tmp.m128_u32[2]; + pSIMD[3] = tmp.m128_u32[3]; + +#endif +} + +// a={ a.x, a.z, b.x, b.z } +// combine two fltx4s by throwing away every other field. +FORCEINLINE fltx4 CompressSIMD(fltx4 const& a, fltx4 const& b) +{ + const int32 ALIGN16 n4shuffleACXZ[4] ALIGN16_POST = { 0x00010203, 0x08090A0B, 0x10111213, 0x18191A1B }; + return vec_perm(a, b, (vec_uchar16)LoadAlignedIntSIMD(n4shuffleACXZ)); +} + +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +// TODO: make more efficient by doing this in a parallel way at the caller +// Compress4SIMD(FourVectors.. ) +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 ab = vec_mergeh(a, b); // a.x, b.x, a.y, b.y + fltx4 cd = vec_mergeh(c, d); // c.x, d.x... + static const int32 ALIGN16 shuffleABXY[4] ALIGN16_POST = { 0x00010203, 0x04050607, 0x10111213, 0x14151617 }; + + return vec_perm(ab, cd, (vec_uchar16)LoadAlignedIntSIMD(shuffleABXY)); +} + + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const i32x4& vSrcA) +{ + return vec_ctf(vSrcA, 0); +} + + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4& vSrcA) +{ + return vec_ctf(vSrcA, 0); +} + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. Each uint +// will be divided by 2^immed after conversion +// (eg, this is fixed point math). 
+/* as if: +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) +{ +return vec_ctf(vSrcA,uImmed); +} +*/ +#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) )) + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. Each int +// will be divided by 2^immed (eg, this is fixed point +// math). +/* as if: +FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) +{ +return vec_ctf(vSrcA,uImmed); +} +*/ +#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) )) + +// set all components of a vector to a signed immediate int number. +/* as if: +FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate) +{ +return vec_splat_s32( toImmediate ); +} +*/ +#define IntSetImmediateSIMD(x) (vec_splat_s32(x)) + + +/* +works on fltx4's as if they are four uints. +the first parameter contains the words to be shifted, +the second contains the amount to shift by AS INTS + +for i = 0 to 3 +shift = vSrcB_i*32:(i*32)+4 +vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift +*/ +FORCEINLINE u32x4 IntShiftLeftWordSIMD(u32x4 vSrcA, u32x4 vSrcB) +{ + return vec_sl(vSrcA, vSrcB); +} + + +FORCEINLINE float SubFloat(const fltx4& a, int idx) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract(a, idx)); +#else // GCC 4.1.1 + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
+ fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + return a_union.m128_f32[idx]; +#endif // GCC 4.1.1 +} + +FORCEINLINE float& SubFloat(fltx4& a, int idx) +{ + fltx4_union& a_union = (fltx4_union&)a; + return a_union.m128_f32[idx]; +} + +FORCEINLINE uint32 SubInt(const u32x4& a, int idx) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract(a, idx)); +#else // GCC 4.1.1 + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxui); + return a_union.m128_u32[idx]; +#endif // GCC 4.1.1 +} + +FORCEINLINE uint32 SubInt(const fltx4& a, int idx) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract((u32x4)a, idx)); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + return a_union.m128_u32[idx]; +#endif +} + +FORCEINLINE uint32& SubInt(u32x4& a, int idx) +{ + fltx4_union& a_union = (fltx4_union&)a; + return a_union.m128_u32[idx]; +} + +FORCEINLINE uint32 SubFloatConvertToInt(const fltx4& a, int idx) +{ + +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract(vec_ctu(a, 0), idx)); +#else + u32x4 t = vec_ctu(a, 0); + return SubInt(t, idx); +#endif + +} + +// perform an Altivec permute op. There is no corresponding SSE op, so +// this function is missing from that fork. This is deliberate, because +// permute-based algorithms simply need to be abandoned and rebuilt +// differently way for SSE. +// (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm ) +template< typename T, typename U > +FORCEINLINE T PermuteVMX(T a, T b, U swizzleMask) +{ + return vec_perm(a, b, (vec_uchar16)swizzleMask); +} + + +// __fsel(double fComparand, double fValGE, double fLT) == fComparand >= 0 ? fValGE : fLT +// this is much faster than if ( aFloat > 0 ) { x = .. 
} +#if !defined(__SPU__) +#define fsel __fsel +#endif + +inline bool IsVector3LessThan(const fltx4& v1, const fltx4& v2) +{ + return vec_any_lt(v1, v2); +} + +inline bool IsVector3GreaterOrEqual(const fltx4& v1, const fltx4& v2) +{ + return !IsVector3LessThan(v1, v2); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD(const fltx4& a) +{ + fltx4 retVal; + SubFloat(retVal, 0) = 1.0 / sqrt(SubFloat(a, 0) != 0.0f ? SubFloat(a, 0) : FLT_EPSILON); + SubFloat(retVal, 1) = 1.0 / sqrt(SubFloat(a, 1) != 0.0f ? SubFloat(a, 1) : FLT_EPSILON); + SubFloat(retVal, 2) = 1.0 / sqrt(SubFloat(a, 2) != 0.0f ? SubFloat(a, 2) : FLT_EPSILON); + SubFloat(retVal, 3) = 1.0 / sqrt(SubFloat(a, 3) != 0.0f ? SubFloat(a, 3) : FLT_EPSILON); + return retVal; +} + +// Round towards negative infinity +FORCEINLINE fltx4 FloorSIMD(const fltx4& a) +{ + fltx4 retVal; + SubFloat(retVal, 0) = floor(SubFloat(a, 0)); + SubFloat(retVal, 1) = floor(SubFloat(a, 1)); + SubFloat(retVal, 2) = floor(SubFloat(a, 2)); + SubFloat(retVal, 3) = floor(SubFloat(a, 3)); + return retVal; +} + #elif ( defined( _X360 ) ) //--------------------------------------------------------------------- // X360 implementation //--------------------------------------------------------------------- +inline bool IsVector3LessThan(const fltx4& v1, const fltx4& v2) +{ + return !XMVector3GreaterOrEqual(v1, v2); +} + +inline BOOL IsVector3GreaterOrEqual(const fltx4& v1, const fltx4& v2) +{ + return XMVector3GreaterOrEqual(v1, v2); +} + + FORCEINLINE float& FloatSIMD(fltx4& a, int idx) { fltx4_union& a_union = (fltx4_union&)a; @@ -1142,6 +2711,22 @@ FORCEINLINE bool IsAllGreaterThanOrEq(const fltx4& a, const fltx4& b) return XMComparisonAllTrue(cr); } +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAnyGreaterThan(const fltx4& a, const fltx4& b) +{ + unsigned int cr; + XMVectorGreaterR(&cr, a, b); + return XMComparisonAnyTrue(cr); +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAnyGreaterThanOrEq(const fltx4& a, 
const fltx4& b) +{ + unsigned int cr; + XMVectorGreaterOrEqualR(&cr, a, b); + return XMComparisonAnyTrue(cr); +} + // For branching if all a.xyzw == b.xyzw FORCEINLINE bool IsAllEqual(const fltx4& a, const fltx4& b) { @@ -1183,6 +2768,13 @@ FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) | return !XMComparisonAllTrue(equalFlags); } +FORCEINLINE bool IsAnyTrue(const fltx4& a) +{ + unsigned int equalFlags = 0; + __vcmpequwR(Four_Zeros, a, &equalFlags); // compare to zero + return XMComparisonAnyFalse(equalFlags); // at least one element was not zero, eg was true +} + FORCEINLINE fltx4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? ~0:0 { return __vcmpeqfp(a, b); @@ -1220,6 +2812,18 @@ FORCEINLINE fltx4 MaskedAssign(const fltx4& ReplacementMask, const fltx4& NewVal return __vsel(OldValue, NewValue, ReplacementMask); } + +// perform an Altivec permute op. There is no corresponding SSE op, so +// this function is missing from that fork. This is deliberate, because +// permute-based algorithms simply need to be abandoned and rebuilt +// differently way for SSE. +// (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm ) +template< typename T, typename U > +FORCEINLINE T PermuteVMX(T a, T b, U swizzleMask) +{ + return __vperm(a, b, swizzleMask); +} + // AKA "Broadcast", "Splat" FORCEINLINE fltx4 ReplicateX4(float flValue) // a,a,a,a { @@ -1308,12 +2912,16 @@ FORCEINLINE fltx4 ReciprocalSIMD(const fltx4& a) // 1/a return XMVectorReciprocal(a); } -// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?) FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b { return MulSIMD(ReciprocalSIMD(b), a); } +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // Est(a/b) +{ + return MulSIMD(ReciprocalEstSIMD(b), a); +} + /// 1/x for all 4 values. 
/// 1/0 will result in a big but NOT infinite result FORCEINLINE fltx4 ReciprocalEstSaturateSIMD(const fltx4& a) @@ -1344,6 +2952,13 @@ FORCEINLINE fltx4 ExpSIMD(const fltx4& toPower) return XMVectorExp(toPower); } +// a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point", +// which is accurate to four bits of mantissa. +FORCEINLINE fltx4 Exp2EstSIMD(const fltx4& f) +{ + return XMVectorExpEst(f); +} + // Clamps the components of a vector to a specified minimum and maximum range. FORCEINLINE fltx4 ClampVectorSIMD(FLTX4 in, FLTX4 min, FLTX4 max) { @@ -1361,6 +2976,12 @@ FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) return XMLoadVector3(pSIMD); } +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + return __lvlx(pFlt, 0); +} + FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) { return *(reinterpret_cast (pSIMD)); @@ -1397,13 +3018,60 @@ FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) XMStoreVector3(pSIMD, a); } - // strongly typed -- for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) { XMStoreVector3A(pSIMD->Base(), a); } +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. +FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as + // we arrange the data properly first. + // The vrlimi ops trash the destination param which is why we require + // pass-by-copy. I'm counting on the compiler to schedule these properly. 
+ b = __vrlimi(b, b, 15, 1); // b = y1z1__x1 + c = __vrlimi(c, c, 15, 2); // c = z2__x2y2 + + a = __vrlimi(a, b, 1, 0); // a = x0y0z0x1 + b = __vrlimi(b, c, 2 | 1, 0); // b = y1z1x2y2 + c = __vrlimi(c, d, 4 | 2 | 1, 3); // c = z2x3y3z3 + + float* RESTRICT pOut = pDestination->Base(); + StoreUnalignedSIMD(pOut + 0, a); + StoreUnalignedSIMD(pOut + 4, b); + StoreUnalignedSIMD(pOut + 8, c); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (thhough obviously pDestination+1 is not). +FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as + // we arrange the data properly first. + // The vrlimi ops trash the destination param which is why we require + // pass-by-copy. I'm counting on the compiler to schedule these properly. + b = __vrlimi(b, b, 15, 1); // b = y1z1__x1 + c = __vrlimi(c, c, 15, 2); // c = z2__x2y2 + + a = __vrlimi(a, b, 1, 0); // a = x0y0z0x1 + b = __vrlimi(b, c, 2 | 1, 0); // b = y1z1x2y2 + c = __vrlimi(c, d, 4 | 2 | 1, 3); // c = z2x3y3z3 + + float* RESTRICT pOut = pDestination->Base(); + StoreAlignedSIMD(pOut + 0, a); + StoreAlignedSIMD(pOut + 4, b); + StoreAlignedSIMD(pOut + 8, c); +} // Fixed-point conversion and save as SIGNED INTS. 
// pDest->x = Int (vSrc.x) @@ -1504,7 +3172,78 @@ FORCEINLINE fltx4 RotateLeft2(const fltx4& a) return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 2); } +FORCEINLINE fltx4 RotateRight(const fltx4& a) +{ + fltx4 compareOne = a; + return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 3); +} +FORCEINLINE fltx4 RotateRight2(const fltx4& a) +{ + fltx4 compareOne = a; + return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 2); +} + + +// rotate a vector left by an arbitrary number of +// bits known at compile time. The bit parameter +// is template because it's actually used as an +// immediate field in an instruction, eg it absolutely +// must be known at compile time. nBits>127 leads +// to doom. +// zeroes are shifted in from the right +template < uint nBits > +FORCEINLINE fltx4 ShiftLeftByBits(const fltx4& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! + { + return LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + fltx4 t = __vsldoi(a, (LoadZeroSIMD()), (nBits >> 3)); // rotated left by octets + return ShiftLeftByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! + u32x4 shifter = u32x4(__vspltisb(((signed char)(nBits & 0x7)))); + return __vsl(a, shifter); + } +} + +// as above, but shift right +template < uint nBits > +FORCEINLINE fltx4 ShiftRightByBits(const fltx4& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! 
+ { + return LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + fltx4 t = __vsldoi((LoadZeroSIMD()), a, 16 - (nBits >> 3)); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen. + return ShiftRightByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! + u32x4 shifter = u32x4(__vspltisb(((signed char)(nBits & 0x7)))); + return __vsr(a, shifter); + } +} // find the lowest component of a.x, a.y, a.z, // and replicate it to the whole return value. @@ -1616,6 +3355,45 @@ FORCEINLINE void StoreUnalignedIntSIMD(int32* pSIMD, const fltx4& a) XMStoreVector4(pSIMD, a); } +// Load four consecutive uint16's, and turn them into floating point numbers. +// This function isn't especially fast and could be made faster if anyone is +// using it heavily. +FORCEINLINE fltx4 LoadAndConvertUint16SIMD(const uint16* pInts) +{ + return XMLoadUShort4(reinterpret_cast(pInts)); +} + +// a={ a.x, a.z, b.x, b.z } +// combine two fltx4s by throwing away every other field. +FORCEINLINE fltx4 CompressSIMD(fltx4 const& a, fltx4 const& b) +{ + return XMVectorPermute(a, b, XMVectorPermuteControl(0, 2, 4, 6)); +} + +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +// TODO: make more efficient by doing this in a parallel way at the caller +// Compress4SIMD(FourVectors.. 
) +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 abcd = __vrlimi(a, b, 4, 3); // a.x, b.x, a.z, a.w + abcd = __vrlimi(abcd, c, 2, 2); // ax, bx, cx, aw + abcd = __vrlimi(abcd, d, 1, 1); // ax, bx, cx, dx + + return abcd; +} + + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + // load the float into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = __lvlx(&x, 0); + fltx4 vy = __lvlx(&y, 0); + fltx4 vz = __lvlx(&z, 0); + fltx4 vw = __lvlx(&w, 0); + return Compress4SIMD(vx, vy, vz, vw); +} // Take a fltx4 containing fixed-point uints and // return them as single precision floats. No @@ -1625,7 +3403,6 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const i32x4& vSrcA) return __vcfux(vSrcA, 0); } - // Take a fltx4 containing fixed-point sints and // return them as single precision floats. No // fixed point conversion is done. 
@@ -1725,11 +3502,25 @@ FORCEINLINE void StoreAlignedSIMD(float* RESTRICT pSIMD, const fltx4& a) _mm_store_ps(pSIMD, a); } +FORCEINLINE void StoreAlignedSIMD(short* RESTRICT pSIMD, const shortx8& a) +{ + _mm_store_si128((shortx8*)pSIMD, a); +} FORCEINLINE void StoreUnalignedSIMD(float* RESTRICT pSIMD, const fltx4& a) { _mm_storeu_ps(pSIMD, a); } +FORCEINLINE void StoreUnalignedSIMD(short* RESTRICT pSIMD, const shortx8& a) +{ + _mm_storeu_si128((shortx8*)pSIMD, a); +} + +FORCEINLINE void StoreUnalignedFloat(float* pSingleFloat, const fltx4& a) +{ + _mm_store_ss(pSingleFloat, a); +} + FORCEINLINE fltx4 RotateLeft(const fltx4& a); FORCEINLINE fltx4 RotateLeft2(const fltx4& a); @@ -1741,23 +3532,61 @@ FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) _mm_store_ss(pSIMD + 2, RotateLeft2(a)); } + // strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) { StoreAlignedSIMD(pSIMD->Base(), a); } +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. +FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector3D* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (thhough obviously pDestination+1 is not). 
+FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector3D* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) { return _mm_load_ps(reinterpret_cast (pSIMD)); } +FORCEINLINE shortx8 LoadAlignedShortSIMD(const void* pSIMD) +{ + return _mm_load_si128(reinterpret_cast (pSIMD)); +} + +FORCEINLINE shortx8 LoadUnalignedShortSIMD(const void* pSIMD) +{ + return _mm_loadu_si128(reinterpret_cast (pSIMD)); +} + FORCEINLINE fltx4 AndSIMD(const fltx4& a, const fltx4& b) // a & b { return _mm_and_ps(a, b); } -FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const fltx4& b) // ~a & b +FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const fltx4& b) // a & ~b { return _mm_andnot_ps(a, b); } @@ -1795,6 +3624,12 @@ FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) return _mm_loadu_ps(reinterpret_cast(pSIMD)); } +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + return _mm_load_ss(pFlt); +} + /// replicate a single 32 bit integer value to all 4 components of an m128 FORCEINLINE fltx4 ReplicateIX4(int i) { @@ -1809,6 +3644,11 @@ FORCEINLINE fltx4 ReplicateX4(float flValue) return _mm_shuffle_ps(value, value, 0); } +FORCEINLINE fltx4 ReplicateX4(const float* flValue) +{ + __m128 value = _mm_set_ss(*flValue); + return _mm_shuffle_ps(value, value, 0); +} FORCEINLINE float SubFloat(const fltx4& a, int idx) { @@ -1893,9 +3733,27 @@ FORCEINLINE fltx4 SplatZSIMD(fltx4 const& a) FORCEINLINE fltx4 SplatWSIMD(fltx4 const& a) { - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)); + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(3, 3, 3, 3)); } +FORCEINLINE fltx4 ShuffleXXYY(const fltx4& a) +{ + return 
_mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 0, 1, 1)); +} + +FORCEINLINE fltx4 ShuffleXYXY(const fltx4& a) +{ + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 1, 0, 1)); +} + +FORCEINLINE fltx4 ShuffleZZWW(const fltx4& a) +{ + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 2, 3, 3)); +} + + + + FORCEINLINE fltx4 SetXSIMD(const fltx4& a, const fltx4& x) { fltx4 result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[0]), x, a); @@ -1942,20 +3800,19 @@ FORCEINLINE fltx4 RotateLeft2(const fltx4& a) // a b c d -> d a b c FORCEINLINE fltx4 RotateRight(const fltx4& a) { - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(3, 0, 1, 2)); } // a b c d -> c d a b FORCEINLINE fltx4 RotateRight2(const fltx4& a) { - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 3, 0, 1)); } - FORCEINLINE fltx4 AddSIMD(const fltx4& a, const fltx4& b) // a+b { return _mm_add_ps(a, b); -}; +} FORCEINLINE fltx4 SubSIMD(const fltx4& a, const fltx4& b) // a-b { @@ -1972,6 +3829,12 @@ FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b return _mm_div_ps(a, b); }; +fltx4 ReciprocalEstSIMD(const fltx4& a); +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // Est(a/b) +{ + return MulSIMD(ReciprocalEstSIMD(b), a); +}; + FORCEINLINE fltx4 MaddSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // a*b + c { return AddSIMD(MulSIMD(a, b), c); @@ -1985,15 +3848,17 @@ FORCEINLINE fltx4 MsubSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // FORCEINLINE fltx4 Dot3SIMD(const fltx4& a, const fltx4& b) { fltx4 m = MulSIMD(a, b); - float flDot = SubFloat(m, 0) + SubFloat(m, 1) + SubFloat(m, 2); - return ReplicateX4(flDot); + return AddSIMD(AddSIMD(SplatXSIMD(m), SplatYSIMD(m)), SplatZSIMD(m)); } FORCEINLINE fltx4 Dot4SIMD(const fltx4& a, const fltx4& b) { - fltx4 m = MulSIMD(a, b); - float flDot = SubFloat(m, 0) + SubFloat(m, 1) + SubFloat(m, 2) + SubFloat(m, 3); - return 
ReplicateX4(flDot); + // 4 instructions, serial, order of addition varies so individual elements my differ in the LSB on some CPUs + fltx4 fl4Product = MulSIMD(a, b); + fltx4 fl4YXWZ = _mm_shuffle_ps(fl4Product, fl4Product, MM_SHUFFLE_REV(1, 0, 3, 2)); + fltx4 fl4UUVV = AddSIMD(fl4Product, fl4YXWZ); // U = X+Y; V = Z+W + fltx4 fl4VVUU = RotateLeft2(fl4UUVV); + return AddSIMD(fl4UUVV, fl4VVUU); } //TODO: implement as four-way Taylor series (see xbox implementation) @@ -2072,6 +3937,11 @@ FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) | return (0 != TestSignSIMD(a)); } +FORCEINLINE bool IsAnyTrue(const fltx4& a) +{ + return (0 != TestSignSIMD(a)); +} + FORCEINLINE fltx4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? ~0:0 { return _mm_cmpeq_ps(a, b); @@ -2151,7 +4021,9 @@ FORCEINLINE fltx4 CeilSIMD(const fltx4& a) } +fltx4 AbsSIMD(const fltx4& x); // To make it more coherent with the whole API (the whole SIMD API is postfixed with SIMD except a couple of methods. Well...) fltx4 fabs(const fltx4& x); + // Round towards negative infinity // This is the implementation that was here before; it assumes // you are in round-to-floor mode, which I guess is usually the @@ -2167,6 +4039,11 @@ FORCEINLINE fltx4 FloorSIMD(const fltx4& val) +FORCEINLINE bool IsAnyZeros(const fltx4& a) // any floats are zero? +{ + return TestSignSIMD(CmpEqSIMD(a, Four_Zeros)) != 0; +} + inline bool IsAllZeros(const fltx4& var) { return TestSignSIMD(CmpEqSIMD(var, Four_Zeros)) == 0xF; @@ -2298,6 +4175,20 @@ FORCEINLINE fltx4 FindHighestSIMD3(const fltx4& a) } + +inline bool IsVector3LessThan(const fltx4& v1, const fltx4& v2) +{ + bi32x4 isOut = CmpLtSIMD(v1, v2); + return IsAnyNegative(isOut); +} + +inline bool IsVector4LessThan(const fltx4& v1, const fltx4& v2) +{ + bi32x4 isOut = CmpLtSIMD(v1, v2); + return IsAnyNegative(isOut); +} + + // ------------------------------------ // INTEGER SIMD OPERATIONS. 
// ------------------------------------ @@ -2345,6 +4236,61 @@ FORCEINLINE void StoreUnalignedIntSIMD(int32* RESTRICT pSIMD, const fltx4& a) _mm_storeu_ps(reinterpret_cast(pSIMD), a); } +// a={ a.x, a.z, b.x, b.z } +// combine two fltx4s by throwing away every other field. +FORCEINLINE fltx4 CompressSIMD(fltx4 const& a, fltx4 const& b) +{ + return _mm_shuffle_ps(a, b, MM_SHUFFLE_REV(0, 2, 0, 2)); +} + +// Load four consecutive uint16's, and turn them into floating point numbers. +// This function isn't especially fast and could be made faster if anyone is +// using it heavily. +FORCEINLINE fltx4 LoadAndConvertUint16SIMD(const uint16* pInts) +{ +#ifdef POSIX + fltx4 retval; + SubFloat(retval, 0) = pInts[0]; + SubFloat(retval, 1) = pInts[1]; + SubFloat(retval, 2) = pInts[2]; + SubFloat(retval, 3) = pInts[3]; + return retval; +#else + __m128i inA = _mm_loadl_epi64((__m128i const*) pInts); // Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result. 
+ inA = _mm_unpacklo_epi16(inA, _mm_setzero_si128()); // unpack unsigned 16's to signed 32's + return _mm_cvtepi32_ps(inA); +#endif +} + + +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 aacc = _mm_shuffle_ps(a, c, MM_SHUFFLE_REV(0, 0, 0, 0)); + fltx4 bbdd = _mm_shuffle_ps(b, d, MM_SHUFFLE_REV(0, 0, 0, 0)); + return MaskedAssign(LoadAlignedSIMD(g_SIMD_EveryOtherMask), bbdd, aacc); +} + +// outa={a.x, a.x, a.y, a.y}, outb = a.z, a.z, a.w, a.w } +FORCEINLINE void ExpandSIMD(fltx4 const& a, fltx4& fl4OutA, fltx4& fl4OutB) +{ + fl4OutA = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 0, 1, 1)); + fl4OutB = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 2, 3, 3)); + +} + + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + // load the float into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = _mm_load_ss(&x); + fltx4 vy = _mm_load_ss(&y); + fltx4 vz = _mm_load_ss(&z); + fltx4 vw = _mm_load_ss(&w); + return Compress4SIMD(vx, vy, vz, vw); +} // CHRISG: the conversion functions all seem to operate on m64's only... // how do we make them work here? @@ -2362,7 +4308,20 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const u32x4& vSrcA) return retval; } +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4& vSrcA) +{ + return _mm_cvtepi32_ps((const __m128i&)vSrcA); +} +FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const shortx8& vSrcA) +{ + return _mm_cvtepi32_ps(vSrcA); +} + +#if 0 // Take a fltx4 containing fixed-point sints and // return them as single precision floats. No // fixed point conversion is done. 
@@ -2376,6 +4335,8 @@ FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4& vSrcA) return retval; } +#endif + /* works on fltx4's as if they are four uints. the first parameter contains the words to be shifted, @@ -2407,13 +4368,11 @@ FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4& vSrcA, const i32x4& vSrcB) // like this. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4* RESTRICT pDest, const fltx4& vSrc) { -#if defined( COMPILER_MSVC64 ) - - (*pDest)[0] = SubFloat(vSrc, 0); - (*pDest)[1] = SubFloat(vSrc, 1); - (*pDest)[2] = SubFloat(vSrc, 2); - (*pDest)[3] = SubFloat(vSrc, 3); - +#if defined(_MSC_VER) && _MSC_VER >= 1900 && defined(COMPILER_MSVC64) + (*pDest)[0] = (int)SubFloat(vSrc, 0); + (*pDest)[1] = (int)SubFloat(vSrc, 1); + (*pDest)[2] = (int)SubFloat(vSrc, 2); + (*pDest)[3] = (int)SubFloat(vSrc, 3); #else __m64 bottom = _mm_cvttps_pi32(vSrc); __m64 top = _mm_cvttps_pi32(_mm_movehl_ps(vSrc, vSrc)); @@ -2429,8 +4388,179 @@ FORCEINLINE void ConvertStoreAsIntsSIMD(intx4* RESTRICT pDest, const fltx4& vSrc #endif +// a={a.y, a.z, a.w, b.x } b={b.y, b.z, b.w, b.x } +FORCEINLINE void RotateLeftDoubleSIMD(fltx4& a, fltx4& b) +{ + a = SetWSIMD(RotateLeft(a), SplatXSIMD(b)); + b = RotateLeft(b); +} +// // Some convenience operator overloads, which are just aliasing the functions above. +// Unneccessary on 360, as you already have them from xboxmath.h (same for PS3 PPU and SPU) +#if !defined(PLATFORM_PPC) && !defined( POSIX ) && !defined(SPU) +#if 1 // TODO: verify generation of non-bad code. +// Componentwise add +FORCEINLINE fltx4 operator+(FLTX4 a, FLTX4 b) +{ + return AddSIMD(a, b); +} + +// Componentwise subtract +FORCEINLINE fltx4 operator-(FLTX4 a, FLTX4 b) +{ + return SubSIMD(a, b); +} + +// Componentwise multiply +FORCEINLINE fltx4 operator*(FLTX4 a, FLTX4 b) +{ + return MulSIMD(a, b); +} + +// No divide. You need to think carefully about whether you want a reciprocal +// or a reciprocal estimate. 
+ +// bitwise and +FORCEINLINE fltx4 operator&(FLTX4 a, FLTX4 b) +{ + return AndSIMD(a, b); +} + +// bitwise or +FORCEINLINE fltx4 operator|(FLTX4 a, FLTX4 b) +{ + return OrSIMD(a, b); +} + +// bitwise xor +FORCEINLINE fltx4 operator^(FLTX4 a, FLTX4 b) +{ + return XorSIMD(a, b); +} + +// unary negate +FORCEINLINE fltx4 operator-(FLTX4 a) +{ + return NegSIMD(a); +} +#endif // 0 +#endif + +#if defined(_X360) || defined(_PS3) +FORCEINLINE fltx4 VectorMergeHighSIMD(fltx4 fl4SrcA, fltx4 fl4SrcB) +{ +#if defined( _X360 ) + return __vmrghw(fl4SrcA, fl4SrcB); +#else + return vec_mergeh(fl4SrcA, fl4SrcB); +#endif +} + +FORCEINLINE fltx4 VectorMergeLowSIMD(fltx4 fl4SrcA, fltx4 fl4SrcB) +{ +#if defined( _X360 ) + return __vmrglw(fl4SrcA, fl4SrcB); +#else + return vec_mergel(fl4SrcA, fl4SrcB); +#endif +} +#endif + +#ifndef SPU +// fourplanes_t, Frustrum_t are not supported on SPU +// It would make sense to support FourVectors on SPU at some point. + +struct ALIGN16 fourplanes_t +{ + fltx4 nX; + fltx4 nY; + fltx4 nZ; + fltx4 dist; + bi32x4 xSign; + bi32x4 ySign; + bi32x4 zSign; + fltx4 nXAbs; + fltx4 nYAbs; + fltx4 nZAbs; + + void ComputeSignbits(); + + // fast SIMD loads + void Set4Planes(const VPlane* pPlanes); + void Set2Planes(const VPlane* pPlanes); + void Get4Planes(VPlane* pPlanesOut) const; + void Get2Planes(VPlane* pPlanesOut) const; + // not-SIMD, much slower + void GetPlane(int index, Vector3D* pNormal, float* pDist) const; + void SetPlane(int index, const Vector3D& vecNormal, float planeDist); +}; + +class ALIGN16 Frustum_t +{ +public: + Frustum_t(); + void SetPlane(int i, const Vector3D& vecNormal, float dist); + void GetPlane(int i, Vector3D* pNormalOut, float* pDistOut) const; + void SetPlanes(const VPlane* pPlanes); + void GetPlanes(VPlane* pPlanesOut) const; + // returns false if the box is within the frustum, true if it is outside + bool CullBox(const Vector3D& mins, const Vector3D& maxs) const; + bool CullBoxCenterExtents(const Vector3D& center, const 
Vector3D& extents) const; + + bool CullBox(const fltx4& fl4Mins, const fltx4& fl4Maxs) const; + bool CullBoxCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const; + + + // Return true if frustum contains this bounding volume, false if any corner is outside + bool Contains(const Vector3D& mins, const Vector3D& maxs) const; + + // Return true if this frustum intersects the frustum, false if it is outside + bool Intersects(Frustum_t& otherFrustum) const; + + // Return true if this bounding volume intersects the frustum, false if it is outside + bool Intersects(const Vector3D& mins, const Vector3D& maxs) const; + bool IntersectsCenterExtents(const Vector3D& center, const Vector3D& extents) const; + + bool Intersects(const fltx4& fl4Mins, const fltx4& fl4Maxs) const; + bool IntersectsCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const; + + + void CreatePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, + const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, + float flFovX, float flAspect); + + void CreatePerspectiveFrustumFLU(const Vector3D& vOrigin, const Vector3D& vForward, + const Vector3D& vLeft, const Vector3D& vUp, float flZNear, float flZFar, + float flFovX, float flAspect); + + // Version that accepts angles instead of vectors + void CreatePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, + float flZFar, float flFovX, float flAspectRatio); + + // Generate a frustum based on orthographic parameters + void CreateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, + float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar); + + void CreateOrthoFrustumFLU(const Vector3D& vOrigin, const Vector3D& vForward, const Vector3D& vLeft, const Vector3D& vUp, + float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar); + + // The points returned correspond to the corners 
of the frustum faces + // Points 0 to 3 correspond to the near face + // Points 4 to 7 correspond to the far face + // Returns points in a face in this order: + // 2--3 + // | | + // 0--1 + // Returns false if a corner couldn't be generated for some reason. + bool GetCorners(Vector3D* pPoints) const; + + fourplanes_t planes[2]; +}; + +#endif + +class FourQuaternions; /// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are /// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. class ALIGN16 FourVectors @@ -2438,6 +4568,76 @@ class ALIGN16 FourVectors public: fltx4 x, y, z; + FourVectors(void) + { + } + + FourVectors(FourVectors const& src) + { + x = src.x; + y = src.y; + z = src.z; + } + + explicit FORCEINLINE FourVectors(float a) + { + fltx4 aReplicated = ReplicateX4(a); + x = y = z = aReplicated; + } + + FORCEINLINE void Init(void) + { + x = Four_Zeros; + y = Four_Zeros; + z = Four_Zeros; + } + + FORCEINLINE void Init(float flX, float flY, float flZ) + { + x = ReplicateX4(flX); + y = ReplicateX4(flY); + z = ReplicateX4(flZ); + } + + FORCEINLINE FourVectors(float flX, float flY, float flZ) + { + Init(flX, flY, flZ); + } + + FORCEINLINE void Init(fltx4 const& fl4X, fltx4 const& fl4Y, fltx4 const& fl4Z) + { + x = fl4X; + y = fl4Y; + z = fl4Z; + } + + FORCEINLINE FourVectors(fltx4 const& fl4X, fltx4 const& fl4Y, fltx4 const& fl4Z) + { + Init(fl4X, fl4Y, fl4Z); + } + + + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(Vector3D const& a, Vector3D const& b, Vector3D const& c, Vector3D const& d) + { + LoadAndSwizzle(a, b, c, d); + } + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(VectorAligned const& a, VectorAligned const& b, VectorAligned const& c, VectorAligned const& d) + { + LoadAndSwizzleAligned(a, b, c, d); + } + + // construct from twelve floats; really only useful for static const constructors. 
+ // input arrays must be aligned, and in the fourvectors' native format + // (eg in xxxx,yyyy,zzzz form) + // each pointer should be to an aligned array of four floats + FORCEINLINE FourVectors(const float* xs, const float* ys, const float* zs) : + x(LoadAlignedSIMD(xs)), y(LoadAlignedSIMD(ys)), z(LoadAlignedSIMD(zs)) + {}; + FORCEINLINE void DuplicateVector(Vector3D const& v) //< set all 4 vectors to the same vector value { x = ReplicateX4(v.x); @@ -2505,6 +4705,25 @@ public: return dot; } + FORCEINLINE FourVectors operator*(float b) const //< scale + { + fltx4 scalepacked = ReplicateX4(b); + FourVectors res; + res.x = MulSIMD(x, scalepacked); + res.y = MulSIMD(y, scalepacked); + res.z = MulSIMD(z, scalepacked); + return res; + } + + FORCEINLINE FourVectors operator*(FLTX4 fl4Scale) const //< scale + { + FourVectors res; + res.x = MulSIMD(x, fl4Scale); + res.y = MulSIMD(y, fl4Scale); + res.z = MulSIMD(z, fl4Scale); + return res; + } + FORCEINLINE void VProduct(FourVectors const& b) //< component by component mul { x = MulSIMD(x, b.x); @@ -2529,12 +4748,18 @@ public: // If you have a long list of FourVectors structures that you all want // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. inline void RotateBy(const matrix3x4_t& matrix); + /***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function: + // rotate these vectors ( in place ) by the corresponding quaternions: + inline void RotateBy( const FourQuaternions &quats ); + ******/ /// You can use this to rotate a long array of FourVectors all by the same /// matrix. The first parameter is the head of the array. The second is the /// number of vectors to rotate. The third is the matrix. 
static void RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix); + static void RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut); + /// Assume the vectors are points, and transform them in place by the matrix. inline void TransformBy(const matrix3x4_t& matrix); @@ -2552,6 +4777,9 @@ public: /// This is an in-place transformation. static void TransformManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix); + static void CalcClosestPointOnLineSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vClosest, fltx4* outT = 0); + static fltx4 CalcClosestPointToLineTSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vDir); + // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector. FORCEINLINE const float& X(int idx) const { @@ -2589,17 +4817,6 @@ public: return Vector3D(X(idx), Y(idx), Z(idx)); } - FourVectors(void) - { - } - - FourVectors(FourVectors const& src) - { - x = src.x; - y = src.y; - z = src.z; - } - FORCEINLINE void operator=(FourVectors const& src) { x = src.x; @@ -2612,19 +4829,19 @@ public: { // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360 // use an unfolded implementation here -#if _X360 +#if defined( _X360 ) || defined(_PS3) fltx4 tx = LoadUnalignedSIMD(&a.x); fltx4 ty = LoadUnalignedSIMD(&b.x); fltx4 tz = LoadUnalignedSIMD(&c.x); fltx4 tw = LoadUnalignedSIMD(&d.x); - fltx4 r0 = __vmrghw(tx, tz); - fltx4 r1 = __vmrghw(ty, tw); - fltx4 r2 = __vmrglw(tx, tz); - fltx4 r3 = __vmrglw(ty, tw); + fltx4 r0 = VectorMergeHighSIMD(tx, tz); + fltx4 r1 = VectorMergeHighSIMD(ty, tw); + fltx4 r2 = VectorMergeLowSIMD(tx, tz); + fltx4 r3 = VectorMergeLowSIMD(ty, tw); - x = __vmrghw(r0, r1); - y = __vmrglw(r0, r1); - z = __vmrghw(r2, r3); + x = 
VectorMergeHighSIMD(r0, r1); + y = VectorMergeLowSIMD(r0, r1); + z = VectorMergeHighSIMD(r2, r3); #else x = LoadUnalignedSIMD(&(a.x)); y = LoadUnalignedSIMD(&(b.x)); @@ -2639,23 +4856,87 @@ public: #endif } + FORCEINLINE void LoadAndSwizzle(Vector3D const& a) + { + LoadAndSwizzle(a, a, a, a); + } + + // Broadcasts a, b, c, and d into the four vectors + // This is only performant if the floats are ALREADY IN MEMORY + // and not on registers -- eg, + // .Load( &fltArrray[0], &fltArrray[1], &fltArrray[2], &fltArrray[3] ) is okay, + // .Load( fltArrray[0] * 0.5f, fltArrray[1] * 0.5f, fltArrray[2] * 0.5f, fltArrray[3] * 0.5f ) is not. + FORCEINLINE void Load(const float& a, const float& b, const float& c, const float& d) + { +#if defined( _X360 ) || defined( _PS3 ) + fltx4 temp[4]; + temp[0] = LoadUnalignedFloatSIMD(&a); + temp[1] = LoadUnalignedFloatSIMD(&b); + temp[2] = LoadUnalignedFloatSIMD(&c); + temp[3] = LoadUnalignedFloatSIMD(&d); + y = VectorMergeHighSIMD(temp[0], temp[2]); // ac__ + z = VectorMergeHighSIMD(temp[1], temp[3]); // bd__ + + x = VectorMergeHighSIMD(y, z); // abcd + y = x; + z = x; +#else + ALIGN16 float temp[4]; + temp[0] = a; temp[1] = b; temp[2] = c; temp[3] = d; + fltx4 v = LoadAlignedSIMD(temp); + x = v; + y = v; + z = v; +#endif + } + + // transform four horizontal vectors into the internal vertical ones + FORCEINLINE void LoadAndSwizzle(FLTX4 a, FLTX4 b, FLTX4 c, FLTX4 d) + { +#if defined( _X360 ) || defined( _PS3 ) + fltx4 tx = a; + fltx4 ty = b; + fltx4 tz = c; + fltx4 tw = d; + fltx4 r0 = VectorMergeHighSIMD(tx, tz); + fltx4 r1 = VectorMergeHighSIMD(ty, tw); + fltx4 r2 = VectorMergeLowSIMD(tx, tz); + fltx4 r3 = VectorMergeLowSIMD(ty, tw); + + x = VectorMergeHighSIMD(r0, r1); + y = VectorMergeLowSIMD(r0, r1); + z = VectorMergeHighSIMD(r2, r3); +#else + x = a; + y = b; + z = c; + fltx4 w = d; + // now, matrix is: + // x y z ? + // x y z ? + // x y z ? + // x y z ? 
+ TransposeSIMD(x, y, z, w); +#endif + } + /// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op. /// all 4 vectors must be 128 bit boundary FORCEINLINE void LoadAndSwizzleAligned(const float* RESTRICT a, const float* RESTRICT b, const float* RESTRICT c, const float* RESTRICT d) { -#if _X360 +#if defined( _X360 ) || defined( _PS3 ) fltx4 tx = LoadAlignedSIMD(a); fltx4 ty = LoadAlignedSIMD(b); fltx4 tz = LoadAlignedSIMD(c); fltx4 tw = LoadAlignedSIMD(d); - fltx4 r0 = __vmrghw(tx, tz); - fltx4 r1 = __vmrghw(ty, tw); - fltx4 r2 = __vmrglw(tx, tz); - fltx4 r3 = __vmrglw(ty, tw); + fltx4 r0 = VectorMergeHighSIMD(tx, tz); + fltx4 r1 = VectorMergeHighSIMD(ty, tw); + fltx4 r2 = VectorMergeLowSIMD(tx, tz); + fltx4 r3 = VectorMergeLowSIMD(ty, tw); - x = __vmrghw(r0, r1); - y = __vmrglw(r0, r1); - z = __vmrghw(r2, r3); + x = VectorMergeHighSIMD(r0, r1); + y = VectorMergeLowSIMD(r0, r1); + z = VectorMergeHighSIMD(r2, r3); #else x = LoadAlignedSIMD(a); y = LoadAlignedSIMD(b); @@ -2675,6 +4956,81 @@ public: LoadAndSwizzleAligned(&a.x, &b.x, &c.x, &d.x); } + /// Unpack a FourVectors back into four horizontal fltx4s. + /// Since the FourVectors doesn't store a w row, you can optionally + /// specify your own; otherwise it will be 0. + /// This function ABSOLUTELY MUST be inlined or the reference parameters will + /// induce a severe load-hit-store. 
+ FORCEINLINE void TransposeOnto(fltx4& out0, fltx4& out1, fltx4& out2, fltx4& out3, FLTX4 w = Four_Zeros) const + { + // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360 + // use an unfolded implementation here +#if defined( _X360 ) || defined(_PS3) + fltx4 r0 = VectorMergeHighSIMD(x, z); + fltx4 r1 = VectorMergeHighSIMD(y, w); + fltx4 r2 = VectorMergeLowSIMD(x, z); + fltx4 r3 = VectorMergeLowSIMD(y, w); + + out0 = VectorMergeHighSIMD(r0, r1); + out1 = VectorMergeLowSIMD(r0, r1); + out2 = VectorMergeHighSIMD(r2, r3); + out3 = VectorMergeLowSIMD(r2, r3); +#else + out0 = x; + out1 = y; + out2 = z; + out3 = w; + + TransposeSIMD(out0, out1, out2, out3); +#endif + } + +#if !defined(__SPU__) + /// Store a FourVectors into four NON-CONTIGUOUS Vector*'s. + FORCEINLINE void StoreUnalignedVector3SIMD(Vector3D* RESTRICT out0, Vector3D* RESTRICT out1, Vector3D* RESTRICT out2, Vector3D* RESTRICT out3) const; +#endif + + /// Store a FourVectors into four NON-CONTIGUOUS VectorAligned s. + FORCEINLINE void StoreAlignedVectorSIMD(VectorAligned* RESTRICT out0, VectorAligned* RESTRICT out1, VectorAligned* RESTRICT out2, VectorAligned* RESTRICT out3) const; + +#if !defined(__SPU__) + /// Store a FourVectors into four CONSECUTIVE Vectors in memory, + /// where the first vector IS NOT aligned on a 16-byte boundary. + FORCEINLINE void StoreUnalignedContigVector3SIMD(Vector3D* RESTRICT pDestination) + { + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreFourUnalignedVector3SIMD(a, b, c, d, pDestination); + } +#endif + + /// Store a FourVectors into four CONSECUTIVE Vectors in memory, + /// where the first vector IS aligned on a 16-byte boundary. 
+ /// (since four Vector3s = 48 bytes, groups of four can be said + /// to be 16-byte aligned though obviously the 2nd, 3d, and 4th + /// vectors in the group individually are not) +#if !defined(__SPU__) + FORCEINLINE void StoreAlignedContigVector3SIMD(Vector3D* RESTRICT pDestination) + { + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreFourAlignedVector3SIMD(a, b, c, d, pDestination); + } + + /// Store a FourVectors into four CONSECUTIVE VectorAligneds in memory + FORCEINLINE void StoreAlignedContigVectorASIMD(VectorAligned* RESTRICT pDestination) + { + StoreAlignedVectorSIMD(pDestination, pDestination + 1, pDestination + 2, pDestination + 3); + } +#endif + + /// return the squared length of all 4 vectors, the same name as used on Vector + FORCEINLINE fltx4 LengthSqr(void) const + { + const FourVectors& a = *this; + return a * a; + } + /// return the squared length of all 4 vectors FORCEINLINE fltx4 length2(void) const { @@ -2687,6 +5043,13 @@ public: return SqrtEstSIMD(length2()); } + /// full precision square root. upper/lower case name is an artifact - the lower case one should be changed to refelct the lower accuracy. I added the mixed case one for compat with Vector + FORCEINLINE fltx4 Length(void) const + { + return SqrtSIMD(length2()); + } + + /// normalize all 4 vectors in place. 
not mega-accurate (uses reciprocal approximation instruction) FORCEINLINE void VectorNormalizeFast(void) { @@ -2701,18 +5064,6 @@ public: (*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2)) } - /// construct a FourVectors from 4 separate Vectors - FORCEINLINE FourVectors(Vector3D const& a, Vector3D const& b, Vector3D const& c, Vector3D const& d) - { - LoadAndSwizzle(a, b, c, d); - } - - /// construct a FourVectors from 4 separate Vectors - FORCEINLINE FourVectors(VectorAligned const& a, VectorAligned const& b, VectorAligned const& c, VectorAligned const& d) - { - LoadAndSwizzleAligned(a, b, c, d); - } - FORCEINLINE fltx4 DistToSqr(FourVectors const& pnt) { fltx4 fl4dX = SubSIMD(pnt.x, x); @@ -2748,9 +5099,97 @@ public: lineDelta *= fl4T; return v4OurPnt.DistToSqr(lineDelta); } + FORCEINLINE FourVectors Normalized()const + { + fltx4 fl4LengthInv = ReciprocalSqrtSIMD(LengthSqr()); + FourVectors out; + out.x = x * fl4LengthInv; + out.y = y * fl4LengthInv; + out.z = z * fl4LengthInv; + return out; + } + FORCEINLINE FourVectors NormalizedSafeX() const + { + fltx4 f4LenSqr = LengthSqr(); + fltx4 isBigEnough = CmpGeSIMD(f4LenSqr, Four_Epsilons); + fltx4 fl4LengthInv = ReciprocalSqrtSIMD(f4LenSqr); + FourVectors out; + out.x = MaskedAssign(isBigEnough, x * fl4LengthInv, Four_Ones); + out.y = AndSIMD(y * fl4LengthInv, isBigEnough); + out.z = AndSIMD(z * fl4LengthInv, isBigEnough); + return out; + } + FORCEINLINE FourVectors NormalizedSafeY() const + { + fltx4 f4LenSqr = LengthSqr(); + fltx4 isBigEnough = CmpGeSIMD(f4LenSqr, Four_Epsilons); + fltx4 fl4LengthInv = ReciprocalSqrtSIMD(f4LenSqr); + FourVectors out; + out.x = AndSIMD(x * fl4LengthInv, isBigEnough); + out.y = MaskedAssign(isBigEnough, y * fl4LengthInv, Four_Ones); + out.z = AndSIMD(z * fl4LengthInv, isBigEnough); + return out; + } + + FORCEINLINE FourVectors NormalizedSafeZ() const + { + fltx4 f4LenSqr = LengthSqr(); + fltx4 isBigEnough = CmpGeSIMD(f4LenSqr, Four_Epsilons); + fltx4 fl4LengthInv = 
ReciprocalSqrtSIMD(f4LenSqr); + FourVectors out; + out.x = AndSIMD(x * fl4LengthInv, isBigEnough); + out.y = AndSIMD(y * fl4LengthInv, isBigEnough); + out.z = MaskedAssign(isBigEnough, z * fl4LengthInv, Four_Ones); + return out; + } }; + +inline FourVectors CrossProduct(const FourVectors& a, const FourVectors& b) +{ + return FourVectors(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +} + +inline fltx4 DotProduct(const FourVectors& a, const FourVectors& b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +inline FourVectors operator * (fltx4 left, const FourVectors& right) +{ + return right * left; +} + + +// +inline FourVectors Mul(const FourVectors& a, const fltx4& b) +{ + FourVectors ret; + ret.x = MulSIMD(a.x, b); + ret.y = MulSIMD(a.y, b); + ret.z = MulSIMD(a.z, b); + return ret; +} + +inline FourVectors Mul(const FourVectors& a, const FourVectors& b) +{ + FourVectors ret; + ret.x = MulSIMD(a.x, b.x); + ret.y = MulSIMD(a.y, b.y); + ret.z = MulSIMD(a.z, b.z); + return ret; +} + +inline FourVectors Madd(const FourVectors& a, const fltx4& b, const FourVectors& c) // a*b + c +{ + FourVectors ret; + ret.x = MaddSIMD(a.x, b, c.x); + ret.y = MaddSIMD(a.y, b, c.y); + ret.z = MaddSIMD(a.z, b, c.z); + return ret; +} + /// form 4 cross products inline FourVectors operator ^(const FourVectors& a, const FourVectors& b) { @@ -2761,6 +5200,24 @@ inline FourVectors operator ^(const FourVectors& a, const FourVectors& b) return ret; } +inline FourVectors operator-(const FourVectors& a, const FourVectors& b) +{ + FourVectors ret; + ret.x = SubSIMD(a.x, b.x); + ret.y = SubSIMD(a.y, b.y); + ret.z = SubSIMD(a.z, b.z); + return ret; +} + +inline FourVectors operator+(const FourVectors& a, const FourVectors& b) +{ + FourVectors ret; + ret.x = AddSIMD(a.x, b.x); + ret.y = AddSIMD(a.y, b.y); + ret.z = AddSIMD(a.z, b.z); + return ret; +} + /// component-by-componentwise MAX operator inline FourVectors maximum(const FourVectors& a, const FourVectors& b) { @@ 
-2781,6 +5238,32 @@ inline FourVectors minimum(const FourVectors& a, const FourVectors& b) return ret; } +FORCEINLINE FourVectors RotateLeft(const FourVectors& src) +{ + FourVectors ret; + ret.x = RotateLeft(src.x); + ret.y = RotateLeft(src.y); + ret.z = RotateLeft(src.z); + return ret; +} + +FORCEINLINE FourVectors RotateRight(const FourVectors& src) +{ + FourVectors ret; + ret.x = RotateRight(src.x); + ret.y = RotateRight(src.y); + ret.z = RotateRight(src.z); + return ret; +} +FORCEINLINE FourVectors MaskedAssign(const bi32x4& ReplacementMask, const FourVectors& NewValue, const FourVectors& OldValue) +{ + FourVectors ret; + ret.x = MaskedAssign(ReplacementMask, NewValue.x, OldValue.x); + ret.y = MaskedAssign(ReplacementMask, NewValue.y, OldValue.y); + ret.z = MaskedAssign(ReplacementMask, NewValue.z, OldValue.z); + return ret; +} + /// calculate reflection vector. incident and normal dir assumed normalized FORCEINLINE FourVectors VectorReflect(const FourVectors& incident, const FourVectors& normal) { @@ -2804,7 +5287,77 @@ FORCEINLINE FourVectors VectorSlide(const FourVectors& incident, const FourVecto return ret; } +/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction) +FORCEINLINE FourVectors VectorNormalizeFast(const FourVectors& src) +{ + fltx4 mag_sq = ReciprocalSqrtEstSIMD(src * src); // *(1.0/sqrt(length^2)) + FourVectors result; + result.x = MulSIMD(src.x, mag_sq); + result.y = MulSIMD(src.y, mag_sq); + result.z = MulSIMD(src.z, mag_sq); + return result; +} +#if !defined(__SPU__) +/// Store a FourVectors into four NON-CONTIGUOUS Vector*'s. 
+FORCEINLINE void FourVectors::StoreUnalignedVector3SIMD(Vector3D* RESTRICT out0, Vector3D* RESTRICT out1, Vector3D* RESTRICT out2, Vector3D* RESTRICT out3) const +{ +#ifdef _X360 + fltx4 x0, x1, x2, x3, y0, y1, y2, y3, z0, z1, z2, z3; + x0 = SplatXSIMD(x); // all x0x0x0x0 + x1 = SplatYSIMD(x); + x2 = SplatZSIMD(x); + x3 = SplatWSIMD(x); + + y0 = SplatXSIMD(y); + y1 = SplatYSIMD(y); + y2 = SplatZSIMD(y); + y3 = SplatWSIMD(y); + + z0 = SplatXSIMD(z); + z1 = SplatYSIMD(z); + z2 = SplatZSIMD(z); + z3 = SplatWSIMD(z); + + __stvewx(x0, out0->Base(), 0); // store X word + __stvewx(y0, out0->Base(), 4); // store Y word + __stvewx(z0, out0->Base(), 8); // store Z word + + __stvewx(x1, out1->Base(), 0); // store X word + __stvewx(y1, out1->Base(), 4); // store Y word + __stvewx(z1, out1->Base(), 8); // store Z word + + __stvewx(x2, out2->Base(), 0); // store X word + __stvewx(y2, out2->Base(), 4); // store Y word + __stvewx(z2, out2->Base(), 8); // store Z word + + __stvewx(x3, out3->Base(), 0); // store X word + __stvewx(y3, out3->Base(), 4); // store Y word + __stvewx(z3, out3->Base(), 8); // store Z word +#else + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreUnaligned3SIMD(out0->Base(), a); + StoreUnaligned3SIMD(out1->Base(), b); + StoreUnaligned3SIMD(out2->Base(), c); + StoreUnaligned3SIMD(out3->Base(), d); +#endif +} + +/// Store a FourVectors into four NON-CONTIGUOUS VectorAligned s. +FORCEINLINE void FourVectors::StoreAlignedVectorSIMD(VectorAligned* RESTRICT out0, VectorAligned* RESTRICT out1, VectorAligned* RESTRICT out2, VectorAligned* RESTRICT out3) const +{ + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreAligned3SIMD(out0, a); + StoreAligned3SIMD(out1, b); + StoreAligned3SIMD(out2, c); + StoreAligned3SIMD(out3, d); + +} +#endif + +#if !defined(__SPU__) // Assume the given matrix is a rotation, and rotate these vectors by it. 
// If you have a long list of FourVectors structures that you all want // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. @@ -2818,26 +5371,24 @@ void FourVectors::RotateBy(const matrix3x4_t& matrix) matSplat10, matSplat11, matSplat12, matSplat20, matSplat21, matSplat22; - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the tranpose row of - // the matrix, but we don't really care about that. - fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - } + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); // Trust in the compiler to schedule these operations correctly: fltx4 outX, outY, outZ; @@ -2850,6 +5401,7 @@ void FourVectors::RotateBy(const matrix3x4_t& matrix) z = outZ; } + // Assume the given matrix is a rotation, and rotate these vectors by it. 
// If you have a long list of FourVectors structures that you all want // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. @@ -2863,26 +5415,24 @@ void FourVectors::TransformBy(const matrix3x4_t& matrix) matSplat10, matSplat11, matSplat12, matSplat20, matSplat21, matSplat22; - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the tranpose row of - // the matrix, but we don't really care about that. - fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - } + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); // Trust in the compiler to schedule these operations correctly: fltx4 outX, outY, outZ; @@ -2895,12 +5445,8 @@ void FourVectors::TransformBy(const matrix3x4_t& matrix) y = AddSIMD(outY, ReplicateX4(matrix[1][3])); z = AddSIMD(outZ, ReplicateX4(matrix[2][3])); } +#endif - - -/// quick, low quality perlin-style noise() function suitable for real time use. -/// return value is -1..1. 
Only reliable around +/- 1 million or so. -fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z); fltx4 NoiseSIMD(FourVectors const& v); // vector valued noise direction @@ -2909,6 +5455,13 @@ FourVectors DNoiseSIMD(FourVectors const& v); // vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html FourVectors CurlNoiseSIMD(FourVectors const& v); +//#endif // !defined SPU + + +/// quick, low quality perlin-style noise() function suitable for real time use. +/// return value is -1..1. Only reliable around +/- 1 million or so. +fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z); + /// calculate the absolute value of a packed single inline fltx4 fabs(const fltx4& x) @@ -2916,13 +5469,18 @@ inline fltx4 fabs(const fltx4& x) return AndSIMD(x, LoadAlignedSIMD(g_SIMD_clear_signmask)); } +// Convenience version +inline fltx4 AbsSIMD(const fltx4& x) +{ + return fabs(x); +} + /// negate all four components of a SIMD packed single inline fltx4 fnegate(const fltx4& x) { return XorSIMD(x, LoadAlignedSIMD(g_SIMD_signmask)); } - fltx4 Pow_FixedPoint_Exponent_SIMD(const fltx4& x, int exponent); // PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some @@ -2936,8 +5494,40 @@ inline fltx4 PowSIMD(const fltx4& x, float exponent) return Pow_FixedPoint_Exponent_SIMD(x, (int)(4.0 * exponent)); } +/// (x<1)?x^(1/2.2):1. 
Use a 4th order polynomial to approximate x^(1/2.2) over 0..1 +inline fltx4 LinearToGammaSIMD(fltx4 x) +{ + // y = -3.7295x4 + 8.9635x3 - 7.7397x2 + 3.443x + 0.048 + x = MaxSIMD(MinSIMD(Four_Ones, x), Four_Zeros); + return AddSIMD(Four_LinearToGammaCoefficients_E, + MulSIMD(x, AddSIMD(Four_LinearToGammaCoefficients_D, + MulSIMD(x, AddSIMD(Four_LinearToGammaCoefficients_C, + MulSIMD(x, AddSIMD(Four_LinearToGammaCoefficients_B, + MulSIMD(x, Four_LinearToGammaCoefficients_A)))))))); +} +inline fltx4 GammaToLinearSIMD(fltx4 x) +{ + x = MaxSIMD(x, Four_Zeros); + x = AddSIMD(Four_GammaToLinearCoefficients_D, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_C, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_B, + MulSIMD(x, Four_GammaToLinearCoefficients_A)))))); + return MinSIMD(x, Four_Ones); +} + +/// ( x > 1 ) ? x : x^2.2 +inline fltx4 GammaToLinearExtendedSIMD(fltx4 x) +{ + x = MaxSIMD(x, Four_Zeros); + fltx4 fl4Ret = AddSIMD(Four_GammaToLinearCoefficients_D, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_C, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_B, + MulSIMD(x, Four_GammaToLinearCoefficients_A)))))); + return MaskedAssign(CmpGeSIMD(x, Four_Ones), x, fl4Ret); +} + // random number generation - generate 4 random numbers quickly. 
void SeedRandSIMD(uint32 seed); // seed the random # generator @@ -2953,6 +5543,18 @@ FORCEINLINE fltx4 RandSignedSIMD(void) // -1..1 } +FORCEINLINE fltx4 LerpSIMD(const fltx4& percent, const fltx4& a, const fltx4& b) +{ + return AddSIMD(a, MulSIMD(SubSIMD(b, a), percent)); +} + +FORCEINLINE fltx4 RemapValClampedSIMD(const fltx4& val, const fltx4& a, const fltx4& b, const fltx4& c, const fltx4& d) // Remap val from clamped range between a and b to new range between c and d +{ + fltx4 range = MaskedAssign(CmpEqSIMD(a, b), Four_Ones, SubSIMD(b, a)); //make sure range > 0 + fltx4 cVal = MaxSIMD(Four_Zeros, MinSIMD(Four_Ones, DivSIMD(SubSIMD(val, a), range))); //saturate + return LerpSIMD(cVal, c, d); +} + // SIMD versions of mathlib simplespline functions // hermite basis function for smooth interpolation // Similar to Gain() above, but very cheap to call @@ -3002,6 +5604,11 @@ FORCEINLINE fltx4 FracSIMD(const fltx4& val) return XorSIMD(SubSIMD(fl4Abs, ival), XorSIMD(val, fl4Abs)); // restore sign bits } +#ifndef SPU +// Disable on SPU for the moment as it generates a warning +// warning: dereferencing type-punned pointer will break strict-aliasing rules +// This is related to LoadAlignedSIMD( (float *) g_SIMD_lsbmask ) +// LoadAlignedSIMD() under the hood is dereferencing the variable. 
FORCEINLINE fltx4 Mod2SIMD(const fltx4& val) { fltx4 fl4Abs = fabs(val); @@ -3009,6 +5616,7 @@ FORCEINLINE fltx4 Mod2SIMD(const fltx4& val) ival = MaskedAssign(CmpGtSIMD(ival, fl4Abs), SubSIMD(ival, Four_Twos), ival); return XorSIMD(SubSIMD(fl4Abs, ival), XorSIMD(val, fl4Abs)); // restore sign bits } +#endif FORCEINLINE fltx4 Mod2SIMDPositiveInput(const fltx4& val) { @@ -3040,7 +5648,7 @@ FORCEINLINE fltx4 SinEst01SIMD(const fltx4& val) { fltx4 fl4Abs = fabs(val); fltx4 fl4Reduced2 = Mod2SIMDPositiveInput(fl4Abs); - fltx4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); + bi32x4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); fltx4 fl4val = SubSIMD(fl4Reduced2, AndSIMD(Four_Ones, fl4OddMask)); fltx4 fl4Sin = _SinEst01SIMD(fl4val); fl4Sin = XorSIMD(fl4Sin, AndSIMD(LoadAlignedSIMD(g_SIMD_signmask), XorSIMD(val, fl4OddMask))); @@ -3052,7 +5660,7 @@ FORCEINLINE fltx4 Sin01SIMD(const fltx4& val) { fltx4 fl4Abs = fabs(val); fltx4 fl4Reduced2 = Mod2SIMDPositiveInput(fl4Abs); - fltx4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); + bi32x4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); fltx4 fl4val = SubSIMD(fl4Reduced2, AndSIMD(Four_Ones, fl4OddMask)); fltx4 fl4Sin = _Sin01SIMD(fl4val); fl4Sin = XorSIMD(fl4Sin, AndSIMD(LoadAlignedSIMD(g_SIMD_signmask), XorSIMD(val, fl4OddMask))); @@ -3060,6 +5668,17 @@ FORCEINLINE fltx4 Sin01SIMD(const fltx4& val) } +FORCEINLINE fltx4 NatExpSIMD(const fltx4& val) // why is ExpSimd( x ) defined to be 2^x? +{ + // need to write this. 
just stub with normal float implementation for now + fltx4 fl4Result; + SubFloat(fl4Result, 0) = exp(SubFloat(val, 0)); + SubFloat(fl4Result, 1) = exp(SubFloat(val, 1)); + SubFloat(fl4Result, 2) = exp(SubFloat(val, 2)); + SubFloat(fl4Result, 3) = exp(SubFloat(val, 3)); + return fl4Result; +} + // Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1) FORCEINLINE fltx4 PreCalcBiasParameter(const fltx4& bias_parameter) @@ -3081,6 +5700,10 @@ FORCEINLINE fltx4 BiasSIMD(const fltx4& val, const fltx4& precalc_param) // Box/plane test // NOTE: The w component of emins + emaxs must be 1 for this to work //----------------------------------------------------------------------------- + +#ifndef SPU +// We don't need this on SPU right now + FORCEINLINE int BoxOnPlaneSideSIMD(const fltx4& emins, const fltx4& emaxs, const cplane_t* p, float tolerance = 0.f) { fltx4 corners[2]; @@ -3089,13 +5712,13 @@ FORCEINLINE int BoxOnPlaneSideSIMD(const fltx4& emins, const fltx4& emaxs, const normal = SetWSIMD(normal, dist); fltx4 t4 = ReplicateX4(tolerance); fltx4 negt4 = ReplicateX4(-tolerance); - fltx4 cmp = CmpGeSIMD(normal, Four_Zeros); + bi32x4 cmp = CmpGeSIMD(normal, Four_Zeros); corners[0] = MaskedAssign(cmp, emaxs, emins); corners[1] = MaskedAssign(cmp, emins, emaxs); fltx4 dot1 = Dot4SIMD(normal, corners[0]); fltx4 dot2 = Dot4SIMD(normal, corners[1]); cmp = CmpGeSIMD(dot1, t4); - fltx4 cmp2 = CmpGtSIMD(negt4, dot2); + bi32x4 cmp2 = CmpGtSIMD(negt4, dot2); fltx4 result = MaskedAssign(cmp, Four_Ones, Four_Zeros); fltx4 result2 = MaskedAssign(cmp2, Four_Twos, Four_Zeros); result = AddSIMD(result, result2); @@ -3104,4 +5727,246 @@ FORCEINLINE int BoxOnPlaneSideSIMD(const fltx4& emins, const fltx4& emaxs, const return sides[0]; } + +// k-dop bounding volume. 26-dop bounds with 13 plane-pairs plus 3 other "arbitrary bounds". 
The arbitrary values could be used to hold type info, etc, +// which can compare against "for free" +class KDop32_t +{ +public: + fltx4 m_Mins[4]; + fltx4 m_Maxes[4]; + + FORCEINLINE bool Intersects(KDop32_t const& other) const; + + FORCEINLINE void operator|=(KDop32_t const& other); + + FORCEINLINE bool IsEmpty(void) const; + + FORCEINLINE void Init(void) + { + for (int i = 0; i < ARRAYSIZE(m_Mins); i++) + { + m_Mins[i] = Four_FLT_MAX; + m_Maxes[i] = Four_Negative_FLT_MAX; + } + } + + // given a set of points, expand the kdop to contain them + void AddPointSet(Vector3D const* pPoints, int nPnts); + + void CreateFromPointSet(Vector3D const* pPoints, int nPnts); +}; + +FORCEINLINE void KDop32_t::operator|=(KDop32_t const& other) +{ + m_Mins[0] = MinSIMD(m_Mins[0], other.m_Mins[0]); + m_Mins[1] = MinSIMD(m_Mins[1], other.m_Mins[1]); + m_Mins[2] = MinSIMD(m_Mins[2], other.m_Mins[2]); + m_Mins[3] = MinSIMD(m_Mins[3], other.m_Mins[3]); + + m_Maxes[0] = MaxSIMD(m_Maxes[0], other.m_Maxes[0]); + m_Maxes[1] = MaxSIMD(m_Maxes[1], other.m_Maxes[1]); + m_Maxes[2] = MaxSIMD(m_Maxes[2], other.m_Maxes[2]); + m_Maxes[3] = MaxSIMD(m_Maxes[3], other.m_Maxes[3]); + + +} + +FORCEINLINE bool KDop32_t::Intersects(KDop32_t const& other) const +{ + bi32x4 c00 = CmpLeSIMD(m_Mins[0], other.m_Maxes[0]); + bi32x4 c01 = CmpLeSIMD(m_Mins[1], other.m_Maxes[1]); + bi32x4 c02 = CmpLeSIMD(m_Mins[2], other.m_Maxes[2]); + bi32x4 c03 = CmpLeSIMD(m_Mins[3], other.m_Maxes[3]); + + bi32x4 c10 = CmpGeSIMD(m_Maxes[0], other.m_Mins[0]); + bi32x4 c11 = CmpGeSIMD(m_Maxes[1], other.m_Mins[1]); + bi32x4 c12 = CmpGeSIMD(m_Maxes[2], other.m_Mins[2]); + bi32x4 c13 = CmpGeSIMD(m_Maxes[3], other.m_Mins[3]); + + bi32x4 a0 = AndSIMD(AndSIMD(c00, c01), AndSIMD(c02, c03)); + bi32x4 a1 = AndSIMD(AndSIMD(c10, c11), AndSIMD(c12, c13)); + + return !(IsAnyZeros(AndSIMD(a1, a0))); +} + + +FORCEINLINE bool KDop32_t::IsEmpty(void) const +{ + bi32x4 c00 = CmpLtSIMD(m_Maxes[0], m_Mins[0]); + bi32x4 c01 = CmpLtSIMD(m_Maxes[1], 
m_Mins[1]); + bi32x4 c02 = CmpLtSIMD(m_Maxes[2], m_Mins[2]); + bi32x4 c03 = CmpLtSIMD(m_Maxes[3], m_Mins[3]); + + return IsAnyTrue(OrSIMD(OrSIMD(c00, c01), OrSIMD(c02, c03))); +} + + +extern const fltx4 g_KDop32XDirs[4]; +extern const fltx4 g_KDop32YDirs[4]; +extern const fltx4 g_KDop32ZDirs[4]; +#endif + +#if 0 + +// FIXME!!! If we need a version of this that runs on 360, this is a work-in-progress version that hasn't been debugged. + +#define _VEC_SWIZZLE_QUAT48_UNPACK (__vector unsigned char) { 16, 17, 0, 1, 16, 17, 2, 3, 16, 17, 4, 5, 16, 17, 6, 7 } +#define _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT (__vector unsigned int ) { 0, 0, 1, 0 } + +// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4 +FORCEINLINE fltx4 UnpackQuaternion48SIMD(const Quaternion48* RESTRICT pVec) +{ + // A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 . + // z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 . + // w is inferred from 1 - the dot product of the other tree components. the top bit of what would otherwise be the 16-bit z is + // w's sign bit. +// fltx4 q16s = XMLoadVector3((const void *)pVec); + fltx4 q16s = LoadUnaligned3SIMD((const float*)pVec); + + // fltx4 shift = *( fltx4 * )&g_SIMD_Quat48_Unpack_Shift; // load the aligned shift mask that we use to shuffle z. + // fltx4 permute = *( fltx4 * )&g_SIMD_Quat48_Unpack_Permute0; // load the permute word that shuffles x,y,z into their own words + bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS. + + // q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f + q16s = vec_perm(q16s, Four_Threes, _VEC_SWIZZLE_QUAT48_UNPACK); // permute so that x, y, and z are now each in their own words. 
The top half is the floating point rep of 3.0f + + // q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) +// q16s = vec_sl( *( u32x4 * )( void * )( &q16s ), _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT ); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) + u32x4 tmp = IntShiftLeftWordSIMD(*(u32x4*)&q16s, _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT); + q16s = *(fltx4*)&tmp; + + // each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1 + const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s }; + const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); + + /* + fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats. + + // scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0); + ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes ); + */ + // fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd ); + fltx4 ret = vec_madd(q16s, vUpkMul, vUpkAdd); + + // now, work out what w must be. + fltx4 dotxyz = Dot3SIMD(ret, ret); // all components are dot product of ret w/ self. + dotxyz = ClampVectorSIMD(dotxyz, Four_Zeros, Four_Ones); + + fltx4 ww = SubSIMD(Four_Ones, dotxyz); // all components are 1 - dotxyz + ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz) + if (wneg) + { + ret = SetWSIMD(ret, NegSIMD(ww)); + // ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret + } + else + { + ret = SetWSIMD(ret, ww); + // ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret + } + return ret; +} + +#endif + +// These are not optimized right now for some platforms. We should be able to shuffle the values in some platforms. 
+// As the methods are hard-coded we can actually avoid loading memory to do the transfer. +// We should be able to create all versions. +FORCEINLINE fltx4 SetWFromXSIMD(const fltx4& a, const fltx4& x) +{ + fltx4 value = SplatXSIMD(x); + return SetWSIMD(a, value); +} + +FORCEINLINE fltx4 SetWFromYSIMD(const fltx4& a, const fltx4& y) +{ + fltx4 value = SplatYSIMD(y); + return SetWSIMD(a, value); +} + +FORCEINLINE fltx4 SetWFromZSIMD(const fltx4& a, const fltx4& z) +{ + fltx4 value = SplatZSIMD(z); + return SetWSIMD(a, value); +} + +FORCEINLINE fltx4 CrossProductSIMD(const fltx4& A, const fltx4& B) +{ +#if defined( _X360 ) + return XMVector3Cross(A, B); +#elif defined( _WIN32 ) + fltx4 A1 = _mm_shuffle_ps(A, A, MM_SHUFFLE_REV(1, 2, 0, 3)); + fltx4 B1 = _mm_shuffle_ps(B, B, MM_SHUFFLE_REV(2, 0, 1, 3)); + fltx4 Result1 = MulSIMD(A1, B1); + fltx4 A2 = _mm_shuffle_ps(A, A, MM_SHUFFLE_REV(2, 0, 1, 3)); + fltx4 B2 = _mm_shuffle_ps(B, B, MM_SHUFFLE_REV(1, 2, 0, 3)); + fltx4 Result2 = MulSIMD(A2, B2); + return SubSIMD(Result1, Result2); + +#elif defined(_PS3) + /* + fltx4 perm1 = (vector unsigned char){0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x0c,0x0d,0x0e,0x0f}; + fltx4 perm2 = (vector unsigned char){0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0x0f}; + + fltx4 A1 = __vpermwi( A, A, perm1 ); + fltx4 A2 = __vpermwi( B, B, perm2 ); + fltx4 Result1 = MulSIMD( A1, B1 ); + fltx4 A2 = __vpermwi( A, A, perm2 ); + fltx4 B2 = __vpermwi( B, B, perm1 ); + return MsubSIMD( A2, B2, Result1 ); + */ + return _vmathVfCross(A, B); +#else + fltx4 CrossVal; + SubFloat(CrossVal, 0) = SubFloat(A, 1) * SubFloat(B, 2) - SubFloat(A, 2) * SubFloat(B, 1); + SubFloat(CrossVal, 1) = SubFloat(A, 2) * SubFloat(B, 0) - SubFloat(A, 0) * SubFloat(B, 2); + SubFloat(CrossVal, 2) = SubFloat(A, 0) * SubFloat(B, 1) - SubFloat(A, 1) * SubFloat(B, 0); + SubFloat(CrossVal, 3) = 0; + return CrossVal; +#endif +} + +inline const fltx4 Length3SIMD(const fltx4 vec) +{ + 
fltx4 scLengthSqr = Dot3SIMD(vec, vec); + bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons); + fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr); + return AndSIMD(isSignificant, MulSIMD(scLengthInv, scLengthSqr)); +} + +inline const fltx4 Normalized3SIMD(const fltx4 vec) +{ + fltx4 scLengthSqr = Dot3SIMD(vec, vec); + bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons); + fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr); + return AndSIMD(isSignificant, MulSIMD(vec, scLengthInv)); +} + + +// Some convenience operator overloads, which are just aliasing the functions above. +// Unneccessary on 360, as you already have them from xboxmath.h +// Componentwise add +#ifndef COMPILER_GCC + +FORCEINLINE fltx4 operator+=(fltx4& a, FLTX4 b) +{ + a = AddSIMD(a, b); + return a; +} + +FORCEINLINE fltx4 operator-=(fltx4& a, FLTX4 b) +{ + a = SubSIMD(a, b); + return a; +} + + +FORCEINLINE fltx4 operator*=(fltx4& a, FLTX4 b) +{ + a = MulSIMD(a, b); + return a; +} + +#endif #endif // _ssemath_h diff --git a/r5dev/mathlib/ssenoise.cpp b/r5dev/mathlib/ssenoise.cpp new file mode 100644 index 00000000..a581391f --- /dev/null +++ b/r5dev/mathlib/ssenoise.cpp @@ -0,0 +1,232 @@ +//========= Copyright � 1996-2006, Valve Corporation, All rights reserved. ============// +// +// Purpose: Fast low quality noise suitable for real time use +// +//=====================================================================================// + +#include "core/stdafx.h" +#include "tier0/dbg.h" +#include "mathlib/mathlib.h" +#include "mathlib/vector.h" +#include "mathlib/ssemath.h" +#include "mathlib/noisedata.h" + +// memdbgon must be the last include file in a .cpp file!!! 
+//#include "tier0/memdbgon.h" + + +#define MAGIC_NUMBER (1<<15) // gives 8 bits of fraction + +static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER }; + + +static ALIGN16 int32 idx_mask[4] = { 0xffff, 0xffff, 0xffff, 0xffff }; + +#define MASK255 (*((fltx4 *)(& idx_mask ))) + +// returns 0..1 +static inline float GetLatticePointValue(int idx_x, int idx_y, int idx_z) +{ + int ret_idx = perm_a[idx_x & 0xff]; + ret_idx = perm_b[(idx_y + ret_idx) & 0xff]; + ret_idx = perm_c[(idx_z + ret_idx) & 0xff]; + return impulse_xcoords[ret_idx]; + +} + +fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z) +{ + // use magic to convert to integer index + fltx4 x_idx = AndSIMD(MASK255, AddSIMD(x, Four_MagicNumbers)); + fltx4 y_idx = AndSIMD(MASK255, AddSIMD(y, Four_MagicNumbers)); + fltx4 z_idx = AndSIMD(MASK255, AddSIMD(z, Four_MagicNumbers)); + + fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros; + fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros; + + // FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes) + // Converting the indexed noise values back to vectors will cause more (128 bytes) + // The noise table could store vectors if we chunked it into 2x2x2 blocks. 
+ fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros; +#define DOPASS(i) \ + { unsigned int xi = SubInt( x_idx, i ); \ + unsigned int yi = SubInt( y_idx, i ); \ + unsigned int zi = SubInt( z_idx, i ); \ + SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0); \ + SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0); \ + SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0); \ + xi>>=8; \ + yi>>=8; \ + zi>>=8; \ + \ + SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi ); \ + SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 ); \ + SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi ); \ + SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 ); \ + SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi ); \ + SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 ); \ + SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi ); \ + SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 ); \ + } + + DOPASS(0); + DOPASS(1); + DOPASS(2); + DOPASS(3); + + // now, we have 8 lattice values for each of four points as m128s, and interpolant values for + // each axis in m128 form in [xyz]frac. 
Perfom the trilinear interpolation as SIMD ops + + // first, do x interpolation + fltx4 l2d00 = AddSIMD(lattice000, MulSIMD(xfrac, SubSIMD(lattice100, lattice000))); + fltx4 l2d01 = AddSIMD(lattice001, MulSIMD(xfrac, SubSIMD(lattice101, lattice001))); + fltx4 l2d10 = AddSIMD(lattice010, MulSIMD(xfrac, SubSIMD(lattice110, lattice010))); + fltx4 l2d11 = AddSIMD(lattice011, MulSIMD(xfrac, SubSIMD(lattice111, lattice011))); + + // now, do y interpolation + fltx4 l1d0 = AddSIMD(l2d00, MulSIMD(yfrac, SubSIMD(l2d10, l2d00))); + fltx4 l1d1 = AddSIMD(l2d01, MulSIMD(yfrac, SubSIMD(l2d11, l2d01))); + + // final z interpolation + fltx4 rslt = AddSIMD(l1d0, MulSIMD(zfrac, SubSIMD(l1d1, l1d0))); + + // map to 0..1 + return MulSIMD(Four_Twos, SubSIMD(rslt, Four_PointFives)); + + +} + +static inline void GetVectorLatticePointValue(int idx, fltx4& x, fltx4& y, fltx4& z, + int idx_x, int idx_y, int idx_z) +{ + int ret_idx = perm_a[idx_x & 0xff]; + ret_idx = perm_b[(idx_y + ret_idx) & 0xff]; + ret_idx = perm_c[(idx_z + ret_idx) & 0xff]; + float const* pData = s_randomGradients + ret_idx * 3; + SubFloat(x, idx) = pData[0]; + SubFloat(y, idx) = pData[1]; + SubFloat(z, idx) = pData[2]; + +} + +FourVectors DNoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z) +{ + // use magic to convert to integer index + fltx4 x_idx = AndSIMD(MASK255, AddSIMD(x, Four_MagicNumbers)); + fltx4 y_idx = AndSIMD(MASK255, AddSIMD(y, Four_MagicNumbers)); + fltx4 z_idx = AndSIMD(MASK255, AddSIMD(z, Four_MagicNumbers)); + + fltx4 xlattice000 = Four_Zeros, xlattice001 = Four_Zeros, xlattice010 = Four_Zeros, xlattice011 = Four_Zeros; + fltx4 xlattice100 = Four_Zeros, xlattice101 = Four_Zeros, xlattice110 = Four_Zeros, xlattice111 = Four_Zeros; + fltx4 ylattice000 = Four_Zeros, ylattice001 = Four_Zeros, ylattice010 = Four_Zeros, ylattice011 = Four_Zeros; + fltx4 ylattice100 = Four_Zeros, ylattice101 = Four_Zeros, ylattice110 = Four_Zeros, ylattice111 = Four_Zeros; + fltx4 zlattice000 = Four_Zeros, zlattice001 
= Four_Zeros, zlattice010 = Four_Zeros, zlattice011 = Four_Zeros; + fltx4 zlattice100 = Four_Zeros, zlattice101 = Four_Zeros, zlattice110 = Four_Zeros, zlattice111 = Four_Zeros; + + // FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes) + // Converting the indexed noise values back to vectors will cause more (128 bytes) + // The noise table could store vectors if we chunked it into 2x2x2 blocks. + fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros; +#define DODPASS(i) \ + { unsigned int xi = SubInt( x_idx, i ); \ + unsigned int yi = SubInt( y_idx, i ); \ + unsigned int zi = SubInt( z_idx, i ); \ + SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0); \ + SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0); \ + SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0); \ + xi>>=8; \ + yi>>=8; \ + zi>>=8; \ + \ + GetVectorLatticePointValue( i, xlattice000, ylattice000, zlattice000, xi,yi,zi ); \ + GetVectorLatticePointValue( i, xlattice001, ylattice001, zlattice001, xi,yi,zi+1 ); \ + GetVectorLatticePointValue( i, xlattice010, ylattice010, zlattice010, xi,yi+1,zi ); \ + GetVectorLatticePointValue( i, xlattice011, ylattice011, zlattice011, xi,yi+1,zi+1 ); \ + GetVectorLatticePointValue( i, xlattice100, ylattice100, zlattice100, xi+1,yi,zi ); \ + GetVectorLatticePointValue( i, xlattice101, ylattice101, zlattice101, xi+1,yi,zi+1 ); \ + GetVectorLatticePointValue( i, xlattice110, ylattice110, zlattice110, xi+1,yi+1,zi ); \ + GetVectorLatticePointValue( i, xlattice111, ylattice111, zlattice111, xi+1,yi+1,zi+1 ); \ + } + + DODPASS(0); + DODPASS(1); + DODPASS(2); + DODPASS(3); + + // now, we have 8 lattice values for each of four points as m128s, and interpolant values for + // each axis in m128 form in [xyz]frac. 
Perfom the trilinear interpolation as SIMD ops + + // first, do x interpolation + fltx4 xl2d00 = AddSIMD(xlattice000, MulSIMD(xfrac, SubSIMD(xlattice100, xlattice000))); + fltx4 xl2d01 = AddSIMD(xlattice001, MulSIMD(xfrac, SubSIMD(xlattice101, xlattice001))); + fltx4 xl2d10 = AddSIMD(xlattice010, MulSIMD(xfrac, SubSIMD(xlattice110, xlattice010))); + fltx4 xl2d11 = AddSIMD(xlattice011, MulSIMD(xfrac, SubSIMD(xlattice111, xlattice011))); + + // now, do y interpolation + fltx4 xl1d0 = AddSIMD(xl2d00, MulSIMD(yfrac, SubSIMD(xl2d10, xl2d00))); + fltx4 xl1d1 = AddSIMD(xl2d01, MulSIMD(yfrac, SubSIMD(xl2d11, xl2d01))); + + // final z interpolation + FourVectors rslt; + rslt.x = AddSIMD(xl1d0, MulSIMD(zfrac, SubSIMD(xl1d1, xl1d0))); + + fltx4 yl2d00 = AddSIMD(ylattice000, MulSIMD(xfrac, SubSIMD(ylattice100, ylattice000))); + fltx4 yl2d01 = AddSIMD(ylattice001, MulSIMD(xfrac, SubSIMD(ylattice101, ylattice001))); + fltx4 yl2d10 = AddSIMD(ylattice010, MulSIMD(xfrac, SubSIMD(ylattice110, ylattice010))); + fltx4 yl2d11 = AddSIMD(ylattice011, MulSIMD(xfrac, SubSIMD(ylattice111, ylattice011))); + + // now, do y interpolation + fltx4 yl1d0 = AddSIMD(yl2d00, MulSIMD(yfrac, SubSIMD(yl2d10, yl2d00))); + fltx4 yl1d1 = AddSIMD(yl2d01, MulSIMD(yfrac, SubSIMD(yl2d11, yl2d01))); + + // final z interpolation + rslt.y = AddSIMD(yl1d0, MulSIMD(zfrac, SubSIMD(yl1d1, yl1d0))); + + fltx4 zl2d00 = AddSIMD(zlattice000, MulSIMD(xfrac, SubSIMD(zlattice100, zlattice000))); + fltx4 zl2d01 = AddSIMD(zlattice001, MulSIMD(xfrac, SubSIMD(zlattice101, zlattice001))); + fltx4 zl2d10 = AddSIMD(zlattice010, MulSIMD(xfrac, SubSIMD(zlattice110, zlattice010))); + fltx4 zl2d11 = AddSIMD(zlattice011, MulSIMD(xfrac, SubSIMD(zlattice111, zlattice011))); + + // now, do y interpolation + fltx4 zl1d0 = AddSIMD(zl2d00, MulSIMD(yfrac, SubSIMD(zl2d10, zl2d00))); + fltx4 zl1d1 = AddSIMD(zl2d01, MulSIMD(yfrac, SubSIMD(zl2d11, zl2d01))); + + // final z interpolation + rslt.z = AddSIMD(zl1d0, MulSIMD(zfrac, SubSIMD(zl1d1, 
zl1d0))); + + return rslt; + + +} + +fltx4 NoiseSIMD(FourVectors const& pos) +{ + return NoiseSIMD(pos.x, pos.y, pos.z); +} + +FourVectors DNoiseSIMD(FourVectors const& pos) +{ + return DNoiseSIMD(pos.x, pos.y, pos.z); +} + +FourVectors CurlNoiseSIMD(FourVectors const& pos) +{ + FourVectors fl4Comp1 = DNoiseSIMD(pos); + FourVectors fl4Pos = pos; + fl4Pos.x = AddSIMD(fl4Pos.x, ReplicateX4(43.256)); + fl4Pos.y = AddSIMD(fl4Pos.y, ReplicateX4(-67.89)); + fl4Pos.z = AddSIMD(fl4Pos.z, ReplicateX4(1338.2)); + FourVectors fl4Comp2 = DNoiseSIMD(fl4Pos); + fl4Pos.x = AddSIMD(fl4Pos.x, ReplicateX4(-129.856)); + fl4Pos.y = AddSIMD(fl4Pos.y, ReplicateX4(-967.23)); + fl4Pos.z = AddSIMD(fl4Pos.z, ReplicateX4(2338.98)); + FourVectors fl4Comp3 = DNoiseSIMD(fl4Pos); + + // now we have the 3 derivatives of a vector valued field. return the curl of the field. + FourVectors fl4Ret; + fl4Ret.x = SubSIMD(fl4Comp3.y, fl4Comp2.z); + fl4Ret.y = SubSIMD(fl4Comp1.z, fl4Comp3.x); + fl4Ret.z = SubSIMD(fl4Comp2.x, fl4Comp1.y); + return fl4Ret; + +} diff --git a/r5dev/mathlib/ssenoise.h b/r5dev/mathlib/ssenoise.h deleted file mode 100644 index e40ce799..00000000 --- a/r5dev/mathlib/ssenoise.h +++ /dev/null @@ -1,107 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: Fast low quality noise suitable for real time use -// -//=====================================================================================// - -#include "core/stdafx.h" -#include "tier0/dbg.h" -#include "tier0/basetypes.h" -#include "mathlib/mathlib.h" -#include "mathlib/vector.h" -#include "mathlib/ssemath.h" - -// memdbgon must be the last include file in a .cpp file!!! 
-//#include "tier0/memdbgon.h" -#include "noisedata.h" - - -#define MAGIC_NUMBER (1<<15) // gives 8 bits of fraction - -static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER }; - - -static ALIGN16 int32 idx_mask[4] = { 0xffff, 0xffff, 0xffff, 0xffff }; - -#define MASK255 (*((fltx4 *)(& idx_mask ))) - -// returns 0..1 -static inline float GetLatticePointValue(int idx_x, int idx_y, int idx_z) -{ - NOTE_UNUSED(perm_d); - NOTE_UNUSED(impulse_ycoords); - NOTE_UNUSED(impulse_zcoords); - - int ret_idx = perm_a[idx_x & 0xff]; - ret_idx = perm_b[(idx_y + ret_idx) & 0xff]; - ret_idx = perm_c[(idx_z + ret_idx) & 0xff]; - return impulse_xcoords[ret_idx]; - -} - -fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z) -{ - // use magic to convert to integer index - fltx4 x_idx = AndSIMD(MASK255, AddSIMD(x, Four_MagicNumbers)); - fltx4 y_idx = AndSIMD(MASK255, AddSIMD(y, Four_MagicNumbers)); - fltx4 z_idx = AndSIMD(MASK255, AddSIMD(z, Four_MagicNumbers)); - - fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros; - fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros; - - // FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes) - // Converting the indexed noise values back to vectors will cause more (128 bytes) - // The noise table could store vectors if we chunked it into 2x2x2 blocks. 
- fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros; -#define DOPASS(i) \ - { unsigned int xi = SubInt( x_idx, i ); \ - unsigned int yi = SubInt( y_idx, i ); \ - unsigned int zi = SubInt( z_idx, i ); \ - SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0); \ - SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0); \ - SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0); \ - xi>>=8; \ - yi>>=8; \ - zi>>=8; \ - \ - SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi ); \ - SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 ); \ - SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi ); \ - SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 ); \ - SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi ); \ - SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 ); \ - SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi ); \ - SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 ); \ - } - - DOPASS(0); - DOPASS(1); - DOPASS(2); - DOPASS(3); - - // now, we have 8 lattice values for each of four points as m128s, and interpolant values for - // each axis in m128 form in [xyz]frac. 
Perfom the trilinear interpolation as SIMD ops - - // first, do x interpolation - fltx4 l2d00 = AddSIMD(lattice000, MulSIMD(xfrac, SubSIMD(lattice100, lattice000))); - fltx4 l2d01 = AddSIMD(lattice001, MulSIMD(xfrac, SubSIMD(lattice101, lattice001))); - fltx4 l2d10 = AddSIMD(lattice010, MulSIMD(xfrac, SubSIMD(lattice110, lattice010))); - fltx4 l2d11 = AddSIMD(lattice011, MulSIMD(xfrac, SubSIMD(lattice111, lattice011))); - - // now, do y interpolation - fltx4 l1d0 = AddSIMD(l2d00, MulSIMD(yfrac, SubSIMD(l2d10, l2d00))); - fltx4 l1d1 = AddSIMD(l2d01, MulSIMD(yfrac, SubSIMD(l2d11, l2d01))); - - // final z interpolation - fltx4 rslt = AddSIMD(l1d0, MulSIMD(zfrac, SubSIMD(l1d1, l1d0))); - - // map to 0..1 - return MulSIMD(Four_Twos, SubSIMD(rslt, Four_PointFives)); - - -} - -fltx4 NoiseSIMD(FourVectors const& pos) -{ - return NoiseSIMD(pos.x, pos.y, pos.z); -} diff --git a/r5dev/mathlib/ssequaternion.h b/r5dev/mathlib/ssequaternion.h index 90167b83..a1b9b37e 100644 --- a/r5dev/mathlib/ssequaternion.h +++ b/r5dev/mathlib/ssequaternion.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//===== Copyright � 1996-2005, Valve Corporation, All rights reserved. ======// // // Purpose: - defines SIMD "structure of arrays" classes and functions. // @@ -37,8 +37,10 @@ // the traditional x87 FPU operations altogether and make everything use // the SSE2 registers, which lessens this problem a little. -// permitted only on 360, as we've done careful tuning on its Altivec math: -#ifdef _X360 +// permitted only on 360, as we've done careful tuning on its Altivec math. +// FourQuaternions, however, are always allowed, because vertical ops are +// fine on SSE. +#ifdef PLATFORM_PPC #define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC! 
#endif @@ -48,7 +50,6 @@ // Load/store quaternions //--------------------------------------------------------------------- #ifndef _X360 -#if ALLOW_SIMD_QUATERNION_MATH // Using STDC or SSE FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned& pSIMD) { @@ -58,7 +59,7 @@ FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned& pSIMD) FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned* RESTRICT pSIMD) { - fltx4 retval = LoadAlignedSIMD(pSIMD); + fltx4 retval = LoadAlignedSIMD(pSIMD->Base()); return retval; } @@ -66,7 +67,6 @@ FORCEINLINE void StoreAlignedSIMD(QuaternionAligned* RESTRICT pSIMD, const fltx4 { StoreAlignedSIMD(pSIMD->Base(), a); } -#endif #else // for the transitional class -- load a QuaternionAligned @@ -87,6 +87,9 @@ FORCEINLINE void StoreAlignedSIMD(QuaternionAligned* RESTRICT pSIMD, const fltx4 XMStoreVector4A(pSIMD->Base(), a); } +// From a RadianEuler packed onto a fltx4, to a quaternion +fltx4 AngleQuaternionSIMD(FLTX4 vAngles); + #endif @@ -101,7 +104,7 @@ FORCEINLINE fltx4 QuaternionAlignSIMD(const fltx4& p, const fltx4& q) fltx4 b = AddSIMD(p, q); a = Dot4SIMD(a, a); b = Dot4SIMD(b, b); - fltx4 cmp = CmpGtSIMD(a, b); + fltx4 cmp = (fltx4)CmpGtSIMD(a, b); fltx4 result = MaskedAssign(cmp, NegSIMD(q), q); return result; } @@ -133,7 +136,7 @@ FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4& q) { fltx4 radius, result, mask; radius = Dot4SIMD(q, q); - mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 + mask = (fltx4)CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 result = ReciprocalSqrtSIMD(radius); result = MulSIMD(result, q); return MaskedAssign(mask, q, result); // if radius was 0, just return q @@ -222,40 +225,7 @@ FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4& p, const fltx4& q) //--------------------------------------------------------------------- // Quaternion scale //--------------------------------------------------------------------- -#ifndef _X360 - -// SSE and STDC -FORCEINLINE fltx4 
QuaternionScaleSIMD(const fltx4& p, float t) -{ - float r; - fltx4 q; - - // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to - // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. - float sinom = sqrt(SubFloat(p, 0) * SubFloat(p, 0) + SubFloat(p, 1) * SubFloat(p, 1) + SubFloat(p, 2) * SubFloat(p, 2)); - sinom = min(sinom, 1.f); - - float sinsom = sin(asin(sinom) * t); - - t = sinsom / (sinom + FLT_EPSILON); - SubFloat(q, 0) = t * SubFloat(p, 0); - SubFloat(q, 1) = t * SubFloat(p, 1); - SubFloat(q, 2) = t * SubFloat(p, 2); - - // rescale rotation - r = 1.0f - sinsom * sinsom; - - // Assert( r >= 0 ); - if (r < 0.0f) - r = 0.0f; - r = sqrt(r); - - // keep sign of rotation - SubFloat(q, 3) = fsel(SubFloat(p, 3), r, -r); - return q; -} - -#else +#ifdef _X360 // X360 FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) @@ -286,6 +256,126 @@ FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) return result; } +// X360 +// assumes t4 contains a float replicated to each slot +FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, const fltx4& t4) +{ + fltx4 sinom = Dot3SIMD(p, p); + sinom = SqrtSIMD(sinom); + sinom = MinSIMD(sinom, Four_Ones); + fltx4 sinsom = ArcSinSIMD(sinom); + sinsom = MulSIMD(sinsom, t4); + sinsom = SinSIMD(sinsom); + sinom = AddSIMD(sinom, Four_Epsilons); + sinom = ReciprocalSIMD(sinom); + fltx4 result = MulSIMD(p, MulSIMD(sinsom, sinom)); + + // rescale rotation + sinsom = MulSIMD(sinsom, sinsom); + fltx4 r = SubSIMD(Four_Ones, sinsom); + r = MaxSIMD(r, Four_Zeros); + r = SqrtSIMD(r); + + // keep sign of rotation + fltx4 cmp = CmpGeSIMD(p, Four_Zeros); + r = MaskedAssign(cmp, r, NegSIMD(r)); + + result = __vrlimi(result, r, 1, 0); + return result; +} + +#elif defined(_PS3) + +// X360 +FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) +{ + fltx4 sinom = Dot3SIMD(p, p); + sinom = SqrtSIMD(sinom); + sinom = MinSIMD(sinom, Four_Ones); + fltx4 sinsom = 
ArcSinSIMD(sinom);
+	fltx4 t4 = ReplicateX4(t);
+	sinsom = MulSIMD(sinsom, t4);
+	sinsom = SinSIMD(sinsom);
+	sinom = AddSIMD(sinom, Four_Epsilons);
+	sinom = ReciprocalSIMD(sinom);
+	t4 = MulSIMD(sinsom, sinom);
+	fltx4 result = MulSIMD(p, t4);
+
+	// rescale rotation
+	sinsom = MulSIMD(sinsom, sinsom);
+	fltx4 r = SubSIMD(Four_Ones, sinsom);
+	r = MaxSIMD(r, Four_Zeros);
+	r = SqrtSIMD(r);
+
+	// keep sign of rotation
+	r = MaskedAssign(CmpGeSIMD(p, Four_Zeros), r, NegSIMD(r));
+	// set just the w component of result
+	result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[3]), r, result);
+
+	return result;
+}
+
+// PS3
+// assumes t4 contains a float replicated to each slot
+FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, const fltx4& t4)
+{
+	fltx4 sinom = Dot3SIMD(p, p);
+	sinom = SqrtSIMD(sinom);
+	sinom = MinSIMD(sinom, Four_Ones);
+	fltx4 sinsom = ArcSinSIMD(sinom);
+	sinsom = MulSIMD(sinsom, t4);
+	sinsom = SinSIMD(sinsom);
+	sinom = AddSIMD(sinom, Four_Epsilons);
+	sinom = ReciprocalSIMD(sinom);
+	fltx4 result = MulSIMD(p, MulSIMD(sinsom, sinom));
+
+	// rescale rotation
+	sinsom = MulSIMD(sinsom, sinsom);
+	fltx4 r = SubSIMD(Four_Ones, sinsom);
+	r = MaxSIMD(r, Four_Zeros);
+	r = SqrtSIMD(r);
+
+	// keep sign of rotation
+	r = MaskedAssign(CmpGeSIMD(p, Four_Zeros), r, NegSIMD(r));
+	// set just the w component of result
+	result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[3]), r, result);
+
+	return result;
+}
+
+#else
+
+// SSE and STDC
+FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t)
+{
+	float r;
+	fltx4 q;
+
+	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to
+	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. 
+ float sinom = sqrt(SubFloat(p, 0) * SubFloat(p, 0) + SubFloat(p, 1) * SubFloat(p, 1) + SubFloat(p, 2) * SubFloat(p, 2)); + sinom = fmin(sinom, 1.f); + + float sinsom = sin(asin(sinom) * t); + + t = sinsom / (sinom + FLT_EPSILON); + SubFloat(q, 0) = t * SubFloat(p, 0); + SubFloat(q, 1) = t * SubFloat(p, 1); + SubFloat(q, 2) = t * SubFloat(p, 2); + + // rescale rotation + r = 1.0f - sinsom * sinsom; + + // Assert( r >= 0 ); + if (r < 0.0f) + r = 0.0f; + r = sqrt(r); + + // keep sign of rotation + SubFloat(q, 3) = fsel(SubFloat(p, 3), r, -r); + return q; +} + #endif @@ -363,5 +453,812 @@ FORCEINLINE fltx4 QuaternionSlerpSIMD(const fltx4& p, const fltx4& q, float t) #endif // ALLOW_SIMD_QUATERNION_MATH + +/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are +/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. +class ALIGN16 FourQuaternions +{ +public: + fltx4 x, y, z, w; + + FourQuaternions(void) + { + } + + FourQuaternions(const fltx4& _x, + const fltx4& _y, + const fltx4& _z, + const fltx4& _w) + : x(_x), y(_y), z(_z), w(_w) + {} + +#if !defined(__SPU__) + // four rotations around the same axis. angles should be in radians. 
+ FourQuaternions(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) + { + FromAxisAndAngles(axis, angle0, angle1, angle2, angle3); + } +#endif + + FourQuaternions(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + FORCEINLINE void operator=(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + /// this = this * q; + FORCEINLINE FourQuaternions Mul(FourQuaternions const& q) const; + + /// negate the vector part + FORCEINLINE FourQuaternions Conjugate() const; + + /// for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// scale is four floats -- one for each quat + FORCEINLINE FourQuaternions ScaleAngle(const fltx4& scale) const; + + /// ret = this * ( s * q ) + /// In other words, for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// s is four floats in a fltx4 -- one for each quaternion + FORCEINLINE FourQuaternions MulAc(const fltx4& s, const FourQuaternions& q) const; + + /// ret = ( s * this ) * q + FORCEINLINE FourQuaternions ScaleMul(const fltx4& s, const FourQuaternions& q) const; + + /// Slerp four quaternions at once, FROM me TO the specified out. 
+ FORCEINLINE FourQuaternions Slerp(const FourQuaternions& to, const fltx4& t); + + FORCEINLINE FourQuaternions SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t); + +#if !defined(__SPU__) + /// given an axis and four angles, populate this quaternion with the equivalent rotations + /// (ie, make these four quaternions represent four different rotations around the same axis) + /// angles should be in RADIANS + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3); + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, const fltx4& angles); + // one convenience imp if you're doing this in degrees + FORCEINLINE FourQuaternions& FromAxisAndAnglesInDegrees(const fltx4& axis, const fltx4& angles) + { + return FromAxisAndAngles(axis, MulSIMD(angles, Four_DegToRad)); + } +#endif + + // rotate (in place) a FourVectors by this quaternion. there's a corresponding RotateBy in FourVectors. + FORCEINLINE void RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT; + + + /// LoadAndSwizzleAligned - load 4 QuaternionAligneds into a FourQuaternions, performing transpose op. 
+ /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const float* RESTRICT a, const float* RESTRICT b, const float* RESTRICT c, const float* RESTRICT d) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(a); + fltx4 ty = LoadAlignedSIMD(b); + fltx4 tz = LoadAlignedSIMD(c); + fltx4 tw = LoadAlignedSIMD(d); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(a); + y = LoadAlignedSIMD(b); + z = LoadAlignedSIMD(c); + w = LoadAlignedSIMD(d); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* RESTRICT a, + const QuaternionAligned* RESTRICT b, + const QuaternionAligned* RESTRICT c, + const QuaternionAligned* RESTRICT d) + { + LoadAndSwizzleAligned(a->Base(), b->Base(), c->Base(), d->Base()); + } + + + /// LoadAndSwizzleAligned - load 4 consecutive QuaternionAligneds into a FourQuaternions, + /// performing transpose op. 
+ /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(qs++); + fltx4 ty = LoadAlignedSIMD(qs++); + fltx4 tz = LoadAlignedSIMD(qs++); + fltx4 tw = LoadAlignedSIMD(qs); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(qs++); + y = LoadAlignedSIMD(qs++); + z = LoadAlignedSIMD(qs++); + w = LoadAlignedSIMD(qs++); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + // Store the FourQuaternions out to four nonconsecutive ordinary quaternions in memory. + FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* a, QuaternionAligned* b, QuaternionAligned* c, QuaternionAligned* d) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(a, rx); + StoreAlignedSIMD(b, ry); + StoreAlignedSIMD(c, rz); + StoreAlignedSIMD(d, rw); +#else + fltx4 dupes[4] = { x, y, z, w }; + TransposeSIMD(dupes[0], dupes[1], dupes[2], dupes[3]); + StoreAlignedSIMD(a, dupes[0]); + StoreAlignedSIMD(b, dupes[1]); + StoreAlignedSIMD(c, dupes[2]); + StoreAlignedSIMD(d, dupes[3]); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. 
+ FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(qs, rx); + StoreAlignedSIMD(++qs, ry); + StoreAlignedSIMD(++qs, rz); + StoreAlignedSIMD(++qs, rw); +#else + SwizzleAndStoreAligned(qs, qs + 1, qs + 2, qs + 3); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. + // The mask specifies which of the quaternions are actually written out -- each + // word in the fltx4 should be all binary ones or zeros. Ones means the corresponding + // quat will be written. + FORCEINLINE void SwizzleAndStoreAlignedMasked(QuaternionAligned* RESTRICT qs, const bi32x4& controlMask) + { + fltx4 originals[4]; + originals[0] = LoadAlignedSIMD(qs); + originals[1] = LoadAlignedSIMD(qs + 1); + originals[2] = LoadAlignedSIMD(qs + 2); + originals[3] = LoadAlignedSIMD(qs + 3); + + bi32x4 masks[4] = { SplatXSIMD(controlMask), + SplatYSIMD(controlMask), + SplatZSIMD(controlMask), + SplatWSIMD(controlMask) }; + +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); +#else + fltx4 rx = x; + fltx4 ry = y; + fltx4 rz = z; + fltx4 rw = w; + TransposeSIMD(rx, ry, rz, rw); +#endif + + StoreAlignedSIMD(qs + 0, MaskedAssign(masks[0], rx, originals[0])); + StoreAlignedSIMD(qs + 1, MaskedAssign(masks[1], ry, originals[1])); + StoreAlignedSIMD(qs + 2, MaskedAssign(masks[2], rz, originals[2])); + StoreAlignedSIMD(qs + 3, MaskedAssign(masks[3], rw, originals[3])); + } +}; + + + +FORCEINLINE FourQuaternions FourQuaternions::Conjugate() const +{ + return 
FourQuaternions(NegSIMD(x), NegSIMD(y), NegSIMD(z), w); +} + + + + +FORCEINLINE const fltx4 Dot(const FourQuaternions& a, const FourQuaternions& b) +{ + return + MaddSIMD(a.x, b.x, + MaddSIMD(a.y, b.y, + MaddSIMD(a.z, b.z, MulSIMD(a.w, b.w)) + ) + ); +} + + +FORCEINLINE const FourQuaternions Madd(const FourQuaternions& a, const fltx4& scale, const FourQuaternions& c) +{ + FourQuaternions ret; + ret.x = MaddSIMD(a.x, scale, c.x); + ret.y = MaddSIMD(a.y, scale, c.y); + ret.z = MaddSIMD(a.z, scale, c.z); + ret.w = MaddSIMD(a.w, scale, c.w); + return ret; +} + +FORCEINLINE const FourQuaternions Mul(const FourQuaternions& a, const fltx4& scale) +{ + FourQuaternions ret; + ret.x = MulSIMD(a.x, scale); + ret.y = MulSIMD(a.y, scale); + ret.z = MulSIMD(a.z, scale); + ret.w = MulSIMD(a.w, scale); + return ret; +} + +FORCEINLINE const FourQuaternions Add(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = AddSIMD(a.x, b.x); + ret.y = AddSIMD(a.y, b.y); + ret.z = AddSIMD(a.z, b.z); + ret.w = AddSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Sub(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = SubSIMD(a.x, b.x); + ret.y = SubSIMD(a.y, b.y); + ret.z = SubSIMD(a.z, b.z); + ret.w = SubSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Neg(const FourQuaternions& q) +{ + FourQuaternions ret; + ret.x = NegSIMD(q.x); + ret.y = NegSIMD(q.y); + ret.z = NegSIMD(q.z); + ret.w = NegSIMD(q.w); + return ret; +} + +FORCEINLINE const FourQuaternions MaskedAssign(const bi32x4& mask, const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = MaskedAssign(mask, a.x, b.x); + ret.y = MaskedAssign(mask, a.y, b.y); + ret.z = MaskedAssign(mask, a.z, b.z); + ret.w = MaskedAssign(mask, a.w, b.w); + return ret; +} + +#ifdef DIFFERENT_NATIVE_VECTOR_TYPES +FORCEINLINE const FourQuaternions MaskedAssign(const fltx4& mask, const FourQuaternions& a, const 
FourQuaternions& b) +{ + return MaskedAssign((bi32x4)mask, a, b); +} +#endif + + +FORCEINLINE FourQuaternions QuaternionAlign(const FourQuaternions& p, const FourQuaternions& q) +{ + // decide if one of the quaternions is backwards + bi32x4 cmp = CmpLtSIMD(Dot(p, q), Four_Zeros); + return MaskedAssign(cmp, Neg(q), q); +} + + +FORCEINLINE const FourQuaternions QuaternionNormalize(const FourQuaternions& q) +{ + fltx4 radius = Dot(q, q); + bi32x4 mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 + fltx4 invRadius = ReciprocalSqrtSIMD(radius); + + FourQuaternions ret = MaskedAssign(mask, q, Mul(q, invRadius)); + return ret; +} + + +#if !defined(__SPU__) +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) +{ + return FromAxisAndAngles(axis, LoadGatherSIMD(angle0, angle1, angle2, angle3)); +} + +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const fltx4& angles) +{ + // compute the half theta + fltx4 theta = MulSIMD(angles, Four_PointFives); + // compute the sine and cosine of each angle simultaneously + fltx4 vsines; fltx4 vcoses; + SinCosSIMD(vsines, vcoses, theta); + // now the sines and coses vectors contain the results for four angles. 
+ // for each of the angles, splat them out and then swizzle together so + // as to get a < cos, sin, sin, sin > coefficient vector + + x = MulSIMD(vsines, SplatXSIMD(axis)); // sin(t0) * x, sin(t1) * x, etc + y = MulSIMD(vsines, SplatYSIMD(axis)); + z = MulSIMD(vsines, SplatZSIMD(axis)); + w = vcoses; + + + return *this; +} +#endif + + +/// this = this * q; +FORCEINLINE FourQuaternions FourQuaternions::Mul(FourQuaternions const& q) const +{ + // W = w1w2 - x1x2 - y1y2 - z1z2 + FourQuaternions ret; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // as we do the multiplication, also do a dot product, so we know whether + // one of the quats is backwards and if we therefore have to negate at the end + fltx4 dotProduct = MulSIMD(w, q.w); + + ret.w = MulSIMD(w, q.w); // W = w1w2 + ret.x = MulSIMD(w, q.x); // X = w1x2 + ret.y = MulSIMD(w, q.y); // Y = w1y2 + ret.z = MulSIMD(w, q.z); // Z = w1z2 + + dotProduct = MaddSIMD(x, q.x, dotProduct); + ret.w = MsubSIMD(x, q.x, ret.w); // W = w1w2 - x1x2 + ret.x = MaddSIMD(x, q.w, ret.x); // X = w1x2 + x1w2 + ret.y = MsubSIMD(x, q.z, ret.y); // Y = w1y2 - x1z2 + ret.z = MaddSIMD(x, q.y, ret.z); // Z = w1z2 + x1y2 + + dotProduct = MaddSIMD(y, q.y, dotProduct); + ret.w = MsubSIMD(y, q.y, ret.w); // W = w1w2 - x1x2 - y1y2 + ret.x = MaddSIMD(y, q.z, ret.x); // X = w1x2 + x1w2 + y1z2 + ret.y = MaddSIMD(y, q.w, ret.y); // Y = w1y2 - x1z2 + y1w2 + ret.z = MsubSIMD(y, q.x, ret.z); // Z = w1z2 + x1y2 - y1x2 + + dotProduct = MaddSIMD(z, q.z, dotProduct); + ret.w = MsubSIMD(z, q.z, ret.w); // W = w1w2 - x1x2 - y1y2 - z1z2 + ret.x = MsubSIMD(z, q.y, ret.x); // X = w1x2 + x1w2 + y1z2 - z1y2 + ret.y = MaddSIMD(z, q.x, ret.y); // Y = w1y2 - x1z2 + y1w2 + z1x2 + ret.z = MaddSIMD(z, q.w, ret.z); // Z = w1z2 + x1y2 - y1x2 + z1w2 + + fltx4 Zero = Four_Zeros; + bi32x4 control = CmpLtSIMD(dotProduct, Four_Zeros); + signMask = MaskedAssign(control, signMask, Zero); // negate quats where q1.q2 < 0 + ret.w = XorSIMD(signMask, ret.w); + 
ret.x = XorSIMD(signMask, ret.x); + ret.y = XorSIMD(signMask, ret.y); + ret.z = XorSIMD(signMask, ret.z); + + return ret; +} + + +FORCEINLINE void FourQuaternions::RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT +{ + fltx4 tmpX, tmpY, tmpZ, tmpW; + fltx4 outX, outY, outZ; + + tmpX = SubSIMD(MaddSIMD(w, vecs->x, MulSIMD(y, vecs->z)), + MulSIMD(z, vecs->y)); + + tmpY = SubSIMD(MaddSIMD(w, vecs->y, MulSIMD(z, vecs->x)), + MulSIMD(x, vecs->z)); + + tmpZ = SubSIMD(MaddSIMD(w, vecs->z, MulSIMD(x, vecs->y)), + MulSIMD(y, vecs->x)); + + tmpW = AddSIMD(MaddSIMD(x, vecs->x, MulSIMD(y, vecs->y)), + MulSIMD(z, vecs->z)); + + + outX = AddSIMD(SubSIMD(MaddSIMD(tmpW, x, MulSIMD(tmpX, w)), + MulSIMD(tmpY, z)), + MulSIMD(tmpZ, y)); + + outY = AddSIMD(SubSIMD(MaddSIMD(tmpW, y, MulSIMD(tmpY, w)), + MulSIMD(tmpZ, x)), + MulSIMD(tmpX, z)); + + outZ = AddSIMD(SubSIMD(MaddSIMD(tmpW, z, MulSIMD(tmpZ, w)), + MulSIMD(tmpX, y)), + MulSIMD(tmpY, x)); + + // although apparently redundant, assigning the results to intermediate local variables + // seems to improve code scheduling slightly in SN. + vecs->x = outX; + vecs->y = outY; + vecs->z = outZ; +} + + +/* + +void QuaternionScale( const Quaternion &p, float t, Quaternion &q ) +{ + Assert( s_bMathlibInitialized ); + + + float r; + + // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to + // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. 
+ float sinom = sqrt( DotProduct( &p.x, &p.x ) ); + sinom = min( sinom, 1.f ); + + float sinsom = sin( asin( sinom ) * t ); + + t = sinsom / (sinom + FLT_EPSILON); + VectorScale( &p.x, t, &q.x ); + + // rescale rotation + r = 1.0f - sinsom * sinsom; + + // Assert( r >= 0 ); + if (r < 0.0f) + r = 0.0f; + r = sqrt( r ); + + // keep sign of rotation + if (p.w < 0) + q.w = -r; + else + q.w = r; + + Assert( q.IsValid() ); + + return; +} + +*/ + +FORCEINLINE FourQuaternions FourQuaternions::ScaleAngle(const fltx4& scale) const +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + const fltx4 Zero = Four_Zeros; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // work out if there are any tiny scales or angles, which are unstable + bi32x4 tinyAngles = CmpGtSIMD(w, OneMinusEpsilon); + bi32x4 negativeRotations = CmpLtSIMD(w, Zero); // if any w's are <0, we will need to negate later down + + // figure out the theta + fltx4 angles = ArcCosSIMD(w); + + // test also if w > -1 + fltx4 negativeWs = XorSIMD(signMask, w); + tinyAngles = OrSIMD(CmpGtSIMD(negativeWs, OneMinusEpsilon), tinyAngles); + + // meanwhile start working on computing the dot product of the + // vector component, and trust in the scheduler to interleave them + fltx4 vLenSq = MulSIMD(x, x); + vLenSq = MaddSIMD(y, y, vLenSq); + vLenSq = MaddSIMD(z, z, vLenSq); + + // scale the angles + angles = MulSIMD(angles, scale); + + // clear out the sign mask where w>=0 + signMask = MaskedAssign(negativeRotations, signMask, Zero); + + // work out the new w component and vector length + fltx4 vLenRecip = ReciprocalSqrtSIMD(vLenSq); // interleave with Cos to hide latencies + fltx4 sine; + SinCosSIMD(sine, ret.w, angles); + ret.x = MulSIMD(x, vLenRecip); // renormalize so the vector length + w = 1 + ret.y = MulSIMD(y, vLenRecip); // renormalize so the vector length + w = 1 + ret.z = MulSIMD(z, vLenRecip); // renormalize so the 
vector length + w = 1 + ret.x = MulSIMD(ret.x, sine); + ret.y = MulSIMD(ret.y, sine); + ret.z = MulSIMD(ret.z, sine); + + // negate where necessary + ret.x = XorSIMD(ret.x, signMask); + ret.y = XorSIMD(ret.y, signMask); + ret.z = XorSIMD(ret.z, signMask); + ret.w = XorSIMD(ret.w, signMask); + + // finally, toss results from where cos(theta) is close to 1 -- these are non rotations. + ret.x = MaskedAssign(tinyAngles, x, ret.x); + ret.y = MaskedAssign(tinyAngles, y, ret.y); + ret.z = MaskedAssign(tinyAngles, z, ret.z); + ret.w = MaskedAssign(tinyAngles, w, ret.w); + + return ret; +} + +//----------------------------------------------------------------------------- +// Purpose: return = this * ( s * q ) +// In other words, for a quaternion representing a rotation of angle theta, return +// one of angle s*theta +// s is four floats in a fltx4 -- one for each quaternion +//----------------------------------------------------------------------------- + +FORCEINLINE FourQuaternions FourQuaternions::MulAc(const fltx4& s, const FourQuaternions& q) const +{ + /* + void QuaternionMA( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt ) + { + Quaternion p1, q1; + + QuaternionScale( q, s, q1 ); + QuaternionMult( p, q1, p1 ); + QuaternionNormalize( p1 ); + qt[0] = p1[0]; + qt[1] = p1[1]; + qt[2] = p1[2]; + qt[3] = p1[3]; + } + */ + + return Mul(q.ScaleAngle(s)); +} + + +FORCEINLINE FourQuaternions FourQuaternions::ScaleMul(const fltx4& s, const FourQuaternions& q) const +{ + return ScaleAngle(s).Mul(q); +} + + +FORCEINLINE FourQuaternions FourQuaternions::Slerp(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. 
it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. + fltx4 cosineOmega; +#if 0 // Maybe I don't need to do alignment seperately, using the xb360 technique... + FourQuaternions to; + { + fltx4 diffs[4], sums[4], originalToNeg[4]; + fltx4 dotIfAligned, dotIfNotAligned; + + // compute negations of the TO quaternion. + originalToNeg[0] = NegSIMD(originalto.x); + originalToNeg[1] = NegSIMD(originalto.y); + originalToNeg[2] = NegSIMD(originalto.z); + originalToNeg[3] = NegSIMD(originalto.w); + + dotIfAligned = MulSIMD(x, originalto.x); + dotIfNotAligned = MulSIMD(x, originalToNeg[0]); + + diffs[0] = SubSIMD(x, originalto.x); + diffs[1] = SubSIMD(y, originalto.y); + diffs[2] = SubSIMD(z, originalto.z); + diffs[3] = SubSIMD(w, originalto.w); + + sums[0] = AddSIMD(x, originalto.x); + sums[1] = AddSIMD(y, originalto.y); + sums[2] = AddSIMD(z, originalto.z); + sums[3] = AddSIMD(w, originalto.w); + + dotIfAligned = MaddSIMD(y, originalto.y, dotIfAligned); + dotIfNotAligned = MaddSIMD(y, originalToNeg[1], dotIfNotAligned); + + fltx4 diffsDot, sumsDot; + + diffsDot = MulSIMD(diffs[0], diffs[0]); // x^2 + sumsDot = MulSIMD(sums[0], sums[0]); // x^2 + // do some work on the dot products while letting the multiplies cook + dotIfAligned = MaddSIMD(z, originalto.z, dotIfAligned); + dotIfNotAligned = MaddSIMD(z, originalToNeg[2], dotIfNotAligned); + + diffsDot = MaddSIMD(diffs[1], diffs[1], diffsDot); // x^2 + y^2 + sumsDot = MaddSIMD(sums[1], sums[1], sumsDot); + diffsDot = MaddSIMD(diffs[2], diffs[2], diffsDot); // x^2 + y^2 + z^2 + sumsDot = MaddSIMD(sums[2], sums[2], sumsDot); + diffsDot = MaddSIMD(diffs[3], diffs[3], diffsDot); // x^2 + y^2 + z^2 + w^2 + sumsDot = MaddSIMD(sums[3], sums[3], sumsDot); + // do some work on the dot products while letting the multiplies cook 
+ dotIfAligned = MaddSIMD(w, originalto.w, dotIfAligned); + dotIfNotAligned = MaddSIMD(w, originalToNeg[3], dotIfNotAligned); + + // are the differences greater than the sums? + // if so, we need to negate that quaternion + fltx4 mask = CmpGtSIMD(diffsDot, sumsDot); // 1 for diffs>0 and 0 elsewhere + to.x = MaskedAssign(mask, originalToNeg[0], originalto.x); + to.y = MaskedAssign(mask, originalToNeg[1], originalto.y); + to.z = MaskedAssign(mask, originalToNeg[2], originalto.z); + to.w = MaskedAssign(mask, originalToNeg[3], originalto.w); + + cosineOmega = MaskedAssign(mask, dotIfNotAligned, dotIfAligned); + } + + // right, now to is aligned to be the short way round, and we computed + // the dot product while we were figuring all that out. +#else + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); +#endif + + fltx4 Zero = Four_Zeros; + bi32x4 cosOmegaLessThanZero = CmpLtSIMD(cosineOmega, Zero); + // fltx4 shouldNegate = MaskedAssign(cosOmegaLessThanZero, Four_NegativeOnes , Four_Ones ); + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); // contains a one in the sign bit -- xor against a number to negate it + fltx4 sinOmega = Four_Ones; + + // negate cosineOmega where necessary + cosineOmega = MaskedAssign(cosOmegaLessThanZero, XorSIMD(cosineOmega, signMask), cosineOmega); + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... 
+ sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + signMask = MaskedAssign(cosOmegaLessThanZero, signMask, Zero); + + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // depending on whether the dot product was less than zero, negate beta, or not + beta = XorSIMD(beta, signMask); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + + + +FORCEINLINE FourQuaternions FourQuaternions::SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. 
+ fltx4 cosineOmega; + + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); + + fltx4 sinOmega = Four_Ones; + + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... + sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + +/***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function: +inline void FourVectors::RotateBy( const FourQuaternions &quats ) +{ + quats.RotateFourVectors( this ); +} +*/ + + #endif // SSEQUATMATH_H + diff --git a/r5dev/mathlib/transform.cpp 
b/r5dev/mathlib/transform.cpp new file mode 100644 index 00000000..8be2e83c --- /dev/null +++ b/r5dev/mathlib/transform.cpp @@ -0,0 +1,179 @@ +//==== Copyright (c) 1996-2011, Valve Corporation, All rights reserved. =====// +// +// Purpose: +// +// $NoKeywords: $ +// +//===========================================================================// + +#include "core/stdafx.h" +#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB) + +#include "mathlib/transform.h" +#include "mathlib/mathlib.h" + +// memdbgon must be the last include file in a .cpp file!!! +//#include "tier0/memdbgon.h" + +const CTransform g_TransformIdentity(Vector3D(0.0f, 0.0f, 0.0f), Quaternion(0.0f, 0.0f, 0.0f, 1.0f)); + +void SetIdentityTransform(CTransform& out) +{ + out.m_vPosition = vec3_origin; + out.m_orientation = quat_identity; +} + +void ConcatTransforms(const CTransform& in1, const CTransform& in2, CTransform& out) +{ + // Store in temp to avoid problems if out == in1 or out == in2 + CTransform result; + QuaternionMult(in1.m_orientation, in2.m_orientation, result.m_orientation); + QuaternionMultiply(in1.m_orientation, in2.m_vPosition, result.m_vPosition); + result.m_vPosition += in1.m_vPosition; + out = result; +} + +void VectorIRotate(const Vector3D& v, const CTransform& t, Vector3D& out) +{ + // FIXME: Make work directly with the transform + matrix3x4_t m; + TransformMatrix(t, m); + VectorIRotate(v, m, out); +} + +void VectorITransform(const Vector3D& v, const CTransform& t, Vector3D& out) +{ + // FIXME: Make work directly with the transform + matrix3x4_t m; + TransformMatrix(t, m); + VectorITransform(v, m, out); +} + +void TransformSlerp(const CTransform& p, const CTransform& q, float t, CTransform& qt) +{ + QuaternionSlerp(p.m_orientation, q.m_orientation, t, qt.m_orientation); + VectorLerp(p.m_vPosition, q.m_vPosition, t, qt.m_vPosition); +} + +void TransformLerp(const CTransform& p, const CTransform& q, float t, CTransform& qt) +{ + QuaternionBlend(p.m_orientation, q.m_orientation, t, 
qt.m_orientation); + VectorLerp(p.m_vPosition, q.m_vPosition, t, qt.m_vPosition); +} + +void TransformMatrix(const CTransform& in, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, out); +} + +void TransformMatrix(const CTransformUnaligned& in, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, out); +} + +void TransformMatrix(const CTransform& in, const Vector3D& vScaleIn, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, vScaleIn, out); +} + +void MatrixTransform(const matrix3x4_t& in, CTransformUnaligned& out) +{ + MatrixQuaternion(in, out.m_orientation); + MatrixGetColumn(in, ORIGIN, out.m_vPosition); +} + +void MatrixTransform(const matrix3x4_t& in, CTransform& out) +{ + MatrixQuaternion(in, out.m_orientation); + MatrixGetColumn(in, ORIGIN, out.m_vPosition); +} + +void MatrixTransform(const matrix3x4_t& in, CTransform& out, Vector3D& vScaleOut) +{ + matrix3x4_t norm; + vScaleOut = MatrixNormalize(in, norm); + MatrixTransform(norm, out); +} + +void AngleTransform(const QAngle& angles, const Vector3D& origin, CTransform& out) +{ + AngleQuaternion(angles, out.m_orientation); + out.m_vPosition = origin; +} + +void TransformInvert(const CTransform& in, CTransform& out) +{ + QuaternionInvert(in.m_orientation, out.m_orientation); + QuaternionMultiply(out.m_orientation, in.m_vPosition, out.m_vPosition); + out.m_vPosition *= -1.0f; +} + +void AxisAngleTransform(const Vector3D& vecAxis, float flAngleDegrees, CTransform& out) +{ + AxisAngleQuaternion(vecAxis, flAngleDegrees, out.m_orientation); + out.m_vPosition = vec3_origin; +} + +void TransformVectorsFLU(const CTransform& in, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + QuaternionVectorsFLU(in.m_orientation, pForward, pLeft, pUp); +} + +void TransformVectorsForward(const CTransform& in, Vector3D* pForward) +{ + QuaternionVectorsForward(in.m_orientation, pForward); +} + +bool TransformsAreEqual(const CTransform& src1, const CTransform& 
src2, float flPosTolerance, float flRotTolerance) +{ + if (!VectorsAreEqual(src1.m_vPosition, src2.m_vPosition, flPosTolerance)) + return false; + return QuaternionsAreEqual(src1.m_orientation, src2.m_orientation, flRotTolerance); +} + +// FIXME: optimize this with simd goodness +void TransformToWorldSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms) +{ +#ifdef _DEBUG + for (int i = 0; i < nRootTransformCount; ++i) + { + Assert(pParentIndices[i] < 0); + } +#endif + + for (int i = nRootTransformCount; i < nTransformCount; ++i) + { + int nParentBone = pParentIndices[i]; + Assert(nParentBone >= 0 && nParentBone < i); + ConcatTransforms(pTransforms[nParentBone], pTransforms[i], pTransforms[i]); + } +} + +// FIXME: optimize this with simd goodness +void TransformToParentSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms) +{ +#ifdef _DEBUG + for (int i = 0; i < nRootTransformCount; ++i) + { + Assert(pParentIndices[i] < 0); + } +#endif + + bool* pComputedParentTransform = (bool*)stackalloc(nTransformCount * sizeof(bool)); + memset(pComputedParentTransform, 0, nTransformCount * sizeof(bool)); + CTransform* pWorldToParentTransforms = (CTransform*)stackalloc(nTransformCount * sizeof(CTransform)); + + for (int b = nTransformCount; --b >= nRootTransformCount; ) + { + int nParentBone = pParentIndices[b]; + if (!pComputedParentTransform[nParentBone]) + { + TransformInvert(pTransforms[nParentBone], pWorldToParentTransforms[nParentBone]); + pComputedParentTransform[nParentBone] = true; + } + ConcatTransforms(pWorldToParentTransforms[nParentBone], pTransforms[b], pTransforms[b]); + } +} + +#endif // !_STATIC_LINKED || _SHARED_LIB + diff --git a/r5dev/mathlib/transform.h b/r5dev/mathlib/transform.h new file mode 100644 index 00000000..d2cb9b03 --- /dev/null +++ b/r5dev/mathlib/transform.h @@ -0,0 +1,401 @@ +//====== Copyright 1996-2005, Valve Corporation, All rights reserved. 
=======// +// +// Purpose: +// +// $NoKeywords: $ +// +//===========================================================================// + +#ifndef TRANSFORM_H +#define TRANSFORM_H + +#ifdef COMPILER_MSVC +#pragma once +#endif + +//#include "tier0/memalloc.h" +#include "mathlib/vector.h" +#include "mathlib/mathlib.h" + +//----------------------------------------------------------------------------- +// Matrix 3x4_t +//----------------------------------------------------------------------------- +class CTransformUnaligned; + + +//----------------------------------------------------------------------------- +// Represents a position + orientation using quaternions +//----------------------------------------------------------------------------- +class ALIGN16 CTransform +{ +public: + CTransform() {} + CTransform(const Vector3D& v, const Quaternion& q) : m_vPosition(v), m_orientation(q) {} + CTransform(const Vector3D& v, const QAngle& a) : m_vPosition(v) + { + AngleQuaternion(a, m_orientation); + } + + VectorAligned m_vPosition; + QuaternionAligned m_orientation; + + bool IsValid() const + { + return m_vPosition.IsValid() && m_orientation.IsValid(); + } + + bool operator==(const CTransform& v) const; ///< exact equality check + bool operator!=(const CTransform& v) const; + + // for API compatibility with matrix3x4_t + inline void InitFromQAngles(const QAngle& angles, const Vector3D& vPosition = vec3_origin); + inline void InitFromMatrix(const matrix3x4_t& transform); + inline void InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition = vec3_origin); + + inline Quaternion ToQuaternion() const; + inline QAngle ToQAngle() const; + inline matrix3x4_t ToMatrix() const; + + inline void SetToIdentity(); + + inline void SetOrigin(Vector3D const& vPos) { m_vPosition = vPos; } + inline void SetAngles(QAngle const& vAngles); + inline Vector3D GetOrigin(void) const { return m_vPosition; } + + inline void GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, 
Vector3D* pUp) const; + inline Vector3D GetForward() const; + inline Vector3D TransformVector(const Vector3D& v0) const; + inline Vector3D RotateVector(const Vector3D& v0) const; + inline Vector3D TransformVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateExtents(const Vector3D& vBoxExtents) const; // these are extents and must remain positive/symmetric after rotation + inline void TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + //inline void TransformPlane( const cplane_t &inPlane, cplane_t &outPlane ) const; + //inline void InverseTransformPlane( const cplane_t &inPlane, cplane_t &outPlane ) const; + + /// Computes an inverse. 
Uses the 'TR' naming to be consistent with the same method in matrix3x4_t (which only works with orthonormal matrices) + inline void InverseTR(CTransform& out) const; + +public: + CTransform& operator=(const CTransformUnaligned& i); +} ALIGN16_POST; + + +extern const CTransform g_TransformIdentity; + + +//----------------------------------------------------------------------------- +// Represents an unaligned position + orientation using quaternions, +// used only for copying data around +//----------------------------------------------------------------------------- +class CTransformUnaligned +{ +public: + CTransformUnaligned() {} + CTransformUnaligned(const Vector3D& v, const Quaternion& q) : m_vPosition(v), m_orientation(q) {} + CTransformUnaligned(const CTransform& transform) : m_vPosition(transform.m_vPosition), m_orientation(transform.m_orientation) {} + CTransform AsTransform() const { return CTransform(m_vPosition, m_orientation); } + + Vector3D m_vPosition; + Quaternion m_orientation; + + bool IsValid() const + { + return m_vPosition.IsValid() && m_orientation.IsValid(); + } + +public: + CTransformUnaligned& operator=(const CTransform& i); +}; + + +//----------------------------------------------------------------------------- +// Inline methods +//----------------------------------------------------------------------------- +inline CTransform& CTransform::operator=(const CTransformUnaligned& i) +{ + m_vPosition = i.m_vPosition; + m_orientation = i.m_orientation; + return *this; +} + +inline CTransformUnaligned& CTransformUnaligned::operator=(const CTransform& i) +{ + m_vPosition = i.m_vPosition; + m_orientation = i.m_orientation; + return *this; +} + + +//----------------------------------------------------------------------------- +// Other methods +//----------------------------------------------------------------------------- +void ConcatTransforms(const CTransform& in1, const CTransform& in2, CTransform& out); +void TransformSlerp(const CTransform& p, 
const CTransform& q, float t, CTransform& qt); +void TransformLerp(const CTransform& p, const CTransform& q, float t, CTransform& qt); +void TransformMatrix(const CTransform& in, matrix3x4_t& out); +void TransformMatrix(const CTransform& in, const Vector3D& vScaleIn, matrix3x4_t& out); + +inline void TransformMatrix(const CTransform& in, float flScale, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, Vector3D(flScale, flScale, flScale), out); +} + +inline float TransformNormalize(CTransform& in) +{ + return QuaternionNormalize(in.m_orientation); +} + +void TransformMatrix(const CTransformUnaligned& in, matrix3x4_t& out); +void MatrixTransform(const matrix3x4_t& in, CTransform& out); +void MatrixTransform(const matrix3x4_t& in, CTransformUnaligned& out); +void MatrixTransform(const matrix3x4_t& in, CTransform& out, Vector3D& vScaleOut); + +inline void MatrixTransform(const matrix3x4_t& in, CTransform& out, float& flScale) +{ + Vector3D vScale; + MatrixTransform(in, out, vScale); + flScale = vScale.LargestComponentValue(); +} + +void AngleTransform(const QAngle& angles, const Vector3D& origin, CTransform& out); +void SetIdentityTransform(CTransform& out); +void TransformVectorsFLU(const CTransform& in, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); +void TransformVectorsForward(const CTransform& in, Vector3D* pForward); + +inline const CTransform GetIdentityTransform() +{ + CTransform out; + SetIdentityTransform(out); + return out; +} + +inline const CTransform MatrixTransform(const matrix3x4_t& in) +{ + CTransform out; + MatrixTransform(in, out); + return out; +} + +inline const matrix3x4_t TransformMatrix(const CTransform& in) +{ + matrix3x4_t out; + TransformMatrix(in, out); + return out; +} +inline const matrix3x4_t TransformMatrix(const CTransformUnaligned& in) +{ + matrix3x4_t out; + TransformMatrix(in, out); + return out; +} + +inline const CTransform ConcatTransforms(const CTransform& in1, const CTransform& in2) +{ + CTransform 
result; + ConcatTransforms(in1, in2, result); + return result; +} + + +void TransformInvert(const CTransform& in, CTransform& out); +void AxisAngleTransform(const Vector3D& vecAxis, float flAngleDegrees, CTransform& out); +void VectorIRotate(const Vector3D& v, const CTransform& t, Vector3D& out); +void VectorITransform(const Vector3D& v, const CTransform& t, Vector3D& out); + +inline Vector3D TransformPoint(const CTransformUnaligned& tm, const Vector3D& p) +{ + return Vector3D( + tm.m_vPosition.x + (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z, + tm.m_vPosition.y + (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z, + tm.m_vPosition.z + (2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z + ); +} + +// TODO: implement in SIMD? 
+inline Vector3D TransformPoint(const CTransform& tm, const Vector3D& p) +{ + return Vector3D( + tm.m_vPosition.x + (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z, + tm.m_vPosition.y + (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z, + tm.m_vPosition.z + (2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z + ); +} + + +template < class T > +inline void TransformPoint(const T& tm, const Vector3D& p, Vector3D& out) +{ + out.x = tm.m_vPosition.x + (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z; + out.y = tm.m_vPosition.y + (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z; + out.z = tm.m_vPosition.z + (2.0f * 
tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z; +} + +template < class T > +inline void RotatePoint(const T& tm, const Vector3D& p, Vector3D& out) +{ + out.x = (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z; + out.y = (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z; + out.z = (2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z; +} + + +inline const CTransform TransformInvert(const CTransform& in) +{ + CTransform out; + TransformInvert(in, out); + return out; +} + +// Transform equality test +bool TransformsAreEqual(const CTransform& src1, const CTransform& src2, float flPosTolerance = 1e-2, float flRotTolerance = 1e-1f); + +// Computes world-space transforms given local-space transforms + parent info +// The start of the pTransforms array (nRootTransformCount # of transforms) must be filled with +// the root transforms which have no parent. 
The end of the pTransforms array (nTransformCount # of transforms) +// must be filled with local-space transforms which are relative to other transforms, including possibly the +// root transforms. Therefore, (nRootTransformCount + nTransformCount) # of transforms must be passed into pTransforms. +// Only nTransformCount parent indices should be passed in. +// Parent indices are relative to the entire array, so a parent index of 0 indicates the first element +// of the array, which is always a root transform. -1 parent index is *illegal* +// Parent indices must always be sorted so that the index transforms earlier in the array. +// The transforms are modified in-place. +void TransformToWorldSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms); +void TransformToParentSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms); + + +inline void CTransform::InitFromQAngles(const QAngle& angles, const Vector3D& vPosition) +{ + AngleQuaternion(angles, m_orientation); + m_vPosition = vPosition; +} + +inline void CTransform::InitFromMatrix(const matrix3x4_t& transform) +{ + m_orientation = MatrixQuaternion(transform); + m_vPosition = transform.GetOrigin(); +} + +inline void CTransform::InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition) +{ + m_orientation = orientation; + m_vPosition = vPosition; +} + +inline void CTransform::SetAngles(QAngle const& vAngles) +{ + AngleQuaternion(vAngles, m_orientation); +} + +inline Quaternion CTransform::ToQuaternion() const +{ + return m_orientation; +} +inline QAngle CTransform::ToQAngle() const +{ + QAngle angles; + QuaternionAngles(m_orientation, angles); + return angles; +} + +inline matrix3x4_t CTransform::ToMatrix() const +{ + return TransformMatrix(*this); +} + +inline void CTransform::SetToIdentity() +{ + m_vPosition = vec3_origin; + m_orientation = quat_identity; +} + +inline void 
CTransform::GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) const +{ + TransformVectorsFLU(*this, pForward, pLeft, pUp); +} + +inline Vector3D CTransform::GetForward() const +{ + Vector3D vForward; + TransformVectorsForward(*this, &vForward); + return vForward; +} + +inline Vector3D CTransform::TransformVector(const Vector3D& v0) const +{ + return TransformPoint(*this, v0); +} + +inline Vector3D CTransform::RotateVector(const Vector3D& v0) const +{ + Vector3D vOut; + RotatePoint(*this, v0, vOut); + return vOut; +} + +inline Vector3D CTransform::TransformVectorByInverse(const Vector3D& v0) const +{ + Vector3D vOut; + VectorITransform(v0, *this, vOut); + return vOut; +} + +inline Vector3D CTransform::RotateVectorByInverse(const Vector3D& v0) const +{ + Vector3D vOut; + VectorIRotate(v0, *this, vOut); + return vOut; +} + +inline bool CTransform::operator==(const CTransform& t) const +{ + return t.m_vPosition == m_vPosition && t.m_orientation == m_orientation; +} + +inline bool CTransform::operator!=(const CTransform& t) const +{ + return t.m_vPosition != m_vPosition || t.m_orientation != m_orientation; +} + +// PERFORMANCE: No native versions of these but implement them on matrix for convenient access +inline void CTransform::TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().TransformAABB(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void CTransform::TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().TransformAABBByInverse(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void CTransform::RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().RotateAABB(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} +inline void CTransform::RotateAABBByInverse(const Vector3D& vecMinsIn, 
const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().RotateAABBByInverse(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void CTransform::InverseTR(CTransform& out) const +{ + matrix3x4_t xForm = ToMatrix(); + out = xForm.InverseTR().ToCTransform(); +} + + +// transform conversion operators on matrix3x4_t +inline void matrix3x4_t::InitFromCTransform(const CTransform& transform) +{ + TransformMatrix(transform, *this); +} +inline CTransform matrix3x4_t::ToCTransform() const +{ + return MatrixTransform(*this); +} + + +#endif // TRANSFORM \ No newline at end of file diff --git a/r5dev/mathlib/vector.h b/r5dev/mathlib/vector.h index 833c1c8d..8fca4b5b 100644 --- a/r5dev/mathlib/vector.h +++ b/r5dev/mathlib/vector.h @@ -8,7 +8,6 @@ #ifndef VECTOR_H #define VECTOR_H -#define NO_MALLOC_OVERRIDE #ifdef _WIN32 #pragma once @@ -23,7 +22,7 @@ #if defined( _PS3 ) //#include #include -#include "platform.h" +#include "tier0/platform.h" #include "mathlib/math_pfns.h" #endif @@ -36,16 +35,19 @@ #define ALIGN16_POST #endif +#define NO_MALLOC_OVERRIDE #if !defined(NO_MALLOC_OVERRIDE) #include "tier0/memalloc.h" #endif // !NO_MALLOC_OVERRIDE #include "tier0/dbg.h" #include "tier0/platform.h" +#if !defined( __SPU__ ) #include "tier0/threadtools.h" +#endif #include "mathlib/vector2d.h" #include "mathlib/math_pfns.h" -#include "mathlib/bits.h" #include "vstdlib/random.h" + // Uncomment this to add extra Asserts to check for NANs, uninitialized vecs, etc. //#define VECTOR_PARANOIA 1 @@ -92,6 +94,7 @@ public: // Got any nasty NAN's? bool IsValid() const; + bool IsReasonable(float range = 1000000) const; ///< Check for reasonably-sized values (if used as a game world position) void Invalidate(); // array access... 
@@ -157,13 +160,15 @@ public:
 	inline bool IsZeroFast() const RESTRICT
 	{
 		static_assert(sizeof(vec_t) == sizeof(int));
-		return (*(const int*)(&x) == 0 &&
-			*(const int*)(&y) == 0 &&
-			*(const int*)(&z) == 0);
+		return (*reinterpret_cast<const int*>(&x) == 0 &&
+			*reinterpret_cast<const int*>(&y) == 0 &&
+			*reinterpret_cast<const int*>(&z) == 0);
 	}
 
-	vec_t	NormalizeInPlace();
-	Vector3D	Normalized() const;
+	vec_t	NormalizeInPlace();                                ///< Normalize all components
+	vec_t	NormalizeInPlaceSafe(const Vector3D& vFallback);   ///< Normalize all components
+	Vector3D	Normalized() const;                            ///< Return normalized vector
+	Vector3D	NormalizedSafe(const Vector3D& vFallback)const; ///< Return normalized vector, falling back to vFallback if the length of this is 0
 	bool	IsLengthGreaterThan(float val) const;
 	bool	IsLengthLessThan(float val) const;
 
@@ -203,6 +208,9 @@ public:
 	// returns 0, 1, 2 corresponding to the component with the largest absolute value
 	inline int LargestComponent() const;
+	inline vec_t LargestComponentValue() const;
+	inline int SmallestComponent() const;
+	inline vec_t SmallestComponentValue() const;
 
 	// 2d
 	vec_t	Length2D(void) const;
@@ -243,7 +251,8 @@ private:
 #endif
 };
 
-
+// Zero the object -- necessary for CNetworkVar and possibly other cases.
+inline void EnsureValidValue(Vector3D& x) { x.Zero(); } #define USE_M64S defined( PLATFORM_WINDOWS_PC ) @@ -608,8 +617,14 @@ Vector3D RandomVector(vec_t minVal, vec_t maxVal); #endif float RandomVectorInUnitSphere(Vector3D* pVector); +Vector3D RandomVectorInUnitSphere(); +Vector3D RandomVectorInUnitSphere(IUniformRandomStream* pRnd); + float RandomVectorInUnitCircle(Vector2D* pVector); +Vector3D RandomVectorOnUnitSphere(); +Vector3D RandomVectorOnUnitSphere(IUniformRandomStream* pRnd); + //----------------------------------------------------------------------------- // @@ -666,6 +681,7 @@ inline void Vector3D::Init(vec_t ix, vec_t iy, vec_t iz) CHECK_VALID(*this); } +#if !defined(__SPU__) inline void Vector3D::Random(vec_t minVal, vec_t maxVal) { x = RandomFloat(minVal, maxVal); @@ -673,6 +689,7 @@ inline void Vector3D::Random(vec_t minVal, vec_t maxVal) z = RandomFloat(minVal, maxVal); CHECK_VALID(*this); } +#endif // This should really be a single opcode on the PowerPC (move r0 onto the vec reg) inline void Vector3D::Zero() @@ -749,6 +766,14 @@ inline bool Vector3D::IsValid() const return IsFinite(x) && IsFinite(y) && IsFinite(z); } +//----------------------------------------------------------------------------- +// IsReasonable? 
+//----------------------------------------------------------------------------- +inline bool Vector3D::IsReasonable(float range) const +{ + return (Length() < range); +} + //----------------------------------------------------------------------------- // Invalidate //----------------------------------------------------------------------------- @@ -1290,9 +1315,10 @@ inline Vector3D VectorLerp(const Vector3D& src1, const Vector3D& src2, vec_t t) //----------------------------------------------------------------------------- // Temporary storage for vector results so const Vector& results can be returned //----------------------------------------------------------------------------- -/*inline Vector& AllocTempVector() +#if !defined(__SPU__) +inline Vector3D& AllocTempVector() { - static Vector s_vecTemp[128]; + static Vector3D s_vecTemp[128]; static CInterlockedInt s_nIndex; int nIndex; @@ -1307,9 +1333,9 @@ inline Vector3D VectorLerp(const Vector3D& src1, const Vector3D& src2, vec_t t) } ThreadPause(); } - return s_vecTemp[nIndex & 0xffff]; -}*/ - + return s_vecTemp[nIndex]; +} +#endif //----------------------------------------------------------------------------- @@ -1345,6 +1371,40 @@ inline int Vector3D::LargestComponent() const return Z_INDEX; } +inline int Vector3D::SmallestComponent() const +{ + float flAbsx = fabs(x); + float flAbsy = fabs(y); + float flAbsz = fabs(z); + if (flAbsx < flAbsy) + { + if (flAbsx < flAbsz) + return X_INDEX; + return Z_INDEX; + } + if (flAbsy < flAbsz) + return Y_INDEX; + return Z_INDEX; +} + + +inline float Vector3D::LargestComponentValue() const +{ + float flAbsX = fabs(x); + float flAbsY = fabs(y); + float flAbsZ = fabs(z); + return MAX(MAX(flAbsX, flAbsY), flAbsZ); +} + +inline float Vector3D::SmallestComponentValue() const +{ + float flAbsX = fabs(x); + float flAbsY = fabs(y); + float flAbsZ = fabs(z); + return MIN(MIN(flAbsX, flAbsY), flAbsZ); +} + + inline void CrossProduct(const Vector3D& a, const Vector3D& b, Vector3D& 
result) { CHECK_VALID(a); @@ -1390,9 +1450,9 @@ inline vec_t Vector3D::Length(void) const // Normalization //----------------------------------------------------------------------------- - +/* // FIXME: Can't use until we're un-macroed in mathlib.h -inline vec_t VectorNormalize( Vector3D& v ) +inline vec_t VectorNormalize( Vector& v ) { Assert( v.IsValid() ); vec_t l = v.Length(); @@ -1408,7 +1468,7 @@ inline vec_t VectorNormalize( Vector3D& v ) } return l; } - +*/ // check a point against a box @@ -1432,6 +1492,35 @@ inline vec_t Vector3D::DistTo(const Vector3D& vOther) const } +//----------------------------------------------------------------------------- +// Float equality with tolerance +//----------------------------------------------------------------------------- +inline bool FloatsAreEqual(float f1, float f2, float flTolerance) +{ + // Sergiy: the implementation in Source2 is very inefficient, trying to start with a clean slate here, hopefully will reintegrate back to Source2 + const float flAbsToleranceThreshold = 0.000003814697265625; // 2 ^ -FLOAT_EQUALITY_NOISE_CUTOFF, + return fabsf(f1 - f2) <= flTolerance * (fabsf(f1) + fabsf(f2)) + flAbsToleranceThreshold; +} + + +//----------------------------------------------------------------------------- +// Vector equality with percentage tolerance +// are all components within flPercentageTolerance (expressed as a percentage of the larger component, per component)? 
+// and all components have the same sign +//----------------------------------------------------------------------------- +inline bool VectorsAreWithinPercentageTolerance(const Vector3D& src1, const Vector3D& src2, float flPercentageTolerance) +{ + if (!FloatsAreEqual(src1.x, src2.x, flPercentageTolerance)) + return false; + + if (!FloatsAreEqual(src1.y, src2.y, flPercentageTolerance)) + return false; + + return (FloatsAreEqual(src1.z, src2.z, flPercentageTolerance)); +} + + + //----------------------------------------------------------------------------- // Vector equality with tolerance //----------------------------------------------------------------------------- @@ -1475,6 +1564,11 @@ inline void VectorAbs(const Vector3D& src, Vector3D& dst) dst.z = FloatMakePositive(src.z); } +inline Vector3D VectorAbs(const Vector3D& src) +{ + return Vector3D(fabsf(src.x), fabsf(src.y), fabsf(src.z)); +} + //----------------------------------------------------------------------------- // @@ -1620,6 +1714,7 @@ inline float ComputeVolume(const Vector3D& vecMins, const Vector3D& vecMaxs) return DotProduct(vecDelta, vecDelta); } +#if !defined(__SPU__) // Get a random vector. 
inline Vector3D RandomVector(float minVal, float maxVal) { @@ -1627,6 +1722,7 @@ inline Vector3D RandomVector(float minVal, float maxVal) random.Random(minVal, maxVal); return random; } +#endif #endif //slow @@ -1668,6 +1764,13 @@ inline bool operator!=(const Vector3D& v, float const* f) // you won't get an "u void VectorPerpendicularToVector(Vector3D const& in, Vector3D* pvecOut); +inline const Vector3D VectorPerpendicularToVector(const Vector3D& in) +{ + Vector3D out; + VectorPerpendicularToVector(in, &out); + return out; +} + //----------------------------------------------------------------------------- // AngularImpulse //----------------------------------------------------------------------------- @@ -1676,12 +1779,14 @@ typedef Vector3D AngularImpulse; #ifndef VECTOR_NO_SLOW_OPERATIONS +#if !defined(__SPU__) inline AngularImpulse RandomAngularImpulse(float minVal, float maxVal) { AngularImpulse angImp; angImp.Random(minVal, maxVal); return angImp; } +#endif #endif @@ -1691,6 +1796,8 @@ inline AngularImpulse RandomAngularImpulse(float minVal, float maxVal) //----------------------------------------------------------------------------- class RadianEuler; +class DegreeEuler; +class QAngle; class Quaternion // same data-layout as engine's vec4_t, { // which is a vec_t[4] @@ -1705,9 +1812,11 @@ public: #endif } inline Quaternion(vec_t ix, vec_t iy, vec_t iz, vec_t iw) : x(ix), y(iy), z(iz), w(iw) { } - inline Quaternion(RadianEuler const& angle); // evil auto type promotion!!! 
+ inline explicit Quaternion(RadianEuler const& angle); + inline explicit Quaternion(DegreeEuler const& angle); inline void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f, vec_t iw = 0.0f) { x = ix; y = iy; z = iz; w = iw; } + inline void Init(const Vector3D& vImaginaryPart, float flRealPart) { x = vImaginaryPart.x; y = vImaginaryPart.y; z = vImaginaryPart.z; w = flRealPart; } bool IsValid() const; void Invalidate(); @@ -1717,19 +1826,47 @@ public: inline Quaternion Conjugate() const { return Quaternion(-x, -y, -z, w); } + // + const Vector3D GetForward()const; + const Vector3D GetLeft()const; + const Vector3D GetUp()const; + vec_t* Base() { return (vec_t*)this; } const vec_t* Base() const { return (vec_t*)this; } // convenience for debugging inline void Print() const; + // Imaginary part + Vector3D& ImaginaryPart() { return *(Vector3D*)this; } + const Vector3D& ImaginaryPart() const { return *(Vector3D*)this; } + float& RealPart() { return w; } + float RealPart() const { return w; } + inline QAngle ToQAngle() const; + inline struct matrix3x4_t ToMatrix() const; + // array access... vec_t operator[](int i) const; vec_t& operator[](int i); + inline Quaternion operator+(void) const { return *this; } + inline Quaternion operator-(void) const { return Quaternion(-x, -y, -z, -w); } + vec_t x, y, z, w; }; +// Random Quaternion that is UNIFORMLY distributed over the S^3 +// should be good for random generation of orientation for unit tests and for game +// NOTE: Nothing trivial like Quaternion(RandomAngle(0,180)) will do the trick , +// one needs to take special care to generate a uniformly distributed quaternion. 
+const Quaternion RandomQuaternion();
+const Quaternion RandomQuaternion( IUniformRandomStream *pRnd );
+inline const Quaternion Conjugate(const Quaternion& q)
+{
+	return Quaternion(-q.x, -q.y, -q.z, q.w);
+}
+
+
 //-----------------------------------------------------------------------------
 // Array access
 
@@ -1767,10 +1904,45 @@ inline bool Quaternion::operator!=(const Quaternion& src) const
 void Quaternion::Print() const
 {
 #ifndef _CERT
+#if !defined(__SPU__)
 	DevMsg(eDLL_T::ENGINE, "q{ %.3fi + %.3fj + %.3fk + %.3f }", x, y, z, w);
 #endif
+#endif
 }
+
+
+
+//-----------------------------------------------------------------------------
+// Binary operators
+//-----------------------------------------------------------------------------
+inline Quaternion operator+(const Quaternion& q1, const Quaternion& q2)
+{
+	return Quaternion(q1.x + q2.x, q1.y + q2.y, q1.z + q2.z, q1.w + q2.w);
+}
+
+inline Quaternion operator-(const Quaternion& q1, const Quaternion& q2)
+{
+	return Quaternion(q1.x - q2.x, q1.y - q2.y, q1.z - q2.z, q1.w - q2.w);
+}
+
+inline Quaternion operator*(float s, const Quaternion& q)
+{
+	return Quaternion(s * q.x, s * q.y, s * q.z, s * q.w);
+}
+
+inline Quaternion operator*(const Quaternion& q, float s)
+{
+	return Quaternion(q.x * s, q.y * s, q.z * s, q.w * s);
+}
+
+inline Quaternion operator/(const Quaternion& q, float s)
+{
+	Assert(s != 0.0f);
+	return Quaternion(q.x / s, q.y / s, q.z / s, q.w / s);
+}
+
+
 //-----------------------------------------------------------------------------
 // Quaternion equality with tolerance
 //-----------------------------------------------------------------------------
@@ -1898,17 +2070,35 @@ public:
 #endif
 } ALIGN16_POST;
+
+//-----------------------------------------------------------------------------
+// Src data hasn't changed, but work data is of a form more friendly for SPU
+//-----------------------------------------------------------------------------
+#if defined( _PS3 )
+//typedef Vector BoneVector;
+typedef VectorAligned
BoneVector;
+typedef QuaternionAligned BoneQuaternion;
+typedef QuaternionAligned BoneQuaternionAligned;
+#else
+typedef Vector3D BoneVector;
+typedef Quaternion BoneQuaternion;
+typedef QuaternionAligned BoneQuaternionAligned;
+#endif
+
 //-----------------------------------------------------------------------------
 // Radian Euler angle aligned to axis (NOT ROLL/PITCH/YAW)
 //-----------------------------------------------------------------------------
 class QAngle;
+#define VEC_DEG2RAD( a ) ( (a) * (3.14159265358979323846f / 180.0f) )
+#define VEC_RAD2DEG( a ) ( (a) * (180.0f / 3.14159265358979323846f) )
 class RadianEuler
 {
 public:
 	inline RadianEuler(void) { }
 	inline RadianEuler(vec_t X, vec_t Y, vec_t Z) { x = X; y = Y; z = Z; }
-	inline RadianEuler(Quaternion const& q);	// evil auto type promotion!!!
-	inline RadianEuler(QAngle const& angles);	// evil auto type promotion!!!
+	inline explicit RadianEuler(Quaternion const& q);
+	inline explicit RadianEuler(QAngle const& angles);
+	inline explicit RadianEuler(DegreeEuler const& angles);
 
 	// Initialization
 	inline void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f) { x = ix; y = iy; z = iz; }
@@ -1941,6 +2131,18 @@ inline bool Quaternion::IsValid() const
 	return IsFinite(x) && IsFinite(y) && IsFinite(z) && IsFinite(w);
 }
 
+
+FORCEINLINE float QuaternionLength(const Quaternion& q)
+{
+	return sqrtf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w);
+}
+
+FORCEINLINE bool QuaternionIsNormalized(const Quaternion& q, float flTolerance = 1e-6f)
+{
+	float flLen = QuaternionLength(q);
+	return (fabsf(flLen - 1.0f) < flTolerance);
+}
+
 inline void Quaternion::Invalidate()
 {
 	//#ifdef _DEBUG
@@ -2003,6 +2205,116 @@ inline vec_t RadianEuler::operator[](int i) const
 }
 
 
+//-----------------------------------------------------------------------------
+// Degree Euler angle aligned to axis (NOT ROLL/PITCH/YAW)
+//-----------------------------------------------------------------------------
+class DegreeEuler
+{
+public:
+	///\name
Initialization + //@{ + inline DegreeEuler(void) ///< Create with un-initialized components. If VECTOR_PARANOIA is set, will init with NANS. + { + // Initialize to NAN to catch errors +#ifdef VECTOR_PARANOIA + x = y = z = VEC_T_NAN; +#endif + } + inline DegreeEuler(vec_t X, vec_t Y, vec_t Z) { x = X; y = Y; z = Z; } + inline explicit DegreeEuler(Quaternion const& q); + inline explicit DegreeEuler(QAngle const& angles); + inline explicit DegreeEuler(RadianEuler const& angles); + + // Initialization + inline void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f) { x = ix; y = iy; z = iz; } + + inline QAngle ToQAngle() const; + + // conversion to qangle + bool IsValid() const; + void Invalidate(); + + inline vec_t* Base() { return &x; } + inline const vec_t* Base() const { return &x; } + + // array access... + vec_t operator[](int i) const; + vec_t& operator[](int i); + + vec_t x, y, z; +}; + + +//----------------------------------------------------------------------------- +// DegreeEuler equality with tolerance +//----------------------------------------------------------------------------- +inline bool DegreeEulersAreEqual(const DegreeEuler& src1, const DegreeEuler& src2, float tolerance = 0.0f) +{ + if (FloatMakePositive(src1.x - src2.x) > tolerance) + return false; + if (FloatMakePositive(src1.y - src2.y) > tolerance) + return false; + return (FloatMakePositive(src1.z - src2.z) <= tolerance); +} + +/* +extern void AngleQuaternion( DegreeEuler const &angles, Quaternion &qt ); +extern void QuaternionAngles( Quaternion const &q, DegreeEuler &angles ); +extern void QuaternionVectorsFLU( Quaternion const &q, Vector *pForward, Vector *pLeft, Vector *pUp ); +*/ + +inline Quaternion::Quaternion(DegreeEuler const& angles) +{ + RadianEuler radians(angles); + AngleQuaternion(radians, *this); +} + +inline DegreeEuler::DegreeEuler(RadianEuler const& angles) +{ + Init(VEC_RAD2DEG(angles.x), VEC_RAD2DEG(angles.y), VEC_RAD2DEG(angles.z)); +} + +inline 
RadianEuler::RadianEuler(DegreeEuler const& angles) +{ + Init(VEC_DEG2RAD(angles.x), VEC_DEG2RAD(angles.y), VEC_DEG2RAD(angles.z)); +} + +inline DegreeEuler::DegreeEuler(Quaternion const& q) +{ + RadianEuler radians(q); + Init(VEC_RAD2DEG(radians.x), VEC_RAD2DEG(radians.y), VEC_RAD2DEG(radians.z)); +} + +inline bool DegreeEuler::IsValid() const +{ + return IsFinite(x) && IsFinite(y) && IsFinite(z); +} + +inline void DegreeEuler::Invalidate() +{ + //#ifdef VECTOR_PARANOIA + x = y = z = VEC_T_NAN; + //#endif +} + + +//----------------------------------------------------------------------------- +// Array access +//----------------------------------------------------------------------------- +inline vec_t& DegreeEuler::operator[](int i) +{ + Assert((i >= 0) && (i < 3)); + return ((vec_t*)this)[i]; +} + +inline vec_t DegreeEuler::operator[](int i) const +{ + Assert((i >= 0) && (i < 3)); + return ((vec_t*)this)[i]; +} + + + //----------------------------------------------------------------------------- // Degree Euler QAngle pitch, yaw, roll //----------------------------------------------------------------------------- @@ -2061,6 +2373,12 @@ public: // No assignment operators either... QAngle& operator=(const QAngle& src); + void Normalize(); + void NormalizePositive(); + + inline struct matrix3x4_t ToMatrix() const; + inline Quaternion ToQuaternion() const; + #ifndef VECTOR_NO_SLOW_OPERATIONS // copy constructors @@ -2080,6 +2398,9 @@ private: #endif }; +// Zero the object -- necessary for CNetworkVar and possibly other cases. 
+inline void EnsureValidValue(QAngle& x) { x.Init(); } + //----------------------------------------------------------------------------- // Allows us to specifically pass the vector by value when we need to //----------------------------------------------------------------------------- @@ -2141,6 +2462,26 @@ inline void QAngle::Init(vec_t ix, vec_t iy, vec_t iz) CHECK_VALID(*this); } + +extern float AngleNormalize(float angle); +extern float AngleNormalizePositive(float angle); + +inline void QAngle::Normalize() +{ + x = AngleNormalize(x); + y = AngleNormalize(y); + z = AngleNormalize(z); +} + +inline void QAngle::NormalizePositive() +{ + x = AngleNormalizePositive(x); + y = AngleNormalizePositive(y); + z = AngleNormalizePositive(z); +} + + +#if !defined(__SPU__) inline void QAngle::Random(vec_t minVal, vec_t maxVal) { x = RandomFloat(minVal, maxVal); @@ -2148,9 +2489,11 @@ inline void QAngle::Random(vec_t minVal, vec_t maxVal) z = RandomFloat(minVal, maxVal); CHECK_VALID(*this); } +#endif #ifndef VECTOR_NO_SLOW_OPERATIONS +#if !defined(__SPU__) inline QAngle RandomAngle(float minVal, float maxVal) { Vector3D random; @@ -2158,6 +2501,7 @@ inline QAngle RandomAngle(float minVal, float maxVal) QAngle ret(random.x, random.y, random.z); return ret; } +#endif #endif @@ -2169,17 +2513,22 @@ inline RadianEuler::RadianEuler(QAngle const& angles) angles.y * 3.14159265358979323846f / 180.f); } - - +inline DegreeEuler::DegreeEuler(QAngle const& angles) +{ + Init(angles.z, angles.x, angles.y); +} inline QAngle RadianEuler::ToQAngle(void) const { - return QAngle( - y * 180.f / 3.14159265358979323846f, - z * 180.f / 3.14159265358979323846f, - x * 180.f / 3.14159265358979323846f); + return QAngle(VEC_RAD2DEG(y), VEC_RAD2DEG(z), VEC_RAD2DEG(x)); } +inline QAngle DegreeEuler::ToQAngle() const +{ + return QAngle(y, z, x); +} + + //----------------------------------------------------------------------------- // assignment 
//----------------------------------------------------------------------------- @@ -2415,6 +2764,15 @@ inline void AngularImpulseToQAngle(const AngularImpulse& impulse, QAngle& angles angles.z = impulse.x; } +inline QAngle Quaternion::ToQAngle() const +{ + extern void QuaternionAngles(const Quaternion & q, QAngle & angles); + + QAngle anglesOut; + QuaternionAngles(*this, anglesOut); + return anglesOut; +} + #if !defined( _X360 ) && !defined( _PS3 ) FORCEINLINE vec_t InvRSquared(const float* v) @@ -2430,7 +2788,11 @@ FORCEINLINE vec_t InvRSquared(const Vector3D& v) #else // call directly +#if defined(__SPU__) +FORCEINLINE float _VMX_InvRSquared(Vector& v) +#else FORCEINLINE float _VMX_InvRSquared(const Vector& v) +#endif { #if !defined (_PS3) XMVECTOR xmV = XMVector3ReciprocalLength(XMLoadVector3(v.Base())); @@ -2616,6 +2978,16 @@ inline vec_t Vector3D::NormalizeInPlace() return VectorNormalize(*this); } +inline vec_t Vector3D::NormalizeInPlaceSafe(const Vector3D& vFallback) +{ + float flLength = VectorNormalize(*this); + if (flLength == 0.0f) + { + *this = vFallback; + } + return flLength; +} + inline Vector3D Vector3D::Normalized() const { Vector3D norm = *this; @@ -2623,6 +2995,15 @@ inline Vector3D Vector3D::Normalized() const return norm; } + +inline Vector3D Vector3D::NormalizedSafe(const Vector3D& vFallback)const +{ + Vector3D vNorm = *this; + float flLength = VectorNormalize(vNorm); + return (flLength != 0.0f) ? 
vNorm : vFallback; +} + + inline bool Vector3D::IsLengthGreaterThan(float val) const { return LengthSqr() > val * val; @@ -2633,5 +3014,68 @@ inline bool Vector3D::IsLengthLessThan(float val) const return LengthSqr() < val * val; } + +inline const Vector3D ScaleVector(const Vector3D& a, const Vector3D& b) +{ + return Vector3D(a.x * b.x, a.y * b.y, a.z * b.z); +} + + + +inline const Quaternion Exp(const Vector3D& v) +{ + float theta = v.Length(); + if (theta < 0.001f) + { + // limit case, cos(theta) ~= 1 - theta^2/2 + theta^4/24 + // sin(theta)/theta ~= 1 - theta^2/6 + theta^4/120 + float theta2_2 = theta * theta * 0.5f, theta4_24 = theta2_2 * theta2_2 * (1.0f / 6.0f); + float k = 1.0f - theta2_2 * (1.0f / 3.0f) + theta4_24 * 0.05f; + return Quaternion(k * v.x, k * v.y, k * v.z, 1 - theta2_2 + theta4_24); + } + else + { + float k = sinf(theta) / theta; + return Quaternion(k * v.x, k * v.y, k * v.z, cosf(theta)); + } +} + + +inline const Vector3D QuaternionLog(const Quaternion& q) +{ + Vector3D axis = q.ImaginaryPart(); + float sinTheta = axis.Length(), factor; + if (sinTheta > 0.001f) + { + // there's some substantial rotation; if w < 0, it's an over-180-degree rotation (in real space) + float theta = asinf(MIN(sinTheta, 1.0f)); + factor = (q.w < 0.0f ? 
M_PI_F - theta : theta) / sinTheta; + } + else + { + // ArcSin[x]/x = 1 + x^2/6 + x^4 * 3/40 + o( x^5 ) + float sinTheta2 = sinTheta * sinTheta; + float sinTheta4 = sinTheta2 * sinTheta2; + factor = (1 + sinTheta2 * (1.0f / 6.0f) + sinTheta4 * (3.0f / 40.0f)); + if (q.w < 0) + { + factor = -factor; // because the axis of rotation is not defined, we'll just consider this rotation to be close enough to identity + } + } + return axis * factor; +} + + + +inline float Snap(float a, float flSnap) +{ + return floorf(a / flSnap + 0.5f) * flSnap; +} + +inline const Vector3D Snap(const Vector3D& a, float flSnap) +{ + return Vector3D(Snap(a.x, flSnap), Snap(a.y, flSnap), Snap(a.z, flSnap)); +} + #endif diff --git a/r5dev/mathlib/vector2d.h b/r5dev/mathlib/vector2d.h index a66f2077..35d2791b 100644 --- a/r5dev/mathlib/vector2d.h +++ b/r5dev/mathlib/vector2d.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -19,13 +19,27 @@ // For vec_t, put this somewhere else? #include "tier0/basetypes.h" -// For rand(). We really need a library! 
-#include +// For RandomFloat() +#include "vstdlib/random.h" #include "tier0/dbg.h" #include "mathlib/bits.h" #include "mathlib/math_pfns.h" +#ifndef M_PI +#define M_PI 3.14159265358979323846 // matches value in gcc v2 math.h +#endif + +#ifndef M_PI_F +#define M_PI_F ((float)(M_PI)) +#endif + +#ifndef DEG2RAD +#define DEG2RAD( x ) ( (float)(x) * (float)(M_PI_F / 180.f) ) +#endif + +extern void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine); + //========================================================= // 2D Vector2D //========================================================= @@ -37,9 +51,9 @@ public: vec_t x, y; // Construction/destruction - Vector2D(void); + Vector2D(); Vector2D(vec_t X, vec_t Y); - Vector2D(const float* pFloat); + explicit Vector2D(const float* pFloat); // Initialization void Init(vec_t ix = 0.0f, vec_t iy = 0.0f); @@ -196,7 +210,7 @@ void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& // constructors //----------------------------------------------------------------------------- -inline Vector2D::Vector2D(void) +inline Vector2D::Vector2D() { #ifdef _DEBUG // Initialize to NAN to catch errors @@ -238,11 +252,13 @@ inline void Vector2D::Init(vec_t ix, vec_t iy) Assert(IsValid()); } +#if !defined(__SPU__) inline void Vector2D::Random(float minVal, float maxVal) { - x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + x = RandomFloat(minVal, maxVal); + y = RandomFloat(minVal, maxVal); } +#endif inline void Vector2DClear(Vector2D& a) { @@ -439,6 +455,15 @@ inline void Vector2DDivide(const Vector2D& a, const Vector2D& b, Vector2D& c) c.y = a.y / b.y; } +inline void Vector2DRotate(const Vector2D& vIn, float flDegrees, Vector2D& vOut) +{ + float c, s; + SinCos(DEG2RAD(flDegrees), &s, &c); + + vOut.x = vIn.x * c - vIn.y * s; + vOut.y = vIn.x * s + vIn.y * c; +} + inline void Vector2DMA(const Vector2D& start, float 
s, const Vector2D& dir, Vector2D& result) { Assert(start.IsValid() && IsFinite(s) && dir.IsValid()); diff --git a/r5dev/mathlib/vector4d.h b/r5dev/mathlib/vector4d.h index 21585121..cf4e1387 100644 --- a/r5dev/mathlib/vector4d.h +++ b/r5dev/mathlib/vector4d.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -14,19 +14,19 @@ #endif #include -#include // For rand(). We really need a library! #include -#if !defined( _X360 ) -#include // For SSE +#if !defined( PLATFORM_PPC ) && !defined( _PS3 ) +#include // for sse #endif #include "tier0/basetypes.h" // For vec_t, put this somewhere else? #include "tier0/dbg.h" #include "mathlib/bits.h" #include "mathlib/math_pfns.h" - +#include "mathlib/vector.h" +#include "vstdlib/random.h" // forward declarations -class Vector3D; class Vector2D; +class Vector3D; //========================================================= // 4D Vector4D @@ -39,12 +39,13 @@ public: vec_t x, y, z, w; // Construction/destruction - Vector4D(void); + Vector4D(); Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W); - Vector4D(const float* pFloat); + explicit Vector4D(const float* pFloat); // Initialization void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f, vec_t iw = 0.0f); + void Init(const Vector3D& src, vec_t iw = 0.0f); // Got any nasty NAN's? 
bool IsValid() const; @@ -79,6 +80,13 @@ public: Vector4D& operator/=(const Vector4D& v); Vector4D& operator/=(float s); + Vector4D operator-(void) const; + Vector4D operator*(float fl) const; + Vector4D operator/(float fl) const; + Vector4D operator*(const Vector4D& v) const; + Vector4D operator+(const Vector4D& v) const; + Vector4D operator-(const Vector4D& v) const; + // negate the Vector4D components void Negate(); @@ -202,7 +210,7 @@ void Vector4DLerp(Vector4D const& src1, Vector4D const& src2, vec_t t, Vector4D& // constructors //----------------------------------------------------------------------------- -inline Vector4D::Vector4D(void) +inline Vector4D::Vector4D() { #ifdef _DEBUG // Initialize to NAN to catch errors @@ -237,20 +245,27 @@ inline Vector4D::Vector4D(const Vector4D& vOther) //----------------------------------------------------------------------------- // initialization //----------------------------------------------------------------------------- - inline void Vector4D::Init(vec_t ix, vec_t iy, vec_t iz, vec_t iw) { x = ix; y = iy; z = iz; w = iw; Assert(IsValid()); } +inline void Vector4D::Init(const Vector3D& src, vec_t iw) +{ + x = src.x; y = src.y; z = src.z; w = iw; + Assert(IsValid()); +} + +#if !defined(__SPU__) inline void Vector4D::Random(vec_t minVal, vec_t maxVal) { - x = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - y = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - z = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - w = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + x = RandomFloat(minVal, maxVal); + y = RandomFloat(minVal, maxVal); + z = RandomFloat(minVal, maxVal); + w = RandomFloat(minVal, maxVal); } +#endif inline void Vector4DClear(Vector4D& a) { @@ -412,6 +427,52 @@ inline Vector4D& Vector4D::operator*=(Vector4D const& v) return *this; } +inline Vector4D Vector4D::operator-(void) const +{ + return Vector4D(-x, -y, -z, -w); +} + +inline Vector4D 
Vector4D::operator+(const Vector4D& v) const +{ + Vector4D res; + Vector4DAdd(*this, v, res); + return res; +} + +inline Vector4D Vector4D::operator-(const Vector4D& v) const +{ + Vector4D res; + Vector4DSubtract(*this, v, res); + return res; +} + + +inline Vector4D Vector4D::operator*(float fl) const +{ + Vector4D res; + Vector4DMultiply(*this, fl, res); + return res; +} + +inline Vector4D Vector4D::operator*(const Vector4D& v) const +{ + Vector4D res; + Vector4DMultiply(*this, v, res); + return res; +} + +inline Vector4D Vector4D::operator/(float fl) const +{ + Vector4D res; + Vector4DDivide(*this, fl, res); + return res; +} + +inline Vector4D operator*(float fl, const Vector4D& v) +{ + return v * fl; +} + inline Vector4D& Vector4D::operator/=(float fl) { Assert(fl != 0.0f); @@ -615,8 +676,10 @@ inline void Vector4DAligned::Set(vec_t X, vec_t Y, vec_t Z, vec_t W) inline void Vector4DAligned::InitZero(void) { -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) this->AsM128() = _mm_set1_ps(0.0f); +#elif defined(_PS3) + this->AsM128() = VMX_ZERO; #else this->AsM128() = __vspltisw(0); #endif @@ -626,11 +689,13 @@ inline void Vector4DAligned::InitZero(void) inline void Vector4DMultiplyAligned(Vector4DAligned const& a, Vector4DAligned const& b, Vector4DAligned& c) { Assert(a.IsValid() && b.IsValid()); -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) c.x = a.x * b.x; c.y = a.y * b.y; c.z = a.z * b.z; c.w = a.w * b.w; +#elif defined(_PS3) + c.AsM128() = __vec_mul(a.AsM128(), b.AsM128()); #else c.AsM128() = __vmulfp(a.AsM128(), b.AsM128()); #endif @@ -640,7 +705,7 @@ inline void Vector4DWeightMAD(vec_t w, Vector4DAligned const& vInA, Vector4DAlig { Assert(vInA.IsValid() && vInB.IsValid() && IsFinite(w)); -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) vOutA.x += vInA.x * w; vOutA.y += vInA.y * w; vOutA.z += vInA.z * w; @@ -650,6 +715,16 @@ inline void Vector4DWeightMAD(vec_t w, Vector4DAligned const& vInA, Vector4DAlig vOutB.y += vInB.y * w; vOutB.z += vInB.z * 
w; vOutB.w += vInB.w * w; +#elif defined(_PS3) +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + // GCC 4.1.1 + __m128 temp = vec_splats(w); +#else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + __m128 temp = __m128(w); +#endif //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + + vOutA.AsM128() = vec_madd(vInA.AsM128(), temp, vOutA.AsM128()); + vOutB.AsM128() = vec_madd(vInB.AsM128(), temp, vOutB.AsM128()); #else __vector4 temp; @@ -665,13 +740,23 @@ inline void Vector4DWeightMADSSE(vec_t w, Vector4DAligned const& vInA, Vector4DA { Assert(vInA.IsValid() && vInB.IsValid() && IsFinite(w)); -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) // Replicate scalar float out to 4 components __m128 packed = _mm_set1_ps(w); // 4D SSE Vector MAD vOutA.AsM128() = _mm_add_ps(vOutA.AsM128(), _mm_mul_ps(vInA.AsM128(), packed)); vOutB.AsM128() = _mm_add_ps(vOutB.AsM128(), _mm_mul_ps(vInB.AsM128(), packed)); +#elif defined(_PS3) +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + // GCC 4.1.1 + __m128 temp = vec_splats(w); +#else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + __m128 temp = __m128(w); +#endif //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + + vOutA.AsM128() = vec_madd(vInA.AsM128(), temp, vOutA.AsM128()); + vOutB.AsM128() = vec_madd(vInB.AsM128(), temp, vOutB.AsM128()); #else __vector4 temp; diff --git a/r5dev/mathlib/vmatrix.cpp b/r5dev/mathlib/vmatrix.cpp index 7e7183f6..fdf2bccb 100644 --- a/r5dev/mathlib/vmatrix.cpp +++ b/r5dev/mathlib/vmatrix.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. 
============// // // Purpose: // @@ -6,18 +6,19 @@ // //=============================================================================// #include "core/stdafx.h" +#include "tier0/dbg.h" #if !defined(_STATIC_LINKED) || defined(_SHARED_LIB) -#include "tier0/dbg.h" -#include "tier0/basetypes.h" + #include "mathlib/vmatrix.h" #include "mathlib/mathlib.h" #include "mathlib/vector4d.h" +#include "mathlib/ssemath.h" // memdbgon must be the last include file in a .cpp file!!! //#include "tier0/memdbgon.h" -//#pragma warning (disable : 4700) // local variable 'x' used without having been initialized +#pragma warning (disable : 4700) // local variable 'x' used without having been initialized // ------------------------------------------------------------------------------------------- // // Helper functions. @@ -120,7 +121,7 @@ VMatrix SetupMatrixProjection(const Vector3D& vOrigin, const VPlane& thePlane) VMatrix SetupMatrixAxisRot(const Vector3D& vAxis, vec_t fDegrees) { - vec_t s, c, t; + vec_t s, c, t; // sin, cos, 1-cos vec_t tx, ty, tz; vec_t sx, sy, sz; vec_t fRadians; @@ -142,6 +143,43 @@ VMatrix SetupMatrixAxisRot(const Vector3D& vAxis, vec_t fDegrees) 0.0f, 0.0f, 0.0f, 1.0f); } + +// Basically takes a cross product and then does the same thing as SetupMatrixAxisRot +// above, but takes advantage of the fact that the sin angle is precomputed. +VMatrix SetupMatrixAxisToAxisRot(const Vector3D& vFromAxis, const Vector3D& vToAxis) +{ + Assert(vFromAxis.LengthSqr() == 1); // these axes + Assert(vToAxis.LengthSqr() == 1); // must be normal. 
+ + vec_t s, c, t; // sin(theta), cos(theta), 1-cos + vec_t tx, ty, tz; + vec_t sx, sy, sz; + + Vector3D vAxis = vFromAxis.Cross(vToAxis); + + s = vAxis.Length(); + c = vFromAxis.Dot(vToAxis); + t = 1.0f - c; + + if (s > 0) + { + vAxis *= 1.0 / s; + + tx = t * vAxis.x; ty = t * vAxis.y; tz = t * vAxis.z; + sx = s * vAxis.x; sy = s * vAxis.y; sz = s * vAxis.z; + + return VMatrix( + tx * vAxis.x + c, tx * vAxis.y - sz, tx * vAxis.z + sy, 0.0f, + tx * vAxis.y + sz, ty * vAxis.y + c, ty * vAxis.z - sx, 0.0f, + tx * vAxis.z - sy, ty * vAxis.z + sx, tz * vAxis.z + c, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f); + } + else + { + return SetupMatrixIdentity(); + } +} + VMatrix SetupMatrixAngles(const QAngle& vAngles) { VMatrix mRet; @@ -158,8 +196,19 @@ VMatrix SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles) #endif // VECTOR_NO_SLOW_OPERATIONS - +#if 1 bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, Vector3D& vOut) +{ + Vector3D v2Cross3 = CrossProduct(vp2.m_Normal, vp3.m_Normal); + float flDenom = DotProduct(vp1.m_Normal, v2Cross3); + if (fabs(flDenom) < FLT_EPSILON) + return false; + Vector3D vRet = vp1.m_Dist * v2Cross3 + vp2.m_Dist * CrossProduct(vp3.m_Normal, vp1.m_Normal) + vp3.m_Dist * CrossProduct(vp1.m_Normal, vp2.m_Normal); + vOut = vRet * (1.0 / flDenom); + return true; +} +#else // old slow innaccurate code +bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, Vector& vOut) { VMatrix mMat, mInverse; @@ -169,7 +218,6 @@ bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, vp3.m_Normal.x, vp3.m_Normal.y, vp3.m_Normal.z, -vp3.m_Dist, 0.0f, 0.0f, 0.0f, 1.0f ); - if (mMat.InverseGeneral(mInverse)) { //vOut = mInverse * Vector(0.0f, 0.0f, 0.0f); @@ -181,7 +229,7 @@ bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, return false; } } - +#endif // ------------------------------------------------------------------------------------------- // @@ 
-303,7 +351,7 @@ bool MatrixInverseGeneral(const VMatrix& src, VMatrix& dst) for (iRow = 0; iRow < 4; iRow++) { // Find the row with the largest element in this column. - fLargest = 0.00001f; + fLargest = 1e-6f; iLargest = -1; for (iTest = iRow; iTest < 4; iTest++) { @@ -506,7 +554,7 @@ bool VMatrix::IsRotationMatrix() const FloatMakePositive(v2.Dot(v3)) < 0.01f; } -static void SetupMatrixAnglesInternal(vec_t m[4][4], const QAngle& vAngles) +void VMatrix::SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles) { float sr, sp, sy, cr, cp, cy; @@ -527,11 +575,6 @@ static void SetupMatrixAnglesInternal(vec_t m[4][4], const QAngle& vAngles) m[0][3] = 0.f; m[1][3] = 0.f; m[2][3] = 0.f; -} - -void VMatrix::SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles) -{ - SetupMatrixAnglesInternal(m, vAngles); // Add translation m[0][3] = origin.x; @@ -544,21 +587,6 @@ void VMatrix::SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles } -void VMatrix::SetupMatrixAngles(const QAngle& vAngles) -{ - SetupMatrixAnglesInternal(m, vAngles); - - // Zero everything else - m[0][3] = 0.0f; - m[1][3] = 0.0f; - m[2][3] = 0.0f; - m[3][0] = 0.0f; - m[3][1] = 0.0f; - m[3][2] = 0.0f; - m[3][3] = 1.0f; -} - - //----------------------------------------------------------------------------- // Sets matrix to identity //----------------------------------------------------------------------------- @@ -745,7 +773,7 @@ void Vector4DMultiplyPosition(const VMatrix& src1, Vector3D const& src2, Vector4 { // Make sure it works if src2 == dst Vector3D tmp; - Vector3D const& v = (&src2 == &dst.AsVector3D()) ? static_cast(tmp) : src2; + Vector3D const& v = (&src2 == &dst.AsVector3D()) ? static_cast(tmp) : src2; if (&src2 == &dst.AsVector3D()) { @@ -768,7 +796,7 @@ void Vector3DMultiply(const VMatrix& src1, const Vector3D& src2, Vector3D& dst) { // Make sure it works if src2 == dst Vector3D tmp; - const Vector3D& v = (&src2 == &dst) ? 
static_cast(tmp) : src2; + const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; if (&src2 == &dst) { @@ -789,7 +817,7 @@ void Vector3DMultiplyPositionProjective(const VMatrix& src1, const Vector3D& src { // Make sure it works if src2 == dst Vector3D tmp; - const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; + const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; if (&src2 == &dst) { VectorCopy(src2, tmp); @@ -816,7 +844,7 @@ void Vector3DMultiplyProjective(const VMatrix& src1, const Vector3D& src2, Vecto { // Make sure it works if src2 == dst Vector3D tmp; - const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; + const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; if (&src2 == &dst) { VectorCopy(src2, tmp); @@ -869,7 +897,7 @@ void Vector3DMultiplyTranspose(const VMatrix& src1, const Vector3D& src2, Vector bool srcEqualsDst = (&src2 == &dst); Vector3D tmp; - const Vector3D& v = srcEqualsDst ? static_cast(tmp) : src2; + const Vector3D& v = srcEqualsDst ? 
static_cast(tmp) : src2; if (srcEqualsDst) { @@ -954,7 +982,7 @@ void MatrixBuildTranslation(VMatrix& dst, const Vector3D& translation) //----------------------------------------------------------------------------- void MatrixBuildRotationAboutAxis(VMatrix& dst, const Vector3D& vAxisOfRot, float angleDegrees) { - MatrixBuildRotationAboutAxis(vAxisOfRot, angleDegrees, const_cast (dst.As3x4())); + MatrixBuildRotationAboutAxis(vAxisOfRot, angleDegrees, dst.As3x4()); dst[3][0] = 0; dst[3][1] = 0; dst[3][2] = 0; @@ -1006,6 +1034,13 @@ void MatrixBuildRotation(VMatrix& dst, const Vector3D& initialDirection, const V } MatrixBuildRotationAboutAxis(dst, axis, angle); + +#ifdef _DEBUG + Vector3D test; + Vector3DMultiply(dst, initialDirection, test); + test -= finalDirection; + Assert(test.LengthSqr() < 1e-3); +#endif } //----------------------------------------------------------------------------- @@ -1163,8 +1198,7 @@ void CalculateSphereFromProjectionMatrix(const VMatrix& worldToVolume, Vector3D* } -static inline void FrustumPlanesFromMatrixHelper(const VMatrix& shadowToWorld, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3, - Vector3D& normal, float& dist) +static inline void FrustumPlanesFromMatrixHelper(const VMatrix& shadowToWorld, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3, VPlane& plane) { Vector3D world1, world2, world3; Vector3DMultiplyPositionProjective(shadowToWorld, p1, world1); @@ -1175,41 +1209,37 @@ static inline void FrustumPlanesFromMatrixHelper(const VMatrix& shadowToWorld, c VectorSubtract(world2, world1, v1); VectorSubtract(world3, world1, v2); - CrossProduct(v1, v2, normal); - VectorNormalize(normal); - dist = DotProduct(normal, world1); + CrossProduct(v1, v2, plane.m_Normal); + VectorNormalize(plane.m_Normal); + plane.m_Dist = DotProduct(plane.m_Normal, world1); } void FrustumPlanesFromMatrix(const VMatrix& clipToWorld, Frustum_t& frustum) { - Vector3D normal; - float dist; + VPlane planes[6]; 
FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(0.0f, 0.0f, 0.0f), Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 0.0f), normal, dist); - frustum.SetPlane(FRUSTUM_NEARZ, PLANE_ANYZ, normal, dist); + Vector3D(0.0f, 0.0f, 0.0f), Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 0.0f), planes[FRUSTUM_NEARZ]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(0.0f, 0.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_FARZ, PLANE_ANYZ, normal, dist); + Vector3D(0.0f, 0.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), planes[FRUSTUM_FARZ]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(1.0f, 0.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(1.0f, 1.0f, 0.0f), normal, dist); - frustum.SetPlane(FRUSTUM_RIGHT, PLANE_ANYZ, normal, dist); + Vector3D(1.0f, 0.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(1.0f, 1.0f, 0.0f), planes[FRUSTUM_RIGHT]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(0.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(0.0f, 0.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_LEFT, PLANE_ANYZ, normal, dist); + Vector3D(0.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(0.0f, 0.0f, 1.0f), planes[FRUSTUM_LEFT]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(1.0f, 1.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_TOP, PLANE_ANYZ, normal, dist); + Vector3D(1.0f, 1.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), planes[FRUSTUM_TOP]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 0.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_BOTTOM, PLANE_ANYZ, normal, dist); + Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 0.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), planes[FRUSTUM_BOTTOM]); + + frustum.SetPlanes(planes); } +// BEWARE: top/bottom are FLIPPED relative to D3DXMatrixOrthoOffCenterRH(). 
void MatrixBuildOrtho(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar) { // FIXME: This is being used incorrectly! Should read: @@ -1243,29 +1273,19 @@ void MatrixBuildOrtho(VMatrix& dst, double left, double top, double right, doubl 0.0f, 0.0f, 0.0f, 1.0f); } -void MatrixBuildPerspectiveZRange(VMatrix& dst, double flZNear, double flZFar) -{ - dst.m[2][0] = 0.0f; - dst.m[2][1] = 0.0f; - dst.m[2][2] = flZFar / (flZNear - flZFar); - dst.m[2][3] = flZNear * flZFar / (flZNear - flZFar); -} - void MatrixBuildPerspectiveX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar) { - float flWidthScale = 1.0f / tanf(flFovX * M_PI / 360.0f); - float flHeightScale = flAspect * flWidthScale; - dst.Init(flWidthScale, 0.0f, 0.0f, 0.0f, - 0.0f, flHeightScale, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, + float flWidth = 2.0f * flZNear * tanf(flFovX * M_PI / 360.0f); + float flHeight = flWidth / flAspect; + dst.Init(2.0f * flZNear / flWidth, 0.0f, 0.0f, 0.0f, + 0.0f, 2.0f * flZNear / flHeight, 0.0f, 0.0f, + 0.0f, 0.0f, flZFar / (flZNear - flZFar), flZNear * flZFar / (flZNear - flZFar), 0.0f, 0.0f, -1.0f, 0.0f); - - MatrixBuildPerspectiveZRange(dst, flZNear, flZFar); } void MatrixBuildPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right) { - float flWidth = tanf(flFovX * M_PI / 360.0f); + float flWidth = 2.0f * flZNear * tanf(flFovX * M_PI / 360.0f); float flHeight = flWidth / flAspect; // bottom, top, left, right are 0..1 so convert to -/2../2 @@ -1274,12 +1294,58 @@ void MatrixBuildPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAspe float flBottom = -(flHeight / 2.0f) * (1.0f - bottom) + bottom * (flHeight / 2.0f); float flTop = -(flHeight / 2.0f) * (1.0f - top) + top * (flHeight / 2.0f); - dst.Init(1.0f / (flRight - flLeft), 0.0f, (flLeft + flRight) / (flRight - flLeft), 0.0f, - 0.0f, 1.0f / (flTop - 
flBottom), (flTop + flBottom) / (flTop - flBottom), 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, + dst.Init((2.0f * flZNear) / (flRight - flLeft), 0.0f, (flLeft + flRight) / (flRight - flLeft), 0.0f, + 0.0f, 2.0f * flZNear / (flTop - flBottom), (flTop + flBottom) / (flTop - flBottom), 0.0f, + 0.0f, 0.0f, flZFar / (flZNear - flZFar), flZNear * flZFar / (flZNear - flZFar), 0.0f, 0.0f, -1.0f, 0.0f); - - MatrixBuildPerspectiveZRange(dst, flZNear, flZFar); } -#endif // !_STATIC_LINKED || _SHARED_LIB +void ExtractClipPlanesFromNonTransposedMatrix(const VMatrix& viewProjMatrix, VPlane* pPlanesOut, bool bD3DClippingRange) +{ + // Left + Vector4D vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 0) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_LEFT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Right + vPlane = -MatrixGetRowAsVector4D(viewProjMatrix, 0) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_RIGHT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Bottom + vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 1) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_BOTTOM].Init(vPlane.AsVector3D(), -vPlane.w); + + // Top + vPlane = -MatrixGetRowAsVector4D(viewProjMatrix, 1) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_TOP].Init(vPlane.AsVector3D(), -vPlane.w); + + // Near + if (bD3DClippingRange) + { + // [0,1] Z clipping range (D3D-style) + vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 2); + } + else + { + // [-1,1] Z clipping range (OpenGL-style) + vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 2) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + } + + pPlanesOut[FRUSTUM_NEARZ].Init(vPlane.AsVector3D(), -vPlane.w); + + // Far + vPlane = -MatrixGetRowAsVector4D(viewProjMatrix, 2) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_FARZ].Init(vPlane.AsVector3D(), -vPlane.w); + + for (uint i = 0; i < FRUSTUM_NUMPLANES; ++i) + { + float flLen2 = pPlanesOut[i].m_Normal.x * pPlanesOut[i].m_Normal.x + 
pPlanesOut[i].m_Normal.y * pPlanesOut[i].m_Normal.y + pPlanesOut[i].m_Normal.z * pPlanesOut[i].m_Normal.z; + if (flLen2 != 0.0f) + { + float flScale = 1.0f / sqrt(flLen2); + pPlanesOut[i].m_Normal *= flScale; + pPlanesOut[i].m_Dist *= flScale; + } + } +} + +#endif // !_STATIC_LINKED || _SHARED_LIB diff --git a/r5dev/mathlib/vmatrix.h b/r5dev/mathlib/vmatrix.h index a1520499..02771bef 100644 --- a/r5dev/mathlib/vmatrix.h +++ b/r5dev/mathlib/vmatrix.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -54,10 +54,9 @@ public: // Creates a matrix where the X axis = forward // the Y axis = left, and the Z axis = up VMatrix(const Vector3D& forward, const Vector3D& left, const Vector3D& up); - VMatrix(const Vector3D& forward, const Vector3D& left, const Vector3D& up, const Vector3D& translation); // Construct from a 3x4 matrix - VMatrix(const matrix3x4_t& matrix3x4); + explicit VMatrix(const matrix3x4_t& matrix3x4); // Set the values in the matrix. void Init( @@ -107,6 +106,7 @@ public: void PreTranslate(const Vector3D& vTrans); void PostTranslate(const Vector3D& vTrans); + matrix3x4_t& As3x4(); const matrix3x4_t& As3x4() const; void CopyFrom3x4(const matrix3x4_t& m3x4); void Set3x4(matrix3x4_t& matrix3x4) const; @@ -199,9 +199,6 @@ public: // Setup a matrix for origin and angles. void SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles); - // Setup a matrix for angles and no translation. - void SetupMatrixAngles(const QAngle& vAngles); - // General inverse. This may fail so check the return! bool InverseGeneral(VMatrix& vInverse) const; @@ -217,7 +214,7 @@ public: VMatrix InverseTR() const; // Get the scale of the matrix's basis vectors. - Vector3D GetScale() const; + Vector3D GetScale() const; // (Fast) multiply by a scaling matrix setup from vScale. 
VMatrix Scale(const Vector3D& vScale); @@ -263,6 +260,9 @@ VMatrix SetupMatrixProjection(const Vector3D& vOrigin, const VPlane& thePlane); // Setup a matrix to rotate the specified amount around the specified axis. VMatrix SetupMatrixAxisRot(const Vector3D& vAxis, vec_t fDegrees); +// Setup a matrix to rotate one axis onto another. Input vectors must be normalized. +VMatrix SetupMatrixAxisToAxisRot(const Vector3D& vFromAxis, const Vector3D& vToAxis); + // Setup a matrix from euler angles. Just sets identity and calls MatrixAngles. VMatrix SetupMatrixAngles(const QAngle& vAngles); @@ -460,16 +460,6 @@ inline VMatrix::VMatrix(const Vector3D& xAxis, const Vector3D& yAxis, const Vect ); } -inline VMatrix::VMatrix(const Vector3D& xAxis, const Vector3D& yAxis, const Vector3D& zAxis, const Vector3D& translation) -{ - Init( - xAxis.x, yAxis.x, zAxis.x, translation.x, - xAxis.y, yAxis.y, zAxis.y, translation.y, - xAxis.z, yAxis.z, zAxis.z, translation.z, - 0.0f, 0.0f, 0.0f, 1.0f - ); -} - inline void VMatrix::Init( vec_t m00, vec_t m01, vec_t m02, vec_t m03, @@ -629,6 +619,11 @@ inline const matrix3x4_t& VMatrix::As3x4() const return *((const matrix3x4_t*)this); } +inline matrix3x4_t& VMatrix::As3x4() +{ + return *((matrix3x4_t*)this); +} + inline void VMatrix::CopyFrom3x4(const matrix3x4_t& m3x4) { memcpy(m, m3x4.Base(), sizeof(matrix3x4_t)); @@ -691,7 +686,7 @@ inline VMatrix VMatrix::operator-() const VMatrix ret; for (int i = 0; i < 16; i++) { - ((float*)ret.m)[i] = ((float*)m)[i]; + ((float*)ret.m)[i] = -((float*)m)[i]; } return ret; } @@ -908,9 +903,9 @@ inline bool MatricesAreEqual(const VMatrix& src1, const VMatrix& src2, float flT // //----------------------------------------------------------------------------- void MatrixBuildOrtho(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar); +void MatrixBuildOrthoLH(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar); void 
MatrixBuildPerspectiveX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar); void MatrixBuildPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right); -void MatrixBuildPerspectiveZRange(VMatrix& dst, double flZNear, double flZFar); inline void MatrixOrtho(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar) { @@ -922,6 +917,16 @@ inline void MatrixOrtho(VMatrix& dst, double left, double top, double right, dou dst = temp; } +inline void MatrixBuildOrthoLH(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar) +{ + // Same as XMMatrixOrthographicOffCenterLH(). + dst.Init( + 2.0f / (right - left), 0.0f, 0.0f, (left + right) / (left - right), + 0.0f, 2.0f / (bottom - top), 0.0f, (bottom + top) / (top - bottom), + 0.0f, 0.0f, 1.0f / (zFar - zNear), zNear / (zNear - zFar), + 0.0f, 0.0f, 0.0f, 1.0f); +} + inline void MatrixPerspectiveX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar) { VMatrix mat; @@ -942,6 +947,61 @@ inline void MatrixPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAs dst = temp; } +inline Vector4D GetMatrixColumnAsVector4D(const VMatrix& mMatrix, int nCol) +{ + Vector4D vColumnOut; + vColumnOut.x = mMatrix.m[0][nCol]; + vColumnOut.y = mMatrix.m[1][nCol]; + vColumnOut.z = mMatrix.m[2][nCol]; + vColumnOut.w = mMatrix.m[3][nCol]; + return vColumnOut; +} + +inline Vector4D MatrixGetRowAsVector4D(const VMatrix& src, int nRow) +{ + Assert((nRow >= 0) && (nRow <= 3)); + return Vector4D(src[nRow]); +} + +//----------------------------------------------------------------------------- +// Extracts clip planes from an arbitrary view projection matrix. +// This function assumes the matrix has been transposed. 
+//----------------------------------------------------------------------------- +inline void ExtractClipPlanesFromTransposedMatrix(const VMatrix& transposedViewProjMatrix, VPlane* pPlanesOut) +{ + // Left + Vector4D vPlane = GetMatrixColumnAsVector4D(transposedViewProjMatrix, 0) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_LEFT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Right + vPlane = -GetMatrixColumnAsVector4D(transposedViewProjMatrix, 0) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_RIGHT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Bottom + vPlane = GetMatrixColumnAsVector4D(transposedViewProjMatrix, 1) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_BOTTOM].Init(vPlane.AsVector3D(), -vPlane.w); + + // Top + vPlane = -GetMatrixColumnAsVector4D(transposedViewProjMatrix, 1) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_TOP].Init(vPlane.AsVector3D(), -vPlane.w); + + // Near + vPlane = GetMatrixColumnAsVector4D(transposedViewProjMatrix, 2) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_NEARZ].Init(vPlane.AsVector3D(), -vPlane.w); + + // Far + vPlane = -GetMatrixColumnAsVector4D(transposedViewProjMatrix, 2) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_FARZ].Init(vPlane.AsVector3D(), -vPlane.w); +} + +//----------------------------------------------------------------------------- +// Extracts clip planes from an arbitrary view projection matrix. +// Differences from ExtractClipPlanesFromTransposedMatrix(): +// This function assumes the matrix has NOT been transposed. +// If bD3DClippingRange is true, the projection space clipping range is assumed +// to be [0,1], vs. the OpenGL range [-1,1]. +// This function always returns normalized planes. 
+//----------------------------------------------------------------------------- +void ExtractClipPlanesFromNonTransposedMatrix(const VMatrix& viewProjMatrix, VPlane* pPlanesOut, bool bD3DClippingRange = true); + #endif - - diff --git a/r5dev/mathlib/vplane.h b/r5dev/mathlib/vplane.h index a9dfe040..48f52a0f 100644 --- a/r5dev/mathlib/vplane.h +++ b/r5dev/mathlib/vplane.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -25,7 +25,6 @@ typedef int SideType; #define VP_EPSILON 0.01f - class VPlane { public: @@ -63,7 +62,7 @@ public: #endif public: - Vector3D m_Normal; + Vector3D m_Normal; vec_t m_Dist; #ifdef VECTOR_NO_SLOW_OPERATIONS @@ -176,7 +175,4 @@ inline SideType VPlane::BoxOnPlaneSide(const Vector3D& vMin, const Vector3D& vMa return firstSide; } - - - #endif // VPLANE_H diff --git a/r5dev/tier0/basetypes.h b/r5dev/tier0/basetypes.h index 0a0b1d38..491b5fe6 100644 --- a/r5dev/tier0/basetypes.h +++ b/r5dev/tier0/basetypes.h @@ -154,6 +154,78 @@ #define MAX( a, b ) ( ( ( a ) > ( b ) ) ? ( a ) : ( b ) ) #endif +#ifdef __cplusplus + +template< class T, class Y, class X > +inline T clamp(T const& val, Y const& minVal, X const& maxVal) +{ + if (val < minVal) + return minVal; + else if (val > maxVal) + return maxVal; + else + return val; +} + +// This is the preferred clamp operator. Using the clamp macro can lead to +// unexpected side-effects or more expensive code. Even the clamp (all +// lower-case) function can generate more expensive code because of the +// mixed types involved. +template< class T > +T Clamp(T const& val, T const& minVal, T const& maxVal) +{ + if (val < minVal) + return minVal; + else if (val > maxVal) + return maxVal; + else + return val; +} + +// This is the preferred Min operator. Using the MIN macro can lead to unexpected +// side-effects or more expensive code. 
+template< class T > +T Min(T const& val1, T const& val2) +{ + return val1 < val2 ? val1 : val2; +} + +// This is the preferred Max operator. Using the MAX macro can lead to unexpected +// side-effects or more expensive code. +template< class T > +T Max(T const& val1, T const& val2) +{ + return val1 > val2 ? val1 : val2; +} + +template +void Swap(T& a, T& b) +{ + T temp = a; + a = b; + b = temp; +} + +#else + +#define clamp(val, min, max) (((val) > (max)) ? (max) : (((val) < (min)) ? (min) : (val))) + +#endif + +#define fsel(c,x,y) ( (c) >= 0 ? (x) : (y) ) + +// integer conditional move +// if a >= 0, return x, else y +#define isel(a,x,y) ( ((a) >= 0) ? (x) : (y) ) + +// if x = y, return a, else b +#define ieqsel(x,y,a,b) (( (x) == (y) ) ? (a) : (b)) + +// if the nth bit of a is set (counting with 0 = LSB), +// return x, else y +// this is fast if nbit is a compile-time immediate +#define ibitsel(a, nbit, x, y) ( ( ((a) & (1 << (nbit))) != 0 ) ? (x) : (y) ) + // MSVC CRT uses 0x7fff while gcc uses MAX_INT, leading to mismatches between platforms // As a result, we pick the least common denominator here. 
This should be used anywhere // you might typically want to use RAND_MAX diff --git a/r5dev/tier0/dbg.cpp b/r5dev/tier0/dbg.cpp index 115dc451..a986d734 100644 --- a/r5dev/tier0/dbg.cpp +++ b/r5dev/tier0/dbg.cpp @@ -8,9 +8,10 @@ #include "core/stdafx.h" #include "core/logdef.h" +#include "tier0/dbg.h" #include "tier0/platform.h" #include "tier0/threadtools.h" -#include "tier0/dbg.h" +#include #ifndef DEDICATED #include "vgui/vgui_debugpanel.h" #include "gameui/IConsole.h" @@ -28,7 +29,12 @@ std::mutex s_LogMutex; //----------------------------------------------------------------------------- bool HushAsserts() { +#ifdef DBGFLAG_ASSERT + static bool s_bHushAsserts = !!CommandLine()->FindParm("-hushasserts"); + return s_bHushAsserts; +#else return true; +#endif } //----------------------------------------------------------------------------- diff --git a/r5dev/tier0/dbg.h b/r5dev/tier0/dbg.h index 11ede439..d1d202fb 100644 --- a/r5dev/tier0/dbg.h +++ b/r5dev/tier0/dbg.h @@ -8,6 +8,7 @@ #ifndef DBG_H #define DBG_H #define Assert assert +#define AssertDbg assert #include "tier0/dbgflag.h" bool HushAsserts(); diff --git a/r5dev/tier0/platform.h b/r5dev/tier0/platform.h index e8449be3..0b6d9730 100644 --- a/r5dev/tier0/platform.h +++ b/r5dev/tier0/platform.h @@ -141,6 +141,12 @@ #define IS_WINDOWS_PC 1 #endif +#if _MSC_VER >= 1800 +#define VECTORCALL __vectorcall +#else +#define VECTORCALL +#endif + #endif // CROSS_PLATFORM_VERSION < 2 #if defined( GNUC ) && !defined( COMPILER_PS3 ) // use pre-align on PS3 @@ -282,6 +288,8 @@ inline int64 CastPtrToInt64(const void* p) #endif +#define NO_MALLOC_OVERRIDE + //----------------------------------------------------------------------------- // Various compiler-specific keywords //----------------------------------------------------------------------------- diff --git a/r5dev/tier0/threadtools.cpp b/r5dev/tier0/threadtools.cpp new file mode 100644 index 00000000..3104304f --- /dev/null +++ b/r5dev/tier0/threadtools.cpp @@ -0,0 
+1,31 @@ +//===== Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ======// +// +// Purpose: Thread tools +// +// $Workfile: $ +// $NoKeywords: $ +//===========================================================================// + +#include "core/stdafx.h" +#include "threadtools.h" + +LONG ThreadInterlockedCompareExchange64(LONG volatile* pDest, int64 value, int64 comperand) +{ + return _InterlockedCompareExchange(pDest, value, comperand); +} + +bool ThreadInterlockedAssignIf(LONG volatile* p, int32 value, int32 comperand) +{ + Assert((size_t)p % 4 == 0); + return (_InterlockedCompareExchange(p, value, comperand) == comperand); +} + +int64 ThreadInterlockedCompareExchange64(int64 volatile* pDest, int64 value, int64 comperand) +{ + return _InterlockedCompareExchange64(pDest, value, comperand); +} + +bool ThreadInterlockedAssignIf64(int64 volatile* pDest, int64 value, int64 comperand) +{ + return (_InterlockedCompareExchange64(pDest, value, comperand) == comperand); +} \ No newline at end of file diff --git a/r5dev/tier0/threadtools.h b/r5dev/tier0/threadtools.h index 956f503d..8189a19c 100644 --- a/r5dev/tier0/threadtools.h +++ b/r5dev/tier0/threadtools.h @@ -1,11 +1,6 @@ #ifndef THREADTOOLS_H #define THREADTOOLS_H -inline bool ThreadInterlockedAssignIf(LONG volatile* p, int32 value, int32 comperand) -{ - Assert((size_t)p % 4 == 0); - return _InterlockedCompareExchange(p, comperand, value); -} inline void ThreadSleep(unsigned nMilliseconds) { #ifdef _WIN32 @@ -38,6 +33,169 @@ inline void ThreadSleep(unsigned nMilliseconds) usleep(nMilliseconds * 1000); #endif } +inline void ThreadPause() +{ +#if defined( COMPILER_PS3 ) + __db16cyc(); +#elif defined( COMPILER_GCC ) + __asm __volatile("pause"); +#elif defined ( COMPILER_MSVC64 ) + _mm_pause(); +#elif defined( COMPILER_MSVC32 ) + __asm pause; +#elif defined( COMPILER_MSVCX360 ) + YieldProcessor(); + __asm { or r0, r0, r0 } + YieldProcessor(); + __asm { or r1, r1, r1 } +#else +#error "implement me" +#endif +} +LONG 
ThreadInterlockedCompareExchange64(LONG volatile* pDest, int64 value, int64 comperand); +bool ThreadInterlockedAssignIf(LONG volatile* p, int32 value, int32 comperand); +int64 ThreadInterlockedCompareExchange64(int64 volatile* pDest, int64 value, int64 comperand); +bool ThreadInterlockedAssignIf64(int64 volatile* pDest, int64 value, int64 comperand); + +//----------------------------------------------------------------------------- +// +// Interlock methods. These perform very fast atomic thread +// safe operations. These are especially relevant in a multi-core setting. +// +//----------------------------------------------------------------------------- + +#ifdef _WIN32 +#define NOINLINE +#elif defined( _PS3 ) +#define NOINLINE __attribute__ ((noinline)) +#elif defined(POSIX) +#define NOINLINE __attribute__ ((noinline)) +#endif + +#if defined( _X360 ) || defined( _PS3 ) +#define ThreadMemoryBarrier() __lwsync() +#elif defined(COMPILER_MSVC) +// Prevent compiler reordering across this barrier. This is +// sufficient for most purposes on x86/x64. +#define ThreadMemoryBarrier() _ReadWriteBarrier() +#elif defined(COMPILER_GCC) +// Prevent compiler reordering across this barrier. This is +// sufficient for most purposes on x86/x64. +// http://preshing.com/20120625/memory-ordering-at-compile-time +#define ThreadMemoryBarrier() asm volatile("" ::: "memory") +#else +#error Every platform needs to define ThreadMemoryBarrier to at least prevent compiler reordering +#endif + +//----------------------------------------------------------------------------- +// +// A super-fast thread-safe integer A simple class encapsulating the notion of an +// atomic integer used across threads that uses the built in and faster +// "interlocked" functionality rather than a full-blown mutex. Useful for simple +// things like reference counts, etc. 
+// +//----------------------------------------------------------------------------- + +template< typename T > +class CInterlockedIntT +{ +public: + CInterlockedIntT() : m_value(0) { static_assert((sizeof(T) == sizeof(int32)) || (sizeof(T) == sizeof(int64))); } + + CInterlockedIntT(T value) : m_value(value) {} + + T operator()(void) const { return m_value; } + operator T() const { return m_value; } + + bool operator!() const { return (m_value == 0); } + bool operator==(T rhs) const { return (m_value == rhs); } + bool operator!=(T rhs) const { return (m_value != rhs); } + + T operator++() { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedIncrement((int32*)&m_value); + else + return (T)ThreadInterlockedIncrement64((int64*)&m_value); + } + T operator++(int) { return operator++() - 1; } + + T operator--() { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedDecrement((int32*)&m_value); + else + return (T)ThreadInterlockedDecrement64((int64*)&m_value); + } + + T operator--(int) { return operator--() + 1; } + + bool AssignIf(T conditionValue, T newValue) + { + if (sizeof(T) == sizeof(int32)) + return ThreadInterlockedAssignIf((LONG*)&m_value, (int32)newValue, (int32)conditionValue); + else + return ThreadInterlockedAssignIf64((int64*)&m_value, (int64)newValue, (int64)conditionValue); + } + + + T operator=(T newValue) { + if (sizeof(T) == sizeof(int32)) + ThreadInterlockedExchange((int32*)&m_value, newValue); + else + ThreadInterlockedExchange64((int64*)&m_value, newValue); + return m_value; + } + + // Atomic add is like += except it returns the previous value as its return value + T AtomicAdd(T add) { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedExchangeAdd((int32*)&m_value, (int32)add); + else + return (T)ThreadInterlockedExchangeAdd64((int64*)&m_value, (int64)add); + } + + + void operator+=(T add) { + if (sizeof(T) == sizeof(int32)) + ThreadInterlockedExchangeAdd((int32*)&m_value, (int32)add); + else + 
ThreadInterlockedExchangeAdd64((int64*)&m_value, (int64)add); + } + + void operator-=(T subtract) { operator+=(-subtract); } + void operator*=(T multiplier) { + T original, result; + do + { + original = m_value; + result = original * multiplier; + } while (!AssignIf(original, result)); + } + void operator/=(T divisor) { + T original, result; + do + { + original = m_value; + result = original / divisor; + } while (!AssignIf(original, result)); + } + + T operator+(T rhs) const { return m_value + rhs; } + T operator-(T rhs) const { return m_value - rhs; } + + T InterlockedExchange(T newValue) { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedExchange((int32*)&m_value, newValue); + else + return (T)ThreadInterlockedExchange64((int64*)&m_value, newValue); + } + +private: + volatile T m_value; +}; + +typedef CInterlockedIntT CInterlockedInt; +typedef CInterlockedIntT CInterlockedUInt; + //============================================================================= class CThreadFastMutex; diff --git a/r5dev/vproj/clientsdk.vcxproj b/r5dev/vproj/clientsdk.vcxproj index 55990659..cd3f7e24 100644 --- a/r5dev/vproj/clientsdk.vcxproj +++ b/r5dev/vproj/clientsdk.vcxproj @@ -11,6 +11,7 @@ + @@ -70,6 +71,8 @@ + + @@ -102,6 +105,7 @@ + @@ -201,6 +205,7 @@ + @@ -210,9 +215,9 @@ - + diff --git a/r5dev/vproj/clientsdk.vcxproj.filters b/r5dev/vproj/clientsdk.vcxproj.filters index 07a3e855..9151d79a 100644 --- a/r5dev/vproj/clientsdk.vcxproj.filters +++ b/r5dev/vproj/clientsdk.vcxproj.filters @@ -211,6 +211,9 @@ {01d3645a-16c3-4910-ac95-049e112cd2b8} + + {57e1f0c7-ce4f-4576-960e-0cd15b2b5092} + @@ -546,6 +549,18 @@ sdk\tier2 + + sdk\bonesetup + + + sdk\mathlib + + + sdk\mathlib + + + sdk\tier0 + @@ -1565,9 +1580,6 @@ sdk\mathlib - - sdk\mathlib - sdk\mathlib @@ -1598,6 +1610,12 @@ sdk\tier2 + + sdk\mathlib + + + sdk\mathlib + diff --git a/r5dev/vproj/dedicated.vcxproj b/r5dev/vproj/dedicated.vcxproj index 88fa8e03..3c9e74e5 100644 --- a/r5dev/vproj/dedicated.vcxproj +++ 
b/r5dev/vproj/dedicated.vcxproj @@ -188,6 +188,7 @@ + @@ -197,9 +198,9 @@ - + @@ -448,6 +449,7 @@ + @@ -501,6 +503,8 @@ + + @@ -532,6 +536,7 @@ + diff --git a/r5dev/vproj/dedicated.vcxproj.filters b/r5dev/vproj/dedicated.vcxproj.filters index 8fa542eb..ed637cb0 100644 --- a/r5dev/vproj/dedicated.vcxproj.filters +++ b/r5dev/vproj/dedicated.vcxproj.filters @@ -187,6 +187,9 @@ {98975892-5379-4f6c-8c7e-35d92d2bc5e5} + + {d49ec580-58c2-49e7-8e83-957da576febd} + @@ -1131,9 +1134,6 @@ sdk\mathlib - - sdk\mathlib - sdk\mathlib @@ -1161,6 +1161,12 @@ sdk\vstdlib + + sdk\mathlib + + + sdk\mathlib + @@ -1448,6 +1454,18 @@ sdk\vstdlib + + sdk\bonesetup + + + sdk\mathlib + + + sdk\mathlib + + + sdk\tier0 + diff --git a/r5dev/vproj/gamesdk.vcxproj b/r5dev/vproj/gamesdk.vcxproj index 3bd4a075..bfc9a4a9 100644 --- a/r5dev/vproj/gamesdk.vcxproj +++ b/r5dev/vproj/gamesdk.vcxproj @@ -11,6 +11,7 @@ + @@ -76,6 +77,8 @@ + + @@ -109,6 +112,7 @@ + @@ -219,6 +223,8 @@ + + @@ -228,9 +234,9 @@ - + diff --git a/r5dev/vproj/gamesdk.vcxproj.filters b/r5dev/vproj/gamesdk.vcxproj.filters index 3fd46497..e2d08203 100644 --- a/r5dev/vproj/gamesdk.vcxproj.filters +++ b/r5dev/vproj/gamesdk.vcxproj.filters @@ -220,6 +220,9 @@ {b7e33427-fd37-44b1-8530-651ae5f4fde1} + + {acbd4b45-6a8d-4d9f-9747-1bc460481bb4} + @@ -576,6 +579,18 @@ sdk\tier2 + + sdk\bonesetup + + + sdk\mathlib + + + sdk\mathlib + + + sdk\tier0 + @@ -1637,9 +1652,6 @@ sdk\mathlib - - sdk\mathlib - sdk\mathlib @@ -1667,6 +1679,15 @@ sdk\tier2 + + sdk\mathlib + + + sdk\mathlib + + + sdk\mathlib + diff --git a/r5dev/vstdlib/random.h b/r5dev/vstdlib/random.h index d63d22ee..daded0d5 100644 --- a/r5dev/vstdlib/random.h +++ b/r5dev/vstdlib/random.h @@ -10,7 +10,6 @@ #define VSTDLIB_RANDOM_H #include "tier0/basetypes.h" -#include "tier0/threadtools.h" #define NTAB 32