diff --git a/r5dev/bonesetup/bone_utils.cpp b/r5dev/bonesetup/bone_utils.cpp new file mode 100644 index 00000000..6154b971 --- /dev/null +++ b/r5dev/bonesetup/bone_utils.cpp @@ -0,0 +1,101 @@ +//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======// +// +// Purpose: +// +// $NoKeywords: $ +// +//===========================================================================// + +#include "core/stdafx.h" +#include "mathlib/mathlib.h" + +//----------------------------------------------------------------------------- +// Purpose: qt = ( s * p ) * q +//----------------------------------------------------------------------------- +void QuaternionSM(float s, const Quaternion& p, const Quaternion& q, Quaternion& qt) +{ + Quaternion p1, q1; + + QuaternionScale(p, s, p1); + QuaternionMult(p1, q, q1); + QuaternionNormalize(q1); + qt[0] = q1[0]; + qt[1] = q1[1]; + qt[2] = q1[2]; + qt[3] = q1[3]; +} + +#if ALLOW_SIMD_QUATERNION_MATH +FORCEINLINE fltx4 QuaternionSMSIMD(const fltx4& s, const fltx4& p, const fltx4& q) +{ + fltx4 p1, q1, result; + p1 = QuaternionScaleSIMD(p, s); + q1 = QuaternionMultSIMD(p1, q); + result = QuaternionNormalizeSIMD(q1); + return result; +} + +FORCEINLINE fltx4 QuaternionSMSIMD(float s, const fltx4& p, const fltx4& q) +{ + return QuaternionSMSIMD(ReplicateX4(s), p, q); +} +#endif + +//----------------------------------------------------------------------------- +// Purpose: qt = p * ( s * q ) +//----------------------------------------------------------------------------- +void QuaternionMA(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt) +{ + Quaternion p1, q1; + + QuaternionScale(q, s, q1); + QuaternionMult(p, q1, p1); + QuaternionNormalize(p1); + qt[0] = p1[0]; + qt[1] = p1[1]; + qt[2] = p1[2]; + qt[3] = p1[3]; +} + +#if ALLOW_SIMD_QUATERNION_MATH + +FORCEINLINE fltx4 QuaternionMASIMD(const fltx4& p, const fltx4& s, const fltx4& q) +{ + fltx4 p1, q1, result; + q1 = QuaternionScaleSIMD(q, s); + p1 = 
QuaternionMultSIMD(p, q1); + result = QuaternionNormalizeSIMD(p1); + return result; +} + +FORCEINLINE fltx4 QuaternionMASIMD(const fltx4& p, float s, const fltx4& q) +{ + return QuaternionMASIMD(p, ReplicateX4(s), q); +} +#endif + + +//----------------------------------------------------------------------------- +// Purpose: qt = p + s * q +//----------------------------------------------------------------------------- +void QuaternionAccumulate(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt) +{ + Quaternion q2; + QuaternionAlign(p, q, q2); + + qt[0] = p[0] + s * q2[0]; + qt[1] = p[1] + s * q2[1]; + qt[2] = p[2] + s * q2[2]; + qt[3] = p[3] + s * q2[3]; +} + +#if ALLOW_SIMD_QUATERNION_MATH +FORCEINLINE fltx4 QuaternionAccumulateSIMD(const fltx4& p, float s, const fltx4& q) +{ + fltx4 q2, s4, result; + q2 = QuaternionAlignSIMD(p, q); + s4 = ReplicateX4(s); + result = MaddSIMD(s4, q2, p); + return result; +} +#endif diff --git a/r5dev/core/init.cpp b/r5dev/core/init.cpp index b672b445..75568b4c 100644 --- a/r5dev/core/init.cpp +++ b/r5dev/core/init.cpp @@ -35,6 +35,7 @@ #ifndef DEDICATED #include "milessdk/win64_rrthreads.h" #endif // !DEDICATED +#include "mathlib/mathlib.h" #include "vphysics/QHull.h" #include "bsplib/bsplib.h" #include "materialsystem/cmaterialsystem.h" @@ -118,9 +119,10 @@ void Systems_Init() { spdlog::info("+-------------------------------------------------------------+\n"); QuerySystemInfo(); - CFastTimer initTimer; + CFastTimer initTimer; initTimer.Start(); + for (IDetour* pDetour : vDetour) { pDetour->GetCon(); @@ -128,13 +130,14 @@ void Systems_Init() pDetour->GetVar(); } initTimer.End(); + spdlog::info("+-------------------------------------------------------------+\n"); spdlog::info("Detour->Init() '{:10.6f}' seconds ('{:12d}' clocks)\n", initTimer.GetDuration().GetSeconds(), initTimer.GetDuration().GetCycles()); initTimer.Start(); - // Initialize WinSock system. - WS_Init(); + WS_Init(); // Initialize WinSock. 
+ MathLib_Init(); // Initialize MathLib. // Begin the detour transaction to hook the the process DetourTransactionBegin(); @@ -404,11 +407,14 @@ void QuerySystemInfo() std::system_category().message(static_cast(::GetLastError()))); } - if (!(pi.m_bSSE && pi.m_bSSE2)) + if (!s_bMathlibInitialized) { - if (MessageBoxA(NULL, "SSE and SSE2 are required.", "Unsupported CPU", MB_ICONERROR | MB_OK)) + if (!(pi.m_bSSE && pi.m_bSSE2)) { - TerminateProcess(GetCurrentProcess(), 0xBAD0C0DE); + if (MessageBoxA(NULL, "SSE and SSE2 are required.", "Unsupported CPU", MB_ICONERROR | MB_OK)) + { + TerminateProcess(GetCurrentProcess(), 0xBAD0C0DE); + } } } } diff --git a/r5dev/mathlib/almostequal.cpp b/r5dev/mathlib/almostequal.cpp index 76f99a74..01865efa 100644 --- a/r5dev/mathlib/almostequal.cpp +++ b/r5dev/mathlib/almostequal.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright © 1996-2008, Valve Corporation, All rights reserved. ============// // // Purpose: Fast ways to compare equality of two floats. Assumes // sizeof(float) == sizeof(int) and we are using IEEE format. diff --git a/r5dev/mathlib/color_conversion.cpp b/r5dev/mathlib/color_conversion.cpp index 37f03fe4..ab7d87b0 100644 --- a/r5dev/mathlib/color_conversion.cpp +++ b/r5dev/mathlib/color_conversion.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: Color conversion routines. // @@ -34,71 +34,71 @@ static float g_Mathlib_LinearToGamma[256]; // linear (0..1) to gamma (0..1) // TODO: move this into the one DLL that actually uses it, instead of statically // linking it everywhere via mathlib. 
ALIGN128 float power2_n[256] = // 2**(index - 128) / 255 -{ - 1.152445441982634800E-041, 2.304890883965269600E-041, 4.609781767930539200E-041, 9.219563535861078400E-041, +{ + 1.152445441982634800E-041, 2.304890883965269600E-041, 4.609781767930539200E-041, 9.219563535861078400E-041, 1.843912707172215700E-040, 3.687825414344431300E-040, 7.375650828688862700E-040, 1.475130165737772500E-039, - 2.950260331475545100E-039, 5.900520662951090200E-039, 1.180104132590218000E-038, 2.360208265180436100E-038, - 4.720416530360872100E-038, 9.440833060721744200E-038, 1.888166612144348800E-037, 3.776333224288697700E-037, - 7.552666448577395400E-037, 1.510533289715479100E-036, 3.021066579430958200E-036, 6.042133158861916300E-036, - 1.208426631772383300E-035, 2.416853263544766500E-035, 4.833706527089533100E-035, 9.667413054179066100E-035, - 1.933482610835813200E-034, 3.866965221671626400E-034, 7.733930443343252900E-034, 1.546786088668650600E-033, - 3.093572177337301200E-033, 6.187144354674602300E-033, 1.237428870934920500E-032, 2.474857741869840900E-032, - 4.949715483739681800E-032, 9.899430967479363700E-032, 1.979886193495872700E-031, 3.959772386991745500E-031, - 7.919544773983491000E-031, 1.583908954796698200E-030, 3.167817909593396400E-030, 6.335635819186792800E-030, - 1.267127163837358600E-029, 2.534254327674717100E-029, 5.068508655349434200E-029, 1.013701731069886800E-028, - 2.027403462139773700E-028, 4.054806924279547400E-028, 8.109613848559094700E-028, 1.621922769711818900E-027, - 3.243845539423637900E-027, 6.487691078847275800E-027, 1.297538215769455200E-026, 2.595076431538910300E-026, - 5.190152863077820600E-026, 1.038030572615564100E-025, 2.076061145231128300E-025, 4.152122290462256500E-025, - 8.304244580924513000E-025, 1.660848916184902600E-024, 3.321697832369805200E-024, 6.643395664739610400E-024, - 1.328679132947922100E-023, 2.657358265895844200E-023, 5.314716531791688300E-023, 1.062943306358337700E-022, - 2.125886612716675300E-022, 4.251773225433350700E-022, 
8.503546450866701300E-022, 1.700709290173340300E-021, - 3.401418580346680500E-021, 6.802837160693361100E-021, 1.360567432138672200E-020, 2.721134864277344400E-020, - 5.442269728554688800E-020, 1.088453945710937800E-019, 2.176907891421875500E-019, 4.353815782843751100E-019, - 8.707631565687502200E-019, 1.741526313137500400E-018, 3.483052626275000900E-018, 6.966105252550001700E-018, - 1.393221050510000300E-017, 2.786442101020000700E-017, 5.572884202040001400E-017, 1.114576840408000300E-016, - 2.229153680816000600E-016, 4.458307361632001100E-016, 8.916614723264002200E-016, 1.783322944652800400E-015, - 3.566645889305600900E-015, 7.133291778611201800E-015, 1.426658355722240400E-014, 2.853316711444480700E-014, - 5.706633422888961400E-014, 1.141326684577792300E-013, 2.282653369155584600E-013, 4.565306738311169100E-013, - 9.130613476622338300E-013, 1.826122695324467700E-012, 3.652245390648935300E-012, 7.304490781297870600E-012, - 1.460898156259574100E-011, 2.921796312519148200E-011, 5.843592625038296500E-011, 1.168718525007659300E-010, - 2.337437050015318600E-010, 4.674874100030637200E-010, 9.349748200061274400E-010, 1.869949640012254900E-009, - 3.739899280024509800E-009, 7.479798560049019500E-009, 1.495959712009803900E-008, 2.991919424019607800E-008, - 5.983838848039215600E-008, 1.196767769607843100E-007, 2.393535539215686200E-007, 4.787071078431372500E-007, - 9.574142156862745000E-007, 1.914828431372549000E-006, 3.829656862745098000E-006, 7.659313725490196000E-006, - 1.531862745098039200E-005, 3.063725490196078400E-005, 6.127450980392156800E-005, 1.225490196078431400E-004, - 2.450980392156862700E-004, 4.901960784313725400E-004, 9.803921568627450800E-004, 1.960784313725490200E-003, - 3.921568627450980300E-003, 7.843137254901960700E-003, 1.568627450980392100E-002, 3.137254901960784300E-002, - 6.274509803921568500E-002, 1.254901960784313700E-001, 2.509803921568627400E-001, 5.019607843137254800E-001, - 1.003921568627451000E+000, 2.007843137254901900E+000, 
4.015686274509803900E+000, 8.031372549019607700E+000, - 1.606274509803921500E+001, 3.212549019607843100E+001, 6.425098039215686200E+001, 1.285019607843137200E+002, - 2.570039215686274500E+002, 5.140078431372548900E+002, 1.028015686274509800E+003, 2.056031372549019600E+003, - 4.112062745098039200E+003, 8.224125490196078300E+003, 1.644825098039215700E+004, 3.289650196078431300E+004, - 6.579300392156862700E+004, 1.315860078431372500E+005, 2.631720156862745100E+005, 5.263440313725490100E+005, - 1.052688062745098000E+006, 2.105376125490196000E+006, 4.210752250980392100E+006, 8.421504501960784200E+006, - 1.684300900392156800E+007, 3.368601800784313700E+007, 6.737203601568627400E+007, 1.347440720313725500E+008, - 2.694881440627450900E+008, 5.389762881254901900E+008, 1.077952576250980400E+009, 2.155905152501960800E+009, - 4.311810305003921500E+009, 8.623620610007843000E+009, 1.724724122001568600E+010, 3.449448244003137200E+010, - 6.898896488006274400E+010, 1.379779297601254900E+011, 2.759558595202509800E+011, 5.519117190405019500E+011, - 1.103823438081003900E+012, 2.207646876162007800E+012, 4.415293752324015600E+012, 8.830587504648031200E+012, - 1.766117500929606200E+013, 3.532235001859212500E+013, 7.064470003718425000E+013, 1.412894000743685000E+014, - 2.825788001487370000E+014, 5.651576002974740000E+014, 1.130315200594948000E+015, 2.260630401189896000E+015, - 4.521260802379792000E+015, 9.042521604759584000E+015, 1.808504320951916800E+016, 3.617008641903833600E+016, - 7.234017283807667200E+016, 1.446803456761533400E+017, 2.893606913523066900E+017, 5.787213827046133800E+017, - 1.157442765409226800E+018, 2.314885530818453500E+018, 4.629771061636907000E+018, 9.259542123273814000E+018, - 1.851908424654762800E+019, 3.703816849309525600E+019, 7.407633698619051200E+019, 1.481526739723810200E+020, - 2.963053479447620500E+020, 5.926106958895241000E+020, 1.185221391779048200E+021, 2.370442783558096400E+021, - 4.740885567116192800E+021, 9.481771134232385600E+021, 
1.896354226846477100E+022, 3.792708453692954200E+022, - 7.585416907385908400E+022, 1.517083381477181700E+023, 3.034166762954363400E+023, 6.068333525908726800E+023, - 1.213666705181745400E+024, 2.427333410363490700E+024, 4.854666820726981400E+024, 9.709333641453962800E+024, - 1.941866728290792600E+025, 3.883733456581585100E+025, 7.767466913163170200E+025, 1.553493382632634000E+026, - 3.106986765265268100E+026, 6.213973530530536200E+026, 1.242794706106107200E+027, 2.485589412212214500E+027, - 4.971178824424429000E+027, 9.942357648848857900E+027, 1.988471529769771600E+028, 3.976943059539543200E+028, - 7.953886119079086300E+028, 1.590777223815817300E+029, 3.181554447631634500E+029, 6.363108895263269100E+029, - 1.272621779052653800E+030, 2.545243558105307600E+030, 5.090487116210615300E+030, 1.018097423242123100E+031, - 2.036194846484246100E+031, 4.072389692968492200E+031, 8.144779385936984400E+031, 1.628955877187396900E+032, - 3.257911754374793800E+032, 6.515823508749587500E+032, 1.303164701749917500E+033, 2.606329403499835000E+033, - 5.212658806999670000E+033, 1.042531761399934000E+034, 2.085063522799868000E+034, 4.170127045599736000E+034, - 8.340254091199472000E+034, 1.668050818239894400E+035, 3.336101636479788800E+035, 6.672203272959577600E+035 + 2.950260331475545100E-039, 5.900520662951090200E-039, 1.180104132590218000E-038, 2.360208265180436100E-038, + 4.720416530360872100E-038, 9.440833060721744200E-038, 1.888166612144348800E-037, 3.776333224288697700E-037, + 7.552666448577395400E-037, 1.510533289715479100E-036, 3.021066579430958200E-036, 6.042133158861916300E-036, + 1.208426631772383300E-035, 2.416853263544766500E-035, 4.833706527089533100E-035, 9.667413054179066100E-035, + 1.933482610835813200E-034, 3.866965221671626400E-034, 7.733930443343252900E-034, 1.546786088668650600E-033, + 3.093572177337301200E-033, 6.187144354674602300E-033, 1.237428870934920500E-032, 2.474857741869840900E-032, + 4.949715483739681800E-032, 9.899430967479363700E-032, 
1.979886193495872700E-031, 3.959772386991745500E-031, + 7.919544773983491000E-031, 1.583908954796698200E-030, 3.167817909593396400E-030, 6.335635819186792800E-030, + 1.267127163837358600E-029, 2.534254327674717100E-029, 5.068508655349434200E-029, 1.013701731069886800E-028, + 2.027403462139773700E-028, 4.054806924279547400E-028, 8.109613848559094700E-028, 1.621922769711818900E-027, + 3.243845539423637900E-027, 6.487691078847275800E-027, 1.297538215769455200E-026, 2.595076431538910300E-026, + 5.190152863077820600E-026, 1.038030572615564100E-025, 2.076061145231128300E-025, 4.152122290462256500E-025, + 8.304244580924513000E-025, 1.660848916184902600E-024, 3.321697832369805200E-024, 6.643395664739610400E-024, + 1.328679132947922100E-023, 2.657358265895844200E-023, 5.314716531791688300E-023, 1.062943306358337700E-022, + 2.125886612716675300E-022, 4.251773225433350700E-022, 8.503546450866701300E-022, 1.700709290173340300E-021, + 3.401418580346680500E-021, 6.802837160693361100E-021, 1.360567432138672200E-020, 2.721134864277344400E-020, + 5.442269728554688800E-020, 1.088453945710937800E-019, 2.176907891421875500E-019, 4.353815782843751100E-019, + 8.707631565687502200E-019, 1.741526313137500400E-018, 3.483052626275000900E-018, 6.966105252550001700E-018, + 1.393221050510000300E-017, 2.786442101020000700E-017, 5.572884202040001400E-017, 1.114576840408000300E-016, + 2.229153680816000600E-016, 4.458307361632001100E-016, 8.916614723264002200E-016, 1.783322944652800400E-015, + 3.566645889305600900E-015, 7.133291778611201800E-015, 1.426658355722240400E-014, 2.853316711444480700E-014, + 5.706633422888961400E-014, 1.141326684577792300E-013, 2.282653369155584600E-013, 4.565306738311169100E-013, + 9.130613476622338300E-013, 1.826122695324467700E-012, 3.652245390648935300E-012, 7.304490781297870600E-012, + 1.460898156259574100E-011, 2.921796312519148200E-011, 5.843592625038296500E-011, 1.168718525007659300E-010, + 2.337437050015318600E-010, 4.674874100030637200E-010, 
9.349748200061274400E-010, 1.869949640012254900E-009, + 3.739899280024509800E-009, 7.479798560049019500E-009, 1.495959712009803900E-008, 2.991919424019607800E-008, + 5.983838848039215600E-008, 1.196767769607843100E-007, 2.393535539215686200E-007, 4.787071078431372500E-007, + 9.574142156862745000E-007, 1.914828431372549000E-006, 3.829656862745098000E-006, 7.659313725490196000E-006, + 1.531862745098039200E-005, 3.063725490196078400E-005, 6.127450980392156800E-005, 1.225490196078431400E-004, + 2.450980392156862700E-004, 4.901960784313725400E-004, 9.803921568627450800E-004, 1.960784313725490200E-003, + 3.921568627450980300E-003, 7.843137254901960700E-003, 1.568627450980392100E-002, 3.137254901960784300E-002, + 6.274509803921568500E-002, 1.254901960784313700E-001, 2.509803921568627400E-001, 5.019607843137254800E-001, + 1.003921568627451000E+000, 2.007843137254901900E+000, 4.015686274509803900E+000, 8.031372549019607700E+000, + 1.606274509803921500E+001, 3.212549019607843100E+001, 6.425098039215686200E+001, 1.285019607843137200E+002, + 2.570039215686274500E+002, 5.140078431372548900E+002, 1.028015686274509800E+003, 2.056031372549019600E+003, + 4.112062745098039200E+003, 8.224125490196078300E+003, 1.644825098039215700E+004, 3.289650196078431300E+004, + 6.579300392156862700E+004, 1.315860078431372500E+005, 2.631720156862745100E+005, 5.263440313725490100E+005, + 1.052688062745098000E+006, 2.105376125490196000E+006, 4.210752250980392100E+006, 8.421504501960784200E+006, + 1.684300900392156800E+007, 3.368601800784313700E+007, 6.737203601568627400E+007, 1.347440720313725500E+008, + 2.694881440627450900E+008, 5.389762881254901900E+008, 1.077952576250980400E+009, 2.155905152501960800E+009, + 4.311810305003921500E+009, 8.623620610007843000E+009, 1.724724122001568600E+010, 3.449448244003137200E+010, + 6.898896488006274400E+010, 1.379779297601254900E+011, 2.759558595202509800E+011, 5.519117190405019500E+011, + 1.103823438081003900E+012, 2.207646876162007800E+012, 
4.415293752324015600E+012, 8.830587504648031200E+012, + 1.766117500929606200E+013, 3.532235001859212500E+013, 7.064470003718425000E+013, 1.412894000743685000E+014, + 2.825788001487370000E+014, 5.651576002974740000E+014, 1.130315200594948000E+015, 2.260630401189896000E+015, + 4.521260802379792000E+015, 9.042521604759584000E+015, 1.808504320951916800E+016, 3.617008641903833600E+016, + 7.234017283807667200E+016, 1.446803456761533400E+017, 2.893606913523066900E+017, 5.787213827046133800E+017, + 1.157442765409226800E+018, 2.314885530818453500E+018, 4.629771061636907000E+018, 9.259542123273814000E+018, + 1.851908424654762800E+019, 3.703816849309525600E+019, 7.407633698619051200E+019, 1.481526739723810200E+020, + 2.963053479447620500E+020, 5.926106958895241000E+020, 1.185221391779048200E+021, 2.370442783558096400E+021, + 4.740885567116192800E+021, 9.481771134232385600E+021, 1.896354226846477100E+022, 3.792708453692954200E+022, + 7.585416907385908400E+022, 1.517083381477181700E+023, 3.034166762954363400E+023, 6.068333525908726800E+023, + 1.213666705181745400E+024, 2.427333410363490700E+024, 4.854666820726981400E+024, 9.709333641453962800E+024, + 1.941866728290792600E+025, 3.883733456581585100E+025, 7.767466913163170200E+025, 1.553493382632634000E+026, + 3.106986765265268100E+026, 6.213973530530536200E+026, 1.242794706106107200E+027, 2.485589412212214500E+027, + 4.971178824424429000E+027, 9.942357648848857900E+027, 1.988471529769771600E+028, 3.976943059539543200E+028, + 7.953886119079086300E+028, 1.590777223815817300E+029, 3.181554447631634500E+029, 6.363108895263269100E+029, + 1.272621779052653800E+030, 2.545243558105307600E+030, 5.090487116210615300E+030, 1.018097423242123100E+031, + 2.036194846484246100E+031, 4.072389692968492200E+031, 8.144779385936984400E+031, 1.628955877187396900E+032, + 3.257911754374793800E+032, 6.515823508749587500E+032, 1.303164701749917500E+033, 2.606329403499835000E+033, + 5.212658806999670000E+033, 1.042531761399934000E+034, 
2.085063522799868000E+034, 4.170127045599736000E+034, + 8.340254091199472000E+034, 1.668050818239894400E+035, 3.336101636479788800E+035, 6.672203272959577600E+035 }; // You can use this to double check the exponent table and assert that @@ -108,20 +108,20 @@ ALIGN128 float power2_n[256] = // 2**(index - 128) / 255 #pragma warning( disable : 4189 ) // disable unused local variable warning static void CheckExponentTable() { - for( int i = 0; i < 256; i++ ) + for (int i = 0; i < 256; i++) { - float testAgainst = pow( 2.0f, i - 128 ) / 255.0f; - float diff = testAgainst - power2_n[i] ; + float testAgainst = pow(2.0f, i - 128) / 255.0f; + float diff = testAgainst - power2_n[i]; float relativeDiff = diff / testAgainst; - Assert( testAgainst == 0 ? - power2_n[i] < 1.16E-041 : - power2_n[i] == testAgainst ); + Assert(testAgainst == 0 ? + power2_n[i] < 1.16E-041 : + power2_n[i] == testAgainst); } } #pragma warning(pop) #endif -void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright ) +void BuildGammaTable(float gamma, float texGamma, float brightness, int overbright) { int i, inf; float g1, g3; @@ -129,30 +129,30 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // Con_Printf("BuildGammaTable %.1f %.1f %.1f\n", g, v_lightgamma.GetFloat(), v_texgamma.GetFloat() ); float g = gamma; - if (g > 3.0) + if (g > 3.0) { g = 3.0; } g = 1.0 / g; - g1 = texGamma * g; + g1 = texGamma * g; - if (brightness <= 0.0) + if (brightness <= 0.0) { g3 = 0.125; } - else if (brightness > 1.0) + else if (brightness > 1.0) { g3 = 0.05; } - else + else { g3 = 0.125 - (brightness * brightness) * 0.075; } - for (i=0 ; i<256 ; i++) + for (i = 0; i < 256; i++) { - inf = 255 * pow ( i/255.f, g1 ); + inf = (int)(255 * pow(i / 255.f, g1)); if (inf < 0) inf = 0; if (inf > 255) @@ -160,7 +160,7 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri texgammatable[i] = inf; } - for (i=0 ; i<1024 ; i++) + for (i = 0; i < 
1024; i++) { float f; @@ -173,11 +173,11 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // shift up if (f <= g3) f = (f / g3) * 0.125; - else + else f = 0.125 + ((f - g3) / (1.0 - g3)) * 0.875; // convert linear space to desired gamma space - inf = 255 * pow ( f, g ); + inf = (int)(255 * pow(f, g)); if (inf < 0) inf = 0; @@ -196,32 +196,32 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri } */ - for (i=0 ; i<256 ; i++) + for (i = 0; i < 256; i++) { // convert from nonlinear texture space (0..255) to linear space (0..1) - texturetolinear[i] = pow( i / 255.f, texGamma ); + texturetolinear[i] = pow(i / 255.f, texGamma); // convert from linear space (0..1) to nonlinear (sRGB) space (0..1) - g_Mathlib_LinearToGamma[i] = LinearToGammaFullRange( i / 255.f ); + g_Mathlib_LinearToGamma[i] = LinearToGammaFullRange(i / 255.f); // convert from sRGB gamma space (0..1) to linear space (0..1) - g_Mathlib_GammaToLinear[i] = GammaToLinearFullRange( i / 255.f ); + g_Mathlib_GammaToLinear[i] = GammaToLinearFullRange(i / 255.f); } - for (i=0 ; i<1024 ; i++) + for (i = 0; i < 1024; i++) { // convert from linear space (0..1) to nonlinear texture space (0..255) - lineartotexture[i] = pow( i / 1023.0, 1.0 / texGamma ) * 255; + lineartotexture[i] = (int)(pow(i / 1023.0, 1.0 / texGamma) * 255); } #if 0 - for (i=0 ; i<256 ; i++) + for (i = 0; i < 256; i++) { float f; // convert from nonlinear lightmap space (0..255) to linear space (0..4) // f = (i / 255.0) * sqrt( 4 ); - f = i * (2.0 / 255.0); + f = i * (2.0 / 255.0); f = f * f; texlighttolinear[i] = f; @@ -234,50 +234,50 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // Can't do overbright without texcombine // UNDONE: Add GAMMA ramp to rectify this - if ( overbright == 2 ) + if (overbright == 2) { overbrightFactor = 0.5; } - else if ( overbright == 4 ) + else if (overbright == 4) { overbrightFactor = 0.25; } - for (i=0 ; i<4096 ; i++) 
+ for (i = 0; i < 4096; i++) { // convert from linear 0..4 (x1024) to screen corrected vertex space (0..1?) - f = pow ( i/1024.0, 1.0 / gamma ); + f = pow(i / 1024.0, 1.0 / gamma); lineartovertex[i] = f * overbrightFactor; if (lineartovertex[i] > 1) lineartovertex[i] = 1; - int nLightmap = RoundFloatToInt( f * 255 * overbrightFactor ); - nLightmap = clamp( nLightmap, 0, 255 ); + int nLightmap = RoundFloatToInt(f * 255 * overbrightFactor); + nLightmap = clamp(nLightmap, 0, 255); lineartolightmap[i] = (unsigned char)nLightmap; } } } -float GammaToLinearFullRange( float gamma ) +float GammaToLinearFullRange(float gamma) { - return pow( gamma, 2.2f ); + return pow(gamma, 2.2f); } -float LinearToGammaFullRange( float linear ) +float LinearToGammaFullRange(float linear) { - return pow( linear, 1.0f / 2.2f ); + return pow(linear, 1.0f / 2.2f); } -float GammaToLinear( float gamma ) +float GammaToLinear(float gamma) { - Assert( s_bMathlibInitialized ); - if ( gamma < 0.0f ) + Assert(s_bMathlibInitialized); + if (gamma < 0.0f) { return 0.0f; } - if ( gamma >= 0.95f ) + if (gamma >= 0.95f) { // Use GammaToLinearFullRange maybe if you trip this. // X360TEMP @@ -285,129 +285,129 @@ float GammaToLinear( float gamma ) return 1.0f; } - int index = RoundFloatToInt( gamma * 255.0f ); - Assert( index >= 0 && index < 256 ); + int index = RoundFloatToInt(gamma * 255.0f); + Assert(index >= 0 && index < 256); return g_Mathlib_GammaToLinear[index]; } -float LinearToGamma( float linear ) +float LinearToGamma(float linear) { - Assert( s_bMathlibInitialized ); - if ( linear < 0.0f ) + Assert(s_bMathlibInitialized); + if (linear < 0.0f) { return 0.0f; } - if ( linear > 1.0f ) + if (linear > 1.0f) { // Use LinearToGammaFullRange maybe if you trip this. 
- Assert( 0 ); + Assert(0); return 1.0f; } - int index = RoundFloatToInt( linear * 255.0f ); - Assert( index >= 0 && index < 256 ); + int index = RoundFloatToInt(linear * 255.0f); + Assert(index >= 0 && index < 256); return g_Mathlib_LinearToGamma[index]; } //----------------------------------------------------------------------------- // Helper functions to convert between sRGB and 360 gamma space //----------------------------------------------------------------------------- -float SrgbGammaToLinear( float flSrgbGammaValue ) +float SrgbGammaToLinear(float flSrgbGammaValue) { - float x = clamp( flSrgbGammaValue, 0.0f, 1.0f ); - return ( x <= 0.04045f ) ? ( x / 12.92f ) : ( pow( ( x + 0.055f ) / 1.055f, 2.4f ) ); + float x = clamp(flSrgbGammaValue, 0.0f, 1.0f); + return (x <= 0.04045f) ? (x / 12.92f) : (pow((x + 0.055f) / 1.055f, 2.4f)); } -float SrgbLinearToGamma( float flLinearValue ) +float SrgbLinearToGamma(float flLinearValue) { - float x = clamp( flLinearValue, 0.0f, 1.0f ); - return ( x <= 0.0031308f ) ? ( x * 12.92f ) : ( 1.055f * pow( x, ( 1.0f / 2.4f ) ) ) - 0.055f; + float x = clamp(flLinearValue, 0.0f, 1.0f); + return (x <= 0.0031308f) ? 
(x * 12.92f) : (1.055f * pow(x, (1.0f / 2.4f))) - 0.055f; } -float X360GammaToLinear( float fl360GammaValue ) +float X360GammaToLinear(float fl360GammaValue) { float flLinearValue; - fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f ); - if ( fl360GammaValue < ( 96.0f / 255.0f ) ) + fl360GammaValue = clamp(fl360GammaValue, 0.0f, 1.0f); + if (fl360GammaValue < (96.0f / 255.0f)) { - if ( fl360GammaValue < ( 64.0f / 255.0f ) ) + if (fl360GammaValue < (64.0f / 255.0f)) { flLinearValue = fl360GammaValue * 255.0f; } else { - flLinearValue = fl360GammaValue * ( 255.0f * 2.0f ) - 64.0f; - flLinearValue += floor( flLinearValue * ( 1.0f / 512.0f ) ); + flLinearValue = fl360GammaValue * (255.0f * 2.0f) - 64.0f; + flLinearValue += floor(flLinearValue * (1.0f / 512.0f)); } } else { - if( fl360GammaValue < ( 192.0f / 255.0f ) ) + if (fl360GammaValue < (192.0f / 255.0f)) { - flLinearValue = fl360GammaValue * ( 255.0f * 4.0f ) - 256.0f; - flLinearValue += floor( flLinearValue * ( 1.0f / 256.0f ) ); + flLinearValue = fl360GammaValue * (255.0f * 4.0f) - 256.0f; + flLinearValue += floor(flLinearValue * (1.0f / 256.0f)); } else { - flLinearValue = fl360GammaValue * ( 255.0f * 8.0f ) - 1024.0f; - flLinearValue += floor( flLinearValue * ( 1.0f / 128.0f ) ); + flLinearValue = fl360GammaValue * (255.0f * 8.0f) - 1024.0f; + flLinearValue += floor(flLinearValue * (1.0f / 128.0f)); } } flLinearValue *= 1.0f / 1023.0f; - flLinearValue = clamp( flLinearValue, 0.0f, 1.0f ); + flLinearValue = clamp(flLinearValue, 0.0f, 1.0f); return flLinearValue; } -float X360LinearToGamma( float flLinearValue ) +float X360LinearToGamma(float flLinearValue) { float fl360GammaValue; - flLinearValue = clamp( flLinearValue, 0.0f, 1.0f ); - if ( flLinearValue < ( 128.0f / 1023.0f ) ) + flLinearValue = clamp(flLinearValue, 0.0f, 1.0f); + if (flLinearValue < (128.0f / 1023.0f)) { - if ( flLinearValue < ( 64.0f / 1023.0f ) ) + if (flLinearValue < (64.0f / 1023.0f)) { - fl360GammaValue = flLinearValue * ( 1023.0f * 
( 1.0f / 255.0f ) ); + fl360GammaValue = flLinearValue * (1023.0f * (1.0f / 255.0f)); } else { - fl360GammaValue = flLinearValue * ( ( 1023.0f / 2.0f ) * ( 1.0f / 255.0f ) ) + ( 32.0f / 255.0f ); + fl360GammaValue = flLinearValue * ((1023.0f / 2.0f) * (1.0f / 255.0f)) + (32.0f / 255.0f); } } else { - if ( flLinearValue < ( 512.0f / 1023.0f ) ) + if (flLinearValue < (512.0f / 1023.0f)) { - fl360GammaValue = flLinearValue * ( ( 1023.0f / 4.0f ) * ( 1.0f / 255.0f ) ) + ( 64.0f / 255.0f ); + fl360GammaValue = flLinearValue * ((1023.0f / 4.0f) * (1.0f / 255.0f)) + (64.0f / 255.0f); } else { - fl360GammaValue = flLinearValue * ( ( 1023.0f /8.0f ) * ( 1.0f / 255.0f ) ) + ( 128.0f /255.0f ); // 1.0 -> 1.0034313725490196078431372549016 - if ( fl360GammaValue > 1.0f ) + fl360GammaValue = flLinearValue * ((1023.0f / 8.0f) * (1.0f / 255.0f)) + (128.0f / 255.0f); // 1.0 -> 1.0034313725490196078431372549016 + if (fl360GammaValue > 1.0f) { fl360GammaValue = 1.0f; } } } - fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f ); + fl360GammaValue = clamp(fl360GammaValue, 0.0f, 1.0f); return fl360GammaValue; } -float SrgbGammaTo360Gamma( float flSrgbGammaValue ) +float SrgbGammaTo360Gamma(float flSrgbGammaValue) { - float flLinearValue = SrgbGammaToLinear( flSrgbGammaValue ); - float fl360GammaValue = X360LinearToGamma( flLinearValue ); + float flLinearValue = SrgbGammaToLinear(flSrgbGammaValue); + float fl360GammaValue = X360LinearToGamma(flLinearValue); return fl360GammaValue; } // convert texture to linear 0..1 value -float TextureToLinear( int c ) +float TextureToLinear(int c) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); if (c < 0) return 0; if (c > 255) @@ -417,11 +417,11 @@ float TextureToLinear( int c ) } // convert texture to linear 0..1 value -int LinearToTexture( float f ) +int LinearToTexture(float f) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); int i; - i = f * 1023; // assume 0..1 range + i = (int)(f * 1023); // assume 
0..1 range if (i < 0) i = 0; if (i > 1023) @@ -432,11 +432,11 @@ int LinearToTexture( float f ) // converts 0..1 linear value to screen gamma (0..255) -int LinearToScreenGamma( float f ) +int LinearToScreenGamma(float f) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); int i; - i = f * 1023; // assume 0..1 range + i = (int)(f * 1023); // assume 0..1 range if (i < 0) i = 0; if (i > 1023) @@ -445,30 +445,30 @@ int LinearToScreenGamma( float f ) return lineartoscreen[i]; } -void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector3D& out ) +void ColorRGBExp32ToVector(const ColorRGBExp32& in, Vector3D& out) { - Assert( s_bMathlibInitialized ); + Assert(s_bMathlibInitialized); // FIXME: Why is there a factor of 255 built into this? - out.x = 255.0f * TexLightToLinear( in.r, in.exponent ); - out.y = 255.0f * TexLightToLinear( in.g, in.exponent ); - out.z = 255.0f * TexLightToLinear( in.b, in.exponent ); + out.x = 255.0f * TexLightToLinear(in.r, in.exponent); + out.y = 255.0f * TexLightToLinear(in.g, in.exponent); + out.z = 255.0f * TexLightToLinear(in.b, in.exponent); } #if 0 // assumes that the desired mantissa range is 128..255 -static int VectorToColorRGBExp32_CalcExponent( float in ) +static int VectorToColorRGBExp32_CalcExponent(float in) { int power = 0; - - if( in != 0.0f ) + + if (in != 0.0f) { - while( in > 255.0f ) + while (in > 255.0f) { power += 1; in *= 0.5f; } - - while( in < 128.0f ) + + while (in < 128.0f) { power -= 1; in *= 2.0f; @@ -478,51 +478,51 @@ static int VectorToColorRGBExp32_CalcExponent( float in ) return power; } -void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c ) +void VectorToColorRGBExp32(const Vector3D& vin, ColorRGBExp32& c) { - Vector v = vin; - Assert( s_bMathlibInitialized ); - Assert( v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f ); - int i; - float max = v[0]; - for( i = 1; i < 3; i++ ) + Vector3D v = vin; + Assert(s_bMathlibInitialized); + Assert(v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f); + int i; 
+ float max = v[0]; + for (i = 1; i < 3; i++) { // Get the maximum value. - if( v[i] > max ) + if (v[i] > max) { max = v[i]; } } - + // figure out the exponent for this luxel. - int exponent = VectorToColorRGBExp32_CalcExponent( max ); - + int exponent = VectorToColorRGBExp32_CalcExponent(max); + // make the exponent fits into a signed byte. - if( exponent < -128 ) + if (exponent < -128) { exponent = -128; } - else if( exponent > 127 ) + else if (exponent > 127) { exponent = 127; } - + // undone: optimize with a table - float scalar = pow( 2.0f, -exponent ); + float scalar = pow(2.0f, -exponent); // convert to mantissa x 2^exponent format - for( i = 0; i < 3; i++ ) + for (i = 0; i < 3; i++) { v[i] *= scalar; // clamp - if( v[i] > 255.0f ) + if (v[i] > 255.0f) { v[i] = 255.0f; } } - c.r = ( unsigned char )v[0]; - c.g = ( unsigned char )v[1]; - c.b = ( unsigned char )v[2]; - c.exponent = ( signed char )exponent; + c.r = (unsigned char)v[0]; + c.g = (unsigned char)v[1]; + c.b = (unsigned char)v[2]; + c.exponent = (signed char)exponent; } #else @@ -531,7 +531,7 @@ void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c ) // for f' = f * 2^e, f is on [128..255]. // Uses IEEE 754 representation to directly extract this information // from the float. 
-inline static int VectorToColorRGBExp32_CalcExponent( const float *pin ) +inline static int VectorToColorRGBExp32_CalcExponent(const float* pin) { // The thing we will take advantage of here is that the exponent component // is stored in the float itself, and because we want to map to 128..255, we @@ -542,12 +542,12 @@ inline static int VectorToColorRGBExp32_CalcExponent( const float *pin ) if (*pin == 0.0f) return 0; - unsigned int fbits = *reinterpret_cast(pin); - + unsigned int fbits = *reinterpret_cast(pin); + // the exponent component is bits 23..30, and biased by +127 const unsigned int biasedSeven = 7 + 127; - signed int expComponent = ( fbits & 0x7F800000 ) >> 23; + signed int expComponent = (fbits & 0x7F800000) >> 23; expComponent -= biasedSeven; // now the difference from seven (positive if was less than, etc) return expComponent; } @@ -561,15 +561,15 @@ inline static int VectorToColorRGBExp32_CalcExponent( const float *pin ) /// moving it onto the cell. /// \warning: Assumes an IEEE 754 single-precision float representation! Those of you /// porting to an 8080 are out of luck. -void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) +void VectorToColorRGBExp32(const Vector3D& vin, ColorRGBExp32& c) { - Assert( s_bMathlibInitialized ); - Assert( vin.x >= 0.0f && vin.y >= 0.0f && vin.z >= 0.0f ); + Assert(s_bMathlibInitialized); + Assert(vin.x >= 0.0f && vin.y >= 0.0f && vin.z >= 0.0f); // work out which of the channels is the largest ( we will use that to map the exponent ) // this is a sluggish branch-based decision tree -- most architectures will offer a [max] // assembly opcode to do this faster. - const float *pMax; + const float* pMax; if (vin.x > vin.y) { if (vin.x > vin.z) @@ -594,7 +594,7 @@ void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) } // now work out the exponent for this luxel. 
- signed int exponent = VectorToColorRGBExp32_CalcExponent( pMax ); + signed int exponent = VectorToColorRGBExp32_CalcExponent(pMax); // make sure the exponent fits into a signed byte. // (in single precision format this is assured because it was a signed byte to begin with) @@ -604,20 +604,20 @@ void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) float scalar; { unsigned int fbits = (127 - exponent) << 23; - scalar = *reinterpret_cast(&fbits); + scalar = *reinterpret_cast(&fbits); } - // We can totally wind up above 255 and that's okay--but above 256 would be right out. - Assert(vin.x * scalar < 256.0f && - vin.y * scalar < 256.0f && - vin.z * scalar < 256.0f); + // we should never need to clamp: + Assert(vin.x * scalar <= 255.0f && + vin.y * scalar <= 255.0f && + vin.z * scalar <= 255.0f); // This awful construction is necessary to prevent VC2005 from using the // fldcw/fnstcw control words around every float-to-unsigned-char operation. { - int red = (vin.x * scalar); - int green = (vin.y * scalar); - int blue = (vin.z * scalar); + int red = (int)(vin.x * scalar); + int green = (int)(vin.y * scalar); + int blue = (int)(vin.z * scalar); c.r = red; c.g = green; @@ -629,7 +629,7 @@ void VectorToColorRGBExp32( const Vector3D& vin, ColorRGBExp32 &c ) c.b = ( unsigned char )(vin.z * scalar); */ - c.exponent = ( signed char )exponent; + c.exponent = (signed char)exponent; } -#endif \ No newline at end of file +#endif diff --git a/r5dev/mathlib/fltx4.h b/r5dev/mathlib/fltx4.h new file mode 100644 index 00000000..b091ac56 --- /dev/null +++ b/r5dev/mathlib/fltx4.h @@ -0,0 +1,107 @@ +//===== Copyright 1996-2010, Valve Corporation, All rights reserved. ======// +// +// Purpose: - defines the type fltx4 - Avoid cyclic includion. 
+// +//===========================================================================// + +#ifndef FLTX4_H +#define FLTX4_H + +#if defined(GNUC) +#define USE_STDC_FOR_SIMD 0 +#else +#define USE_STDC_FOR_SIMD 0 +#endif + +#if (!defined(PLATFORM_PPC) && (USE_STDC_FOR_SIMD == 0)) +#define _SSE1 1 +#endif + +// I thought about defining a class/union for the SIMD packed floats instead of using fltx4, +// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur +// the relationship between packed floats and packed integer types and (b) not sure that the +// compiler would handle generating good code for the intrinsics. + +#if USE_STDC_FOR_SIMD +#error "hello" +typedef union +{ + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4; + +typedef fltx4 i32x4; +typedef fltx4 u32x4; + +#ifdef _PS3 +typedef fltx4 u32x4; +typedef fltx4 i32x4; +#endif +typedef fltx4 bi32x4; + +#elif ( defined( _PS3 ) ) + +typedef union +{ + // This union allows float/int access (which generally shouldn't be done in inner loops) + + vec_float4 vmxf; + vec_int4 vmxi; + vec_uint4 vmxui; +#if defined(__SPU__) + vec_uint4 vmxbi; +#else + __vector bool vmxbi; +#endif + + struct + { + float x; + float y; + float z; + float w; + }; + + float m128_f32[4]; + uint32 m128_u32[4]; + int32 m128_i32[4]; + +} fltx4_union; + +typedef vec_float4 fltx4; +typedef vec_uint4 u32x4; +typedef vec_int4 i32x4; + +#if defined(__SPU__) +typedef vec_uint4 bi32x4; +#else +typedef __vector bool bi32x4; +#endif + +#define DIFFERENT_NATIVE_VECTOR_TYPES // true if the compiler has different types for float4, uint4, int4, etc + +#elif ( defined( _X360 ) ) + +typedef union +{ + // This union allows float/int access (which generally shouldn't be done in inner loops) + __vector4 vmx; + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4_union; + +typedef __vector4 fltx4; +typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops. 
+typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops. +typedef fltx4 bi32x4; +#else + +typedef __m128 fltx4; +typedef __m128 i32x4; +typedef __m128 u32x4; +typedef __m128i shortx8; +typedef fltx4 bi32x4; + +#endif + +#endif diff --git a/r5dev/mathlib/math_pfns.h b/r5dev/mathlib/math_pfns.h index cdf7ccd2..268fe37c 100644 --- a/r5dev/mathlib/math_pfns.h +++ b/r5dev/mathlib/math_pfns.h @@ -9,11 +9,36 @@ #include +// YUP_ACTIVE is from Source2. It's (obviously) not supported on this branch, just including it here to help merge camera.cpp/.h and the CSM shadow code. +//#define YUP_ACTIVE 1 + +enum MatrixAxisType_t +{ +#ifdef YUP_ACTIVE + FORWARD_AXIS = 2, + LEFT_AXIS = 0, + UP_AXIS = 1, +#else + FORWARD_AXIS = 0, + LEFT_AXIS = 1, + UP_AXIS = 2, +#endif + + X_AXIS = 0, + Y_AXIS = 1, + Z_AXIS = 2, + ORIGIN = 3, + PROJECTIVE = 3, +}; + #if defined( _X360 ) #include #elif defined(_PS3) -#ifndef SPU +#ifdef SPU +#include +#include +#else #include #endif @@ -53,17 +78,19 @@ #include + + // These globals are initialized by mathlib and redirected based on available fpu features // The following are not declared as macros because they are often used in limiting situations, // and sometimes the compiler simply refuses to inline them for some reason -FORCEINLINE float FastSqrt(float x) +FORCEINLINE float VECTORCALL FastSqrt(float x) { __m128 root = _mm_sqrt_ss(_mm_load_ss(&x)); return *(reinterpret_cast(&root)); } -FORCEINLINE float FastRSqrtFast(float x) +FORCEINLINE float VECTORCALL FastRSqrtFast(float x) { // use intrinsics __m128 rroot = _mm_rsqrt_ss(_mm_load_ss(&x)); @@ -72,7 +99,7 @@ FORCEINLINE float FastRSqrtFast(float x) // Single iteration NewtonRaphson reciprocal square root: // 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) // Very low error, and fine to use in place of 1.f / sqrtf(x). 
-FORCEINLINE float FastRSqrt(float x) +FORCEINLINE float VECTORCALL FastRSqrt(float x) { float rroot = FastRSqrtFast(x); return (0.5f * rroot) * (3.f - (x * rroot) * rroot); @@ -136,6 +163,7 @@ inline double FastSqrtEst(double x) { return __frsqrte(x) * x; } #endif // !defined( PLATFORM_PPC ) && !defined(_SPU) + // if x is infinite, return FLT_MAX inline float FastClampInfinity(float x) { @@ -146,7 +174,19 @@ inline float FastClampInfinity(float x) #endif } -#if defined (_PS3) && !defined(SPU) +#if defined (_PS3) + +#if defined(__SPU__) + +inline int _rotl(int a, int count) +{ + vector signed int vi; + vi = spu_promote(a, 0); + vi = spu_rl(vi, count); + return spu_extract(vi, 0); +} + +#else // extern float cosvf(float); /* single precision cosine */ // extern float sinvf(float); /* single precision sine */ @@ -164,63 +204,6 @@ inline int64 _rotl64(int64 x, int c) return __rldicl(x, c, 0); } -//----------------------------------------------------------------- -// Vector Unions -//----------------------------------------------------------------- - -//----------------------------------------------------------------- -// Floats -//----------------------------------------------------------------- -typedef union -{ - vector float vf; - float f[4]; -} vector_float_union; - -//----------------------------------------------------------------- -// Ints -//----------------------------------------------------------------- -typedef union -{ - vector int vi; - int i[4]; -} vector_int4_union; - -typedef union -{ - vector unsigned int vui; - unsigned int ui[4]; -} vector_uint4_union; - -//----------------------------------------------------------------- -// Shorts -//----------------------------------------------------------------- -typedef union -{ - vector signed short vs; - signed short s[8]; -} vector_short8_union; - -typedef union -{ - vector unsigned short vus; - unsigned short us[8]; -} vector_ushort8_union; - 
-//----------------------------------------------------------------- -// Chars -//----------------------------------------------------------------- -typedef union -{ - vector signed char vc; - signed char c[16]; -} vector_char16_union; - -typedef union -{ - vector unsigned char vuc; - unsigned char uc[16]; -} vector_uchar16_union; /* FORCEINLINE float _VMX_Sqrt( float x ) @@ -277,6 +260,95 @@ FORCEINLINE float _VMX_Cos(float a) #define FastSinCos(x,s,c) _VMX_SinCos(x,s,c) #define FastCos(x) _VMX_Cos(x) */ + +#endif + + +#if defined(__SPU__) + +// do we need these optimized yet? + +FORCEINLINE float FastSqrt(float x) +{ + return sqrtf(x); +} + +FORCEINLINE float FastRSqrt(float x) +{ + float rroot = 1.f / (sqrtf(x) + FLT_EPSILON); + return rroot; +} + + +#define FastRSqrtFast(x) FastRSqrt(x) + + +#endif + + + +//----------------------------------------------------------------- +// Vector Unions +//----------------------------------------------------------------- + +//----------------------------------------------------------------- +// Floats +//----------------------------------------------------------------- +typedef union +{ + vector float vf; + float f[4]; +} vector_float_union; + +#if !defined(__SPU__) +//----------------------------------------------------------------- +// Ints +//----------------------------------------------------------------- +typedef union +{ + vector int vi; + int i[4]; +} vector_int4_union; + +typedef union +{ + vector unsigned int vui; + unsigned int ui[4]; +} vector_uint4_union; + +//----------------------------------------------------------------- +// Shorts +//----------------------------------------------------------------- +typedef union +{ + vector signed short vs; + signed short s[8]; +} vector_short8_union; + +typedef union +{ + vector unsigned short vus; + unsigned short us[8]; +} vector_ushort8_union; + +//----------------------------------------------------------------- +// Chars 
+//----------------------------------------------------------------- +typedef union +{ + vector signed char vc; + signed char c[16]; +} vector_char16_union; + +typedef union +{ + vector unsigned char vuc; + unsigned char uc[16]; +} vector_uchar16_union; +#endif + + + #endif // _PS3 #endif // #ifndef SPU diff --git a/r5dev/mathlib/mathlib.h b/r5dev/mathlib/mathlib.h index 0a9ff11f..ad11b2f6 100644 --- a/r5dev/mathlib/mathlib.h +++ b/r5dev/mathlib/mathlib.h @@ -11,9 +11,8 @@ #include "mathlib/vector.h" #include "mathlib/vector2d.h" #include "tier0/dbg.h" - #include "mathlib/math_pfns.h" -#include "mathlib/bits.h" +#include "mathlib/fltx4.h" #ifndef ALIGN8_POST #define ALIGN8_POST @@ -21,68 +20,19 @@ #if defined(_PS3) +#if defined(__SPU__) +#include +#include +#include +#else #include #include -#include +#include +#endif +#include #endif -// -// Returns a clamped value in the range [min, max]. -// -template< class T > -inline T clamp(T const& val, T const& minVal, T const& maxVal) -{ - if (maxVal < minVal) - return maxVal; - else if (val < minVal) - return minVal; - else if (val > maxVal) - return maxVal; - else - return val; -} -#define fsel(c,x,y) ( (c) >= 0 ? (x) : (y) ) - -// integer conditional move -// if a >= 0, return x, else y -#define isel(a,x,y) ( ((a) >= 0) ? (x) : (y) ) - -// if x = y, return a, else b -#define ieqsel(x,y,a,b) (( (x) == (y) ) ? (a) : (b)) - -// if the nth bit of a is set (counting with 0 = LSB), -// return x, else y -// this is fast if nbit is a compile-time immediate -#define ibitsel(a, nbit, x, y) ( ( ((a) & (1 << (nbit))) != 0 ) ? (x) : (y) ) - - -FORCEINLINE double fpmin(double a, double b) -{ - return a > b ? b : a; -} - -FORCEINLINE double fpmax(double a, double b) -{ - return a >= b ? a : b; -} - -// clamp x to lie inside [a,b]. Assumes b>a -FORCEINLINE float fclamp(float x, float a, float b) -{ - return fpmin(fpmax(x, a), b); -} -// clamp x to lie inside [a,b]. 
Assumes b>a -FORCEINLINE double fclamp(double x, double a, double b) -{ - return fpmin(fpmax(x, a), b); -} - -// At some point, we will need a unified API. -#define imin( x, y ) ( (x) < (y) ? (x) : (y) ) -#define imax( x, y ) ( (x) > (y) ? (x) : (y) ) -#define iclamp clamp - // plane_t structure // !!! if this is changed, it must be changed in asm code too !!! // FIXME: does the asm code even exist anymore? @@ -95,7 +45,7 @@ struct cplane_t byte signbits; // signx + (signy<<1) + (signz<<1) byte pad[2]; -#ifdef Vector_NO_SLOW_OPERATIONS +#ifdef VECTOR_NO_SLOW_OPERATIONS cplane_t() {} private: @@ -142,26 +92,7 @@ enum }; extern int SignbitsForPlane(cplane_t* out); - -class Frustum_t -{ -public: - void SetPlane(int i, int nType, const Vector3D& vecNormal, float dist) - { - m_Plane[i].normal = vecNormal; - m_Plane[i].dist = dist; - m_Plane[i].type = nType; - m_Plane[i].signbits = SignbitsForPlane(&m_Plane[i]); - m_AbsNormal[i].Init(fabs(vecNormal.x), fabs(vecNormal.y), fabs(vecNormal.z)); - } - - inline const cplane_t* GetPlane(int i) const { return &m_Plane[i]; } - inline const Vector3D& GetAbsNormal(int i) const { return m_AbsNormal[i]; } - -private: - cplane_t m_Plane[FRUSTUM_NUMPLANES]; - Vector3D m_AbsNormal[FRUSTUM_NUMPLANES]; -}; +class Frustum_t; // Computes Y fov from an X fov and a screen aspect ratio + X from Y float CalcFovY(float flFovX, float flScreenAspect); @@ -171,12 +102,13 @@ float CalcFovX(float flFovY, float flScreenAspect); // NOTE: FOV is specified in degrees, as the *full* view angle (not half-angle) class VPlane; void GeneratePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t& frustum); -void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, float flFovX, float flFovY, Frustum_t& frustum); +void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& 
forward, const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, float flFovX, float flFovY, VPlane* pPlanesOut); // Cull the world-space bounding box to the specified frustum. -// bool R_CullBox( const Vector3D& mins, const Vector3D& maxs, const Frustum_t &frustum ); -// bool R_CullBoxSkipNear( const Vector3D& mins, const Vector3D& maxs, const Frustum_t &frustum ); +// bool R_CullBox( const Vector& mins, const Vector& maxs, const Frustum_t &frustum ); +// bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t &frustum ); void GenerateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar, VPlane* pPlanesOut); +class CTransform; class matrix3x4a_t; struct matrix3x4_t @@ -192,6 +124,14 @@ struct matrix3x4_t m_flMatVal[2][0] = m20; m_flMatVal[2][1] = m21; m_flMatVal[2][2] = m22; m_flMatVal[2][3] = m23; } + /// Creates a matrix where the X axis = forward the Y axis = left, and the Z axis = up + void InitXYZ(const Vector3D& xAxis, const Vector3D& yAxis, const Vector3D& zAxis, const Vector3D& vecOrigin) + { + m_flMatVal[0][0] = xAxis.x; m_flMatVal[0][1] = yAxis.x; m_flMatVal[0][2] = zAxis.x; m_flMatVal[0][3] = vecOrigin.x; + m_flMatVal[1][0] = xAxis.y; m_flMatVal[1][1] = yAxis.y; m_flMatVal[1][2] = zAxis.y; m_flMatVal[1][3] = vecOrigin.y; + m_flMatVal[2][0] = xAxis.z; m_flMatVal[2][1] = yAxis.z; m_flMatVal[2][2] = zAxis.z; m_flMatVal[2][3] = vecOrigin.z; + } + //----------------------------------------------------------------------------- // Creates a matrix where the X axis = forward // the Y axis = left, and the Z axis = up @@ -212,6 +152,27 @@ struct matrix3x4_t Init(xAxis, yAxis, zAxis, vecOrigin); } + inline void InitFromQAngles(const QAngle& angles, const Vector3D& vPosition); + inline void InitFromQAngles(const QAngle& angles); + inline void InitFromRadianEuler(const RadianEuler& angles, 
const Vector3D& vPosition); + inline void InitFromRadianEuler(const RadianEuler& angles); + inline void InitFromCTransform(const CTransform& transform); + inline void InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition); + inline void InitFromQuaternion(const Quaternion& orientation); + inline void InitFromDiagonal(const Vector3D& vDiagonal); + + inline Quaternion ToQuaternion() const; + inline QAngle ToQAngle() const; + inline CTransform ToCTransform() const; + + inline void SetToIdentity(); + + /// multiply the scale/rot part of the matrix by a constant. This doesn't init the matrix , + /// just scale in place. So if you want to construct a scaling matrix, init to identity and + /// then call this. + FORCEINLINE void ScaleUpper3x3Matrix(float flScale); + + /// modify the origin inline void SetOrigin(Vector3D const& p) { m_flMatVal[0][3] = p.x; @@ -219,6 +180,13 @@ struct matrix3x4_t m_flMatVal[2][3] = p.z; } + /// return the origin + inline Vector3D GetOrigin(void) const + { + Vector3D vecRet(m_flMatVal[0][3], m_flMatVal[1][3], m_flMatVal[2][3]); + return vecRet; + } + inline void Invalidate(void) { for (int i = 0; i < 3; i++) @@ -230,6 +198,60 @@ struct matrix3x4_t } } + /// check all components for invalid floating point values + inline bool IsValid(void) const + { + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 4; j++) + { + if (!IsFinite(m_flMatVal[i][j])) + return false; + } + } + return true; + } + + bool operator==(const matrix3x4_t& other) const + { + return memcmp(this, &other, sizeof(matrix3x4_t)) == 0; + } + + bool operator!=(const matrix3x4_t& other) const + { + return memcmp(this, &other, sizeof(matrix3x4_t)) != 0; + } + + inline bool IsEqualTo(const matrix3x4_t& other, float flTolerance = 1e-5f) const; + + inline void GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) const; + inline Vector3D TransformVector(const Vector3D& v0) const; + inline Vector3D RotateVector(const Vector3D& v0) const; + 
inline Vector3D TransformVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateExtents(const Vector3D& vBoxExtents) const; // these are extents and must remain positive/symmetric after rotation + inline void TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void TransformPlane(const cplane_t& inPlane, cplane_t& outPlane) const; + inline void TransformPlaneByInverse(const cplane_t& inPlane, cplane_t& outPlane) const; + inline float GetOrthogonalityError() const; + inline float GetDeterminant()const; + inline float GetSylvestersCriterion()const; // for symmetrical matrices only: should be >0 iff it's a positive definite matrix + + inline Vector3D GetColumn(MatrixAxisType_t nColumn) const; + inline void SetColumn(const Vector3D& vColumn, MatrixAxisType_t nColumn); + inline Vector3D GetForward() const { return GetColumn(FORWARD_AXIS); } + inline Vector3D GetLeft() const { return GetColumn(LEFT_AXIS); } + inline Vector3D GetUp() const { return GetColumn(UP_AXIS); } + inline Vector3D GetRow(int nRow) const { return *(Vector3D*)(m_flMatVal[nRow]); } + inline void SetRow(int nRow, const Vector3D& vRow) { m_flMatVal[nRow][0] = vRow.x; m_flMatVal[nRow][1] = vRow.y; m_flMatVal[nRow][2] = vRow.z; } + + inline void InverseTR(matrix3x4_t& out) const; + inline matrix3x4_t InverseTR() const; + + float* operator[](int i) { Assert((i >= 0) && (i < 3)); return m_flMatVal[i]; } const float* operator[](int i) const { Assert((i 
>= 0) && (i < 3)); return m_flMatVal[i]; } float* Base() { return &m_flMatVal[0][0]; } @@ -244,14 +266,50 @@ public: /* matrix3x4a_t() { if (((size_t)Base()) % 16 != 0) { Error( "matrix3x4a_t missaligned" ); } } */ + matrix3x4a_t(const matrix3x4_t& src) { *this = src; }; matrix3x4a_t& operator=(const matrix3x4_t& src) { memcpy(Base(), src.Base(), sizeof(float) * 3 * 4); return *this; }; + + matrix3x4a_t( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) + { + AssertDbg(((size_t)Base() & 0xf) == 0); + m_flMatVal[0][0] = m00; m_flMatVal[0][1] = m01; m_flMatVal[0][2] = m02; m_flMatVal[0][3] = m03; + m_flMatVal[1][0] = m10; m_flMatVal[1][1] = m11; m_flMatVal[1][2] = m12; m_flMatVal[1][3] = m13; + m_flMatVal[2][0] = m20; m_flMatVal[2][1] = m21; m_flMatVal[2][2] = m22; m_flMatVal[2][3] = m23; + } + matrix3x4a_t() {} + + static FORCEINLINE bool TypeIsAlignedForSIMD(void) { return true; } + + + // raw data simd accessor + FORCEINLINE fltx4& SIMDRow(uint nIdx) { AssertDbg(nIdx < 3); return *((fltx4*)(&(m_flMatVal[nIdx]))); } + FORCEINLINE const fltx4& SIMDRow(uint nIdx) const { AssertDbg(nIdx < 3); return *((const fltx4*)(&(m_flMatVal[nIdx]))); } + } ALIGN16_POST; + +FORCEINLINE void matrix3x4_t::ScaleUpper3x3Matrix(float flScale) +{ + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + m_flMatVal[i][j] *= flScale; + } + } +} + + #ifndef M_PI #define M_PI 3.14159265358979323846 // matches value in gcc v2 math.h #endif -#define M_PI_F ((float)(M_PI)) // Shouldn't collide with anything. +#ifndef M_PI_F +#define M_PI_F ((float)(M_PI)) +#endif // NJS: Inlined to prevent floats from being autopromoted to doubles, as with the old system. 
#ifndef RAD2DEG @@ -282,6 +340,7 @@ enum Sides extern bool s_bMathlibInitialized; +extern const matrix3x4a_t g_MatrixIdentity; extern const Vector3D vec3_origin; extern const QAngle vec3_angle; extern const Quaternion quat_identity; @@ -359,7 +418,7 @@ inline void VectorNegate(vec_t* a) // NJS: Some functions in VBSP still need to use these for dealing with mixing vec4's and shorts with vec_t's. // remove when no longer needed. -#define Vector_COPY( A, B ) do { (B)[0] = (A)[0]; (B)[1] = (A)[1]; (B)[2]=(A)[2]; } while(0) +#define VECTOR_COPY( A, B ) do { (B)[0] = (A)[0]; (B)[1] = (A)[1]; (B)[2]=(A)[2]; } while(0) #define DOT_PRODUCT( A, B ) ( (A)[0]*(B)[0] + (A)[1]*(B)[1] + (A)[2]*(B)[2] ) FORCEINLINE void VectorMAInline(const float* start, float scale, const float* direction, float* dest) @@ -396,6 +455,21 @@ inline float VectorLength(const float* v) void CrossProduct(const float* v1, const float* v2, float* cross); +inline float CrossProductX(const Vector3D& v1, const Vector3D& v2) +{ + return v1.y * v2.z - v1.z * v2.y; +} + +inline float CrossProductY(const Vector3D& v1, const Vector3D& v2) +{ + return v1.z * v2.x - v1.x * v2.z; +} + +inline float CrossProductZ(const Vector3D& v1, const Vector3D& v2) +{ + return v1.x * v2.y - v1.y * v2.x; +} + qboolean VectorsEqual(const float* v1, const float* v2); inline vec_t RoundInt(vec_t in) @@ -403,7 +477,7 @@ inline vec_t RoundInt(vec_t in) return floor(in + 0.5f); } -int Q_log2(int val); +size_t Q_log2(unsigned int val); // Math routines done in optimized assembly math package routines void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine) @@ -412,8 +486,8 @@ void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine) XMScalarSinCos(sine, cosine, radians); #elif defined( _PS3 ) #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) - Vector_float_union s; - Vector_float_union c; + vector_float_union s; + vector_float_union c; vec_float4 rad = 
vec_splats(radians); vec_float4 sin; @@ -427,9 +501,9 @@ void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine) *sine = s.f[0]; *cosine = c.f[0]; #else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 - Vector_float_union r; - Vector_float_union s; - Vector_float_union c; + vector_float_union r; + vector_float_union s; + vector_float_union c; vec_float4 rad; vec_float4 sin; @@ -476,6 +550,10 @@ extern float SinCosTable[SIN_TABLE_SIZE]; inline float TableCos(float theta) { +#if defined( LINUX ) + return cos(theta); // under the GCC compiler the float-represented-as-an-int causes an internal compiler error +#else + union { int i; @@ -485,10 +563,14 @@ inline float TableCos(float theta) // ideally, the following should compile down to: theta * constant + constant, changing any of these constants from defines sometimes fubars this. ftmp.f = theta * (float)(SIN_TABLE_SIZE / (2.0f * M_PI)) + (FTOIBIAS + (SIN_TABLE_SIZE / 4)); return SinCosTable[ftmp.i & (SIN_TABLE_SIZE - 1)]; +#endif } inline float TableSin(float theta) { +#if defined( LINUX ) + return sin(theta); // under the GCC compiler the float-represented-as-an-int causes an internal compiler error +#else union { int i; @@ -498,6 +580,7 @@ inline float TableSin(float theta) // ideally, the following should compile down to: theta * constant + constant ftmp.f = theta * (float)(SIN_TABLE_SIZE / (2.0f * M_PI)) + FTOIBIAS; return SinCosTable[ftmp.i & (SIN_TABLE_SIZE - 1)]; +#endif } template @@ -551,16 +634,25 @@ enum ROLL // fall over }; +void MatrixVectorsFLU(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); void MatrixAngles(const matrix3x4_t& matrix, float* angles); // !!!! 
void MatrixVectors(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pRight, Vector3D* pUp); -void VectorTransform(const float* in1, const matrix3x4_t& in2, float* out); +void VectorTransform(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out); void VectorITransform(const float* in1, const matrix3x4_t& in2, float* out); -void VectorRotate(const float* in1, const matrix3x4_t& in2, float* out); +void VectorRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out); void VectorRotate(const Vector3D& in1, const QAngle& in2, Vector3D& out); void VectorRotate(const Vector3D& in1, const Quaternion& in2, Vector3D& out); -void VectorIRotate(const float* in1, const matrix3x4_t& in2, float* out); +void VectorIRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out); -#ifndef Vector_NO_SLOW_OPERATIONS +inline const Vector3D VectorRotate(const Vector3D& vIn1, const Quaternion& qIn2) +{ + Vector3D out; + VectorRotate(vIn1, qIn2, out); + return out; +} + + +#ifndef VECTOR_NO_SLOW_OPERATIONS QAngle TransformAnglesToLocalSpace(const QAngle& angles, const matrix3x4_t& parentMatrix); QAngle TransformAnglesToWorldSpace(const QAngle& angles, const matrix3x4_t& parentMatrix); @@ -581,7 +673,7 @@ void MatrixSetColumn(const Vector3D& in, int column, matrix3x4_t& out); void ConcatRotations(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out); void ConcatTransforms(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out); // faster version assumes m0, m1, out are 16-byte aligned addresses -void ConcatTransforms_Aligned(const matrix3x4_t& m0, const matrix3x4_t& m1, matrix3x4_t& out); +void ConcatTransforms_Aligned(const matrix3x4a_t& m0, const matrix3x4a_t& m1, matrix3x4a_t& out); // For identical interface w/ VMatrix inline void MatrixMultiply(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out) @@ -605,20 +697,242 @@ float QuaternionDotProduct(const Quaternion& p, const Quaternion& q); 
void QuaternionConjugate(const Quaternion& p, Quaternion& q); void QuaternionInvert(const Quaternion& p, Quaternion& q); float QuaternionNormalize(Quaternion& q); +void QuaternionMultiply(const Quaternion& q, const Vector3D& v, Vector3D& result); void QuaternionAdd(const Quaternion& p, const Quaternion& q, Quaternion& qt); void QuaternionMult(const Quaternion& p, const Quaternion& q, Quaternion& qt); void QuaternionMatrix(const Quaternion& q, matrix3x4_t& matrix); void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, matrix3x4_t& matrix); +void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, const Vector3D& vScale, matrix3x4_t& mat); void QuaternionAngles(const Quaternion& q, QAngle& angles); void AngleQuaternion(const QAngle& angles, Quaternion& qt); void QuaternionAngles(const Quaternion& q, RadianEuler& angles); +void QuaternionVectorsFLU(Quaternion const& q, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); +void QuaternionVectorsForward(const Quaternion& q, Vector3D* pForward); void AngleQuaternion(RadianEuler const& angles, Quaternion& qt); void QuaternionAxisAngle(const Quaternion& q, Vector3D& axis, float& angle); void AxisAngleQuaternion(const Vector3D& axis, float angle, Quaternion& q); void BasisToQuaternion(const Vector3D& vecForward, const Vector3D& vecRight, const Vector3D& vecUp, Quaternion& q); void MatrixQuaternion(const matrix3x4_t& mat, Quaternion& q); -// A couple methods to find the dot product of a Vector3D with a matrix row or column... 
+ +void MatrixQuaternionFast(const matrix3x4_t& mat, Quaternion& q); +void MatrixPosition(const matrix3x4_t& matrix, Vector3D& position); +Vector3D MatrixNormalize(const matrix3x4_t& in, matrix3x4_t& out); + +inline void MatrixQuaternion(const matrix3x4_t& mat, Quaternion& q, Vector3D& o) +{ + MatrixQuaternion(mat, q); + MatrixPosition(mat, o); +} + + + +float MatrixQuaternionTest(uint); +float MatrixQuaternionTest2(uint); + +/// qt = p + s * q +void QuaternionAccumulate(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt); + +/// qt = ( s * p ) * q +void QuaternionSM(float s, const Quaternion& p, const Quaternion& q, Quaternion& qt); + +/// qt = p * ( s * q ) +void QuaternionMA(const Quaternion& p, float s, const Quaternion& q, Quaternion& qt); + +/* +//----------------------------------------------------------------------------- +// Quaternion equality with tolerance +//----------------------------------------------------------------------------- +inline bool QuaternionsAreEqualInternal( const Quaternion& src1, const Quaternion& src2, float flTolerance ) +{ + if ( !FloatsAreEqual( src1.x, src2.x, flTolerance ) ) + return false; + + if ( !FloatsAreEqual( src1.y, src2.y, flTolerance ) ) + return false; + + if ( !FloatsAreEqual( src1.z, src2.z, flTolerance ) ) + return false; + + return FloatsAreEqual( src1.w, src2.w, flTolerance ); +} + +inline bool QuaternionsAreEqual( const Quaternion& src1, const Quaternion& src2, float flTolerance ) +{ + if ( QuaternionsAreEqualInternal( src1, src2, flTolerance ) ) + return true; + + // negated quaternions are also 'equal' + Quaternion src2neg( -src2.x, -src2.y, -src2.z, -src2.w ); + return QuaternionsAreEqualInternal( src1, src2neg, flTolerance ); +} +*/ +inline const Quaternion GetNormalized(const Quaternion& q) +{ + float flInv = 1.0f / sqrtf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w); + return Quaternion(q.x * flInv, q.y * flInv, q.z * flInv, q.w * flInv); +} + +inline const Quaternion 
AngleQuaternion(const QAngle& angles) +{ + Quaternion qt; + AngleQuaternion(angles, qt); + return qt; +} + + +inline const Quaternion AngleQuaternion(RadianEuler const& angles) +{ + Quaternion qt; + AngleQuaternion(angles, qt); + return qt; +} + + + +inline Quaternion QuaternionFromPitchYawRoll(float flPitch, float flYaw, float flRoll) +{ + QAngle ang(flPitch, flYaw, flRoll); + + Quaternion q; + AngleQuaternion(ang, q); + return q; +} + +inline Quaternion QuaternionAddPitch(const Quaternion& q, float flPitch) +{ + // FIXME: I know this can be made *tons* faster, but I just want to get something working quickly + // that matches being able to add to the pitch of a QAngles so I can expose Quats to script/game code + QAngle ang; + QuaternionAngles(q, ang); + ang[PITCH] += flPitch; + + Quaternion res; + AngleQuaternion(ang, res); + return res; +} + +inline Quaternion QuaternionAddYaw(const Quaternion& q, float flYaw) +{ + // FIXME: I know this can be made *tons* faster, but I just want to get something working quickly + // that matches being able to add to the yaw of a QAngles so I can expose Quats to script/game code + QAngle ang; + QuaternionAngles(q, ang); + ang[YAW] += flYaw; + + Quaternion res; + AngleQuaternion(ang, res); + return res; +} + +inline Quaternion QuaternionAddRoll(const Quaternion& q, float flRoll) +{ + // FIXME: I know this can be made *tons* faster, but I just want to get something working quickly + // that matches being able to add to the roll of a QAngles so I can expose Quats to script/game code + QAngle ang; + QuaternionAngles(q, ang); + ang[ROLL] += flRoll; + + Quaternion res; + AngleQuaternion(ang, res); + return res; +} + +inline const Quaternion MatrixQuaternion(const matrix3x4_t& mat) +{ + Quaternion tmp; + MatrixQuaternion(mat, tmp); + return tmp; +} + +inline const Quaternion MatrixQuaternionFast(const matrix3x4_t& mat) +{ + Quaternion tmp; + MatrixQuaternionFast(mat, tmp); + return tmp; +} + +inline const matrix3x4_t 
QuaternionMatrix(const Quaternion& q) +{ + matrix3x4_t mat; + QuaternionMatrix(q, mat); + return mat; +} + +inline const matrix3x4_t QuaternionMatrix(const Quaternion& q, const Vector3D& pos) +{ + matrix3x4_t mat; + QuaternionMatrix(q, pos, mat); + return mat; +} + +//! Shortest-arc quaternion that rotates vector v1 into vector v2 +const Quaternion RotateBetween(const Vector3D& v1, const Vector3D& v2); + +inline const Quaternion QuaternionConjugate(const Quaternion& p) +{ + Quaternion q; + QuaternionConjugate(p, q); + return q; +} + +inline const Quaternion QuaternionInvert(const Quaternion& p) +{ + Quaternion q; + QuaternionInvert(p, q); + return q; +} + + + + + +/// Actual quaternion multiplication; NOTE: QuaternionMult aligns quaternions first, so that q * +/// conjugate(q) may be -1 instead of 1! +inline const Quaternion operator * (const Quaternion& p, const Quaternion& q) +{ + Quaternion qt; + qt.x = p.x * q.w + p.y * q.z - p.z * q.y + p.w * q.x; + qt.y = -p.x * q.z + p.y * q.w + p.z * q.x + p.w * q.y; + qt.z = p.x * q.y - p.y * q.x + p.z * q.w + p.w * q.z; + qt.w = -p.x * q.x - p.y * q.y - p.z * q.z + p.w * q.w; + return qt; +} + +inline Quaternion& operator *= (Quaternion& p, const Quaternion& q) +{ + QuaternionMult(p, q, p); + return p; +} + +inline const matrix3x4_t ConcatTransforms(const matrix3x4_t& in1, const matrix3x4_t& in2) +{ + matrix3x4_t out; + ConcatTransforms(in1, in2, out); + return out; +} + +inline const matrix3x4_t operator *(const matrix3x4_t& in1, const matrix3x4_t& in2) +{ + matrix3x4_t out; + ConcatTransforms(in1, in2, out); + return out; +} + + +inline const matrix3x4_t MatrixInvert(const matrix3x4_t& in) +{ + matrix3x4_t out; + ::MatrixInvert(in, out); + return out; +} + +inline const Vector3D MatrixGetColumn(const matrix3x4_t& in, MatrixAxisType_t nColumn) +{ + return in.GetColumn(nColumn); +} + +// A couple methods to find the dot product of a vector with a matrix row or column... 
inline float MatrixRowDotProduct(const matrix3x4_t& in1, int row, const Vector3D& in2) { Assert((row >= 0) && (row < 3)); @@ -755,7 +1069,7 @@ static inline float FLerp(float f1, float f2, float i1, float i2, float x) } -#ifndef Vector_NO_SLOW_OPERATIONS +#ifndef VECTOR_NO_SLOW_OPERATIONS // YWB: Specialization for interpolating euler angles via quaternions... template<> FORCEINLINE QAngle Lerp(float flPercent, const QAngle& q1, const QAngle& q2) @@ -809,7 +1123,7 @@ template<> FORCEINLINE QAngleByValue Lerp(float flPercent, const return output; } -#endif // Vector_NO_SLOW_OPERATIONS +#endif // VECTOR_NO_SLOW_OPERATIONS // Swap two of anything. @@ -829,7 +1143,7 @@ template FORCEINLINE T AVG(T a, T b) // number of elements in an array of static size #define NELEMS(x) ((sizeof(x))/sizeof(x[0])) -// XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myVector)); +// XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myvector)); #define XYZ(v) (v).x,(v).y,(v).z @@ -897,12 +1211,13 @@ int InsideOut(int nTotal, int nCounter); BoxOnPlaneSide( (emins), (emaxs), (p))) //----------------------------------------------------------------------------- -// FIXME: Vector3D versions.... the float versions will go away hopefully soon! +// FIXME: Vector versions.... the float versions will go away hopefully soon! 
//----------------------------------------------------------------------------- void AngleVectors(const QAngle& angles, Vector3D* forward); void AngleVectors(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up); void AngleVectorsTranspose(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up); +void AngleVectorsFLU(const QAngle& angles, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); void AngleMatrix(const QAngle& angles, matrix3x4_t& mat); void AngleMatrix(const QAngle& angles, const Vector3D& position, matrix3x4_t& mat); void AngleMatrix(const RadianEuler& angles, matrix3x4_t& mat); @@ -996,13 +1311,38 @@ inline void VectorTransform(const Vector3D& in1, const matrix3x4_t& in2, Vector3 VectorTransform(&in1.x, in2, &out.x); } +// MSVC folds the return value nicely and creates no temporaries on the stack, +// we need more experiments with different compilers and in different circumstances +inline const Vector3D VectorTransform(const Vector3D& in1, const matrix3x4_t& in2) +{ + Vector3D out; + VectorTransform(in1, in2, out); + return out; +} + +inline const Vector3D VectorRotate(const Vector3D& in1, const matrix3x4_t& in2) +{ + Vector3D out; + VectorRotate(in1, in2, out); + return out; +} + + + inline void VectorITransform(const Vector3D& in1, const matrix3x4_t& in2, Vector3D& out) { VectorITransform(&in1.x, in2, &out.x); } +inline const Vector3D VectorITransform(const Vector3D& in1, const matrix3x4_t& in2) +{ + Vector3D out; + VectorITransform(in1, in2, out); + return out; +} + /* -inline void DecomposeRotation( const matrix3x4_t &mat, Vector3D &out ) +inline void DecomposeRotation( const matrix3x4_t &mat, Vector &out ) { DecomposeRotation( mat, &out.x ); } @@ -1110,7 +1450,9 @@ void BuildGammaTable(float gamma, float texGamma, float brightness, int overbrig // convert texture to linear 0..1 value inline float TexLightToLinear(int c, int exponent) { - extern float power2_n[256]; + // On VS 2013 LTCG builds it is required that 
the array declaration be annotated with + // the same alignment requirements as the array definition. + extern ALIGN128 float power2_n[256]; Assert(exponent >= -128 && exponent <= 127); return (float)c * power2_n[exponent + 128]; } @@ -1129,8 +1471,8 @@ struct ColorRGBExp32 signed char exponent; }; -void ColorRGBExp32ToVector3D(const ColorRGBExp32& in, Vector3D& out); -void Vector3DToColorRGBExp32(const Vector3D& v, ColorRGBExp32& c); +void ColorRGBExp32ToVector(const ColorRGBExp32& in, Vector3D& out); +void VectorToColorRGBExp32(const Vector3D& v, ColorRGBExp32& c); // solve for "x" where "a x^2 + b x + c = 0", return true if solution exists bool SolveQuadratic(float a, float b, float c, float& root1, float& root2); @@ -1151,7 +1493,7 @@ bool SolveInverseQuadraticMonotonic(float x1, float y1, float x2, float y2, // solves for "a, b, c" where "1/(a x^2 + b x + c ) = y", return true if solution exists bool SolveInverseReciprocalQuadratic(float x1, float y1, float x2, float y2, float x3, float y3, float& a, float& b, float& c); -// rotate a Vector3D around the Z axis (YAW) +// rotate a vector around the Z axis (YAW) void VectorYawRotate(const Vector3D& in, float flYaw, Vector3D& out); @@ -1304,10 +1646,11 @@ inline float SimpleSplineRemapValClamped(float val, float A, float B, float C, f if (A == B) return val >= B ? 
D : C; float cVal = (val - A) / (B - A); - cVal = std::clamp(cVal, 0.0f, 1.0f); + cVal = clamp(cVal, 0.0f, 1.0f); return C + (D - C) * SimpleSpline(cVal); } + FORCEINLINE int RoundFloatToInt(float f) { #if defined( _X360 ) @@ -1322,7 +1665,13 @@ FORCEINLINE int RoundFloatToInt(float f) flResult = __fctiw(f); return pResult[1]; #elif defined ( _PS3 ) +#if defined(__SPU__) + int nResult; + nResult = static_cast(f); + return nResult; +#else return __fctiw(f); +#endif #else // !X360 int nResult; #if defined( COMPILER_MSVC32 ) @@ -1361,7 +1710,13 @@ FORCEINLINE unsigned char RoundFloatToByte(float f) return pResult[7]; #elif defined ( _PS3 ) +#if defined(__SPU__) + int nResult; + nResult = static_cast (f) & 0xff; + return nResult; +#else return __fctiw(f); +#endif #else // !X360 int nResult; @@ -1404,7 +1759,11 @@ FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f) Assert(pIntResult[1] >= 0); return pResult[1]; #elif defined ( _PS3 ) +#if defined(__SPU__) + return static_cast(f); +#else return __fctiw(f); +#endif #else // !X360 #if defined( COMPILER_MSVC32 ) @@ -1445,7 +1804,13 @@ FORCEINLINE int Float2Int(float a) flResult = __fctiwz(a); return pResult[1]; #elif defined ( _PS3 ) +#if defined(__SPU__) + int RetVal; + RetVal = static_cast(a); + return RetVal; +#else return __fctiwz(a); +#endif #else // !X360 int RetVal; @@ -1473,6 +1838,8 @@ FORCEINLINE int Float2Int(float a) #endif } + + // Over 15x faster than: (int)floor(value) inline int Floor2Int(float a) { @@ -1801,7 +2168,7 @@ float Hermite_Spline( float t); -void Hermite_SplineBasis(float t, float basis[]); +void Hermite_SplineBasis(float t, float basis[4]); void Hermite_Spline( const Quaternion& q0, @@ -1906,7 +2273,7 @@ float CubicBasis3(float t); // quintic interpolating polynomial from Perlin. 
// 0->0, 1->1, smooth-in between with smooth tangents -FORCEINLINE float QuinticInterpolatingPolynomial(float t) +inline float QuinticInterpolatingPolynomial(float t) { // 6t^5-15t^4+10t^3 return t * t * t * (t * (t * 6.0 - 15.0) + 10.0); @@ -1971,6 +2338,7 @@ bool MathLib_MMXEnabled(void); bool MathLib_SSEEnabled(void); bool MathLib_SSE2Enabled(void); +inline float Approach(float target, float value, float speed); float ApproachAngle(float target, float value, float speed); float AngleDiff(float destAngle, float srcAngle); float AngleDistance(float next, float cur); @@ -1987,7 +2355,7 @@ void RotationDelta(const QAngle& srcAngles, const QAngle& destAngles, QAngle* ou //----------------------------------------------------------------------------- // Clips a line segment such that only the portion in the positive half-space -// of the plane remains. If the segment is entirely clipped, the Vector3Ds +// of the plane remains. If the segment is entirely clipped, the vectors // are set to vec3_invalid (all components are FLT_MAX). // // flBias is added to the dot product with the normal. 
A positive bias @@ -1998,13 +2366,20 @@ void ClipLineSegmentToPlane(const Vector3D& vNormal, const Vector3D& vPlanePoint void ComputeTrianglePlane(const Vector3D& v1, const Vector3D& v2, const Vector3D& v3, Vector3D& normal, float& intercept); int PolyFromPlane(Vector3D* pOutVerts, const Vector3D& normal, float dist, float fHalfScale = 9000.0f); -//void PolyFromPlane_SIMD(fltx4* pOutVerts, const fltx4& plane, float fHalfScale = 9000.0f); +void PolyFromPlane_SIMD(fltx4* pOutVerts, const fltx4& plane, float fHalfScale = 9000.0f); int ClipPolyToPlane(Vector3D* inVerts, int vertCount, Vector3D* outVerts, const Vector3D& normal, float dist, float fOnPlaneEpsilon = 0.1f); -//int ClipPolyToPlane_SIMD(fltx4* pInVerts, int vertCount, fltx4* pOutVerts, const fltx4& plane, float fOnPlaneEpsilon = 0.1f); +int ClipPolyToPlane_SIMD(fltx4* pInVerts, int vertCount, fltx4* pOutVerts, const fltx4& plane, float fOnPlaneEpsilon = 0.1f); int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, const double* normal, double dist, double fOnPlaneEpsilon = 0.1); float TetrahedronVolume(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3); float TriangleArea(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2); +/// return surface area of an AABB +FORCEINLINE float BoxSurfaceArea(Vector3D const& vecBoxMin, Vector3D const& vecBoxMax) +{ + Vector3D boxdim = vecBoxMax - vecBoxMin; + return 2.0 * ((boxdim[0] * boxdim[2]) + (boxdim[0] * boxdim[1]) + (boxdim[1] * boxdim[2])); +} + //----------------------------------------------------------------------------- // Computes a reasonable tangent space for a triangle //----------------------------------------------------------------------------- @@ -2146,7 +2521,7 @@ FORCEINLINE unsigned int* PackNormal_HEND3N(float nx, float ny, float nz, unsign FORCEINLINE float* UnpackNormal_SHORT2(const unsigned int* pPackedNormal, float* pNormal, bool bIsTangent = FALSE) { - // Unpacks from Jason's 2-short 
format (fills in a 4th binormal-sign (+1/-1) value, if this is a tangent Vector3D) + // Unpacks from Jason's 2-short format (fills in a 4th binormal-sign (+1/-1) value, if this is a tangent vector) // FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits) short iX = (*pPackedNormal & 0x0000FFFF); @@ -2183,9 +2558,9 @@ FORCEINLINE float* UnpackNormal_SHORT2(const unsigned int* pPackedNormal, float* FORCEINLINE unsigned int* PackNormal_SHORT2(float nx, float ny, float nz, unsigned int* pPackedNormal, float binormalSign = +1.0f) { - // Pack a Vector3D (ASSUMED TO BE NORMALIZED) into Jason's 4-byte (SHORT2) format. + // Pack a vector (ASSUMED TO BE NORMALIZED) into Jason's 4-byte (SHORT2) format. // This simply reconstructs Z from X & Y. It uses the sign bits of the X & Y coords - // to reconstruct the sign of Z and, if this is a tangent Vector3D, the sign of the + // to reconstruct the sign of Z and, if this is a tangent vector, the sign of the // binormal (this is needed because tangent/binormal vectors are supposed to follow // UV gradients, but shaders reconstruct the binormal from the tangent and normal // assuming that they form a right-handed basis). 
@@ -2204,7 +2579,7 @@ FORCEINLINE unsigned int* PackNormal_SHORT2(float nx, float ny, float nz, unsign if (nz < 0.0f) nx = -nx; // Set the sign bit for z - ny *= binormalSign; // Set the sign bit for the binormal (use when encoding a tangent Vector3D) + ny *= binormalSign; // Set the sign bit for the binormal (use when encoding a tangent vector) // FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits), also use Float2Int() short sX = (short)nx; // signed short [1,32767] @@ -2278,7 +2653,7 @@ FORCEINLINE float* UnpackNormal_UBYTE4(const unsigned int* pPackedNormal, float* // See: http://www.oroboro.com/rafael/docserv.php/index/programming/article/unitv2 // // UBYTE4 encoding, using per-octant projection onto x+y+z=1 -// Assume input Vector3D is already unit length +// Assume input vector is already unit length // // binormalSign specifies 'sign' of binormal, stored in t sign bit of tangent // (lets the shader know whether norm/tan/bin form a right-handed basis) @@ -2359,7 +2734,7 @@ FORCEINLINE void RGB2YUV(int& nR, int& nG, int& nB, float& fY, float& fU, float& dX = 2 * (fU - 0.5f); dY = 2 * (fV - 0.5f); sat = sqrtf(dX * dX + dY * dY); - sat = clamp((int)(sat * (1 + SNAP_TO_GREY) - SNAP_TO_GREY), 0, 1); + sat = clamp((sat * (1 + SNAP_TO_GREY) - SNAP_TO_GREY), 0.f, 1.f); scale = (sat == 0) ? 
0 : MIN((sqrtf(sat) / sat), 4.0f);
     fU = 0.5f + scale * (fU - 0.5f);
     fV = 0.5f + scale * (fV - 0.5f);
@@ -2445,6 +2820,21 @@ inline bool AlmostEqual(const Vector3D& a, const Vector3D& b, int maxUlps = 10)
         AlmostEqual(a.z, b.z, maxUlps);
 }
 
+inline Vector3D Approach(Vector3D target, Vector3D value, float speed)
+{
+    Vector3D diff = (target - value);
+    float delta = diff.Length();
+
+    if (delta > speed)
+        value += diff.Normalized() * speed;
+    else if (delta < -speed)
+        value -= diff.Normalized() * speed;
+    else
+        value = target;
+
+    return value;
+}
+
 inline float Approach(float target, float value, float speed)
 {
     float delta = target - value;
@@ -2472,6 +2862,20 @@ inline float Approach(float target, float value, float speed)
 #endif
 }
 
+
+// return a 0..1 value based on the position of x between edge0 and edge1
+inline float smoothstep_bounds(float edge0, float edge1, float x)
+{
+    x = clamp(static_cast<float>((x - edge0) / (edge1 - edge0)), 0, 1);
+    return x * x * (3 - 2 * x);
+}
+
+// return a value between edge0 and edge1 based on the 0..1 value of x
+inline float interpstep(float edge0, float edge1, float x)
+{
+    return edge0 + (x * (edge1 - edge0));
+}
+
 // on PPC we can do this truncate without converting to int
 #if defined(_X360) || defined(_PS3)
 inline double TruncateFloatToIntAsFloat(double flVal)
 {
 #if defined(_X360)
     double flIntFormat = __fctiwz(flVal);
     return __fcfid(flIntFormat);
 #elif defined(_PS3)
+#if defined(__SPU__)
+    int iVal = int(flVal);
+    return static_cast<double>(iVal);
+#else
     double flIntFormat = __builtin_fctiwz(flVal);
     return __builtin_fcfid(flIntFormat);
 #endif
+#endif
 }
 #endif
@@ -2494,5 +2903,231 @@ inline double SubtractIntegerPart(double flVal)
     return flVal - int(flVal);
 #endif
 }
+
+
+inline void matrix3x4_t::InitFromQAngles(const QAngle& angles, const Vector3D& vPosition)
+{
+    AngleMatrix(angles, vPosition, *this);
+}
+inline void matrix3x4_t::InitFromQAngles(const QAngle& angles) { 
InitFromQAngles(angles, vec3_origin); } + +inline void matrix3x4_t::InitFromRadianEuler(const RadianEuler& angles, const Vector3D& vPosition) +{ + AngleMatrix(angles, vPosition, *this); +} + +inline void matrix3x4_t::InitFromRadianEuler(const RadianEuler& angles) { InitFromRadianEuler(angles, vec3_origin); } + +inline void matrix3x4_t::InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition) +{ + QuaternionMatrix(orientation, vPosition, *this); +} + +inline void matrix3x4_t::InitFromDiagonal(const Vector3D& vDiagonal) +{ + SetToIdentity(); + m_flMatVal[0][0] = vDiagonal.x; + m_flMatVal[1][1] = vDiagonal.y; + m_flMatVal[2][2] = vDiagonal.z; +} + + +inline void matrix3x4_t::InitFromQuaternion(const Quaternion& orientation) { InitFromQuaternion(orientation, vec3_origin); } + +inline Quaternion matrix3x4_t::ToQuaternion() const +{ + return MatrixQuaternion(*this); +} + +inline QAngle matrix3x4_t::ToQAngle() const +{ + QAngle tmp; + MatrixAngles(*this, tmp); + return tmp; +} + +inline void matrix3x4_t::SetToIdentity() +{ + SetIdentityMatrix(*this); +} + +inline bool matrix3x4_t::IsEqualTo(const matrix3x4_t& other, float flTolerance) const +{ + return MatricesAreEqual(*this, other, flTolerance); +} + +inline void matrix3x4_t::GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) const +{ + return MatrixVectorsFLU(*this, pForward, pLeft, pUp); +} + +inline Vector3D matrix3x4_t::TransformVector(const Vector3D& v0) const +{ + return VectorTransform(v0, *this); +} + +inline Vector3D matrix3x4_t::RotateVector(const Vector3D& v0) const +{ + return VectorRotate(v0, *this); +} + +inline Vector3D matrix3x4_t::TransformVectorByInverse(const Vector3D& v0) const +{ + return VectorITransform(v0, *this); +} + +inline Vector3D matrix3x4_t::RotateVectorByInverse(const Vector3D& v0) const +{ + Vector3D tmp; + VectorIRotate(v0, *this, tmp); + return tmp; +} + +inline Vector3D matrix3x4_t::RotateExtents(const Vector3D& vBoxExtents) const +{ + return 
Vector3D(DotProductAbs(vBoxExtents, m_flMatVal[0]), DotProductAbs(vBoxExtents, m_flMatVal[1]), DotProductAbs(vBoxExtents, m_flMatVal[2])); +} + +inline Vector3D matrix3x4_t::GetColumn(MatrixAxisType_t nColumn) const +{ + return Vector3D(m_flMatVal[0][nColumn], m_flMatVal[1][nColumn], m_flMatVal[2][nColumn]); +} + +inline void matrix3x4_t::SetColumn(const Vector3D& vColumn, MatrixAxisType_t nColumn) +{ + m_flMatVal[0][nColumn] = vColumn.x; + m_flMatVal[1][nColumn] = vColumn.y; + m_flMatVal[2][nColumn] = vColumn.z; +} + +inline void matrix3x4_t::InverseTR(matrix3x4_t& out) const +{ + ::MatrixInvert(*this, out); +} + +inline matrix3x4_t matrix3x4_t::InverseTR() const +{ + matrix3x4_t out; + ::MatrixInvert(*this, out); + return out; +} + +inline void matrix3x4_t::TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::TransformAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void matrix3x4_t::TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::ITransformAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void matrix3x4_t::RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::RotateAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} +inline void matrix3x4_t::RotateAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ::IRotateAABB(*this, vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void matrix3x4_t::TransformPlane(const cplane_t& inPlane, cplane_t& outPlane) const +{ + ::MatrixTransformPlane(*this, inPlane, outPlane); +} +inline void matrix3x4_t::TransformPlaneByInverse(const cplane_t& inPlane, cplane_t& outPlane) const +{ + ::MatrixITransformPlane(*this, inPlane, outPlane); +} + +inline float 
matrix3x4_t::GetOrthogonalityError() const +{ + return + fabsf(m_flMatVal[0][0] * m_flMatVal[0][1] + m_flMatVal[1][0] * m_flMatVal[1][1] + m_flMatVal[2][0] * m_flMatVal[2][1]) + + fabsf(m_flMatVal[0][1] * m_flMatVal[0][2] + m_flMatVal[1][1] * m_flMatVal[1][2] + m_flMatVal[2][1] * m_flMatVal[2][2]) + + fabsf(m_flMatVal[0][2] * m_flMatVal[0][0] + m_flMatVal[1][2] * m_flMatVal[1][0] + m_flMatVal[2][2] * m_flMatVal[2][0]); +} + +inline matrix3x4_t Quaternion::ToMatrix() const +{ + matrix3x4_t mat; + mat.InitFromQuaternion(*this); + return mat; +} + +inline matrix3x4_t QAngle::ToMatrix() const +{ + matrix3x4_t mat; + AngleMatrix(*this, mat); + return mat; +} + +inline Quaternion QAngle::ToQuaternion() const +{ + return AngleQuaternion(*this); +} + +inline float matrix3x4_t::GetDeterminant() const +{ + return + m_flMatVal[0][0] * (m_flMatVal[1][1] * m_flMatVal[2][2] - m_flMatVal[2][1] * m_flMatVal[1][2]) + - m_flMatVal[0][1] * (m_flMatVal[1][0] * m_flMatVal[2][2] - m_flMatVal[1][2] * m_flMatVal[2][0]) + + m_flMatVal[0][2] * (m_flMatVal[1][0] * m_flMatVal[2][1] - m_flMatVal[1][1] * m_flMatVal[2][0]); +} + +inline float GetRelativeDifferenceSqr(const Vector3D& a, const Vector3D& b) +{ + return (a - b).LengthSqr() / Max(1.0f, Max(a.LengthSqr(), b.LengthSqr())); +} + + +inline float GetRelativeDifference(const Vector3D& a, const Vector3D& b) +{ + return sqrtf(GetRelativeDifferenceSqr(a, b)); +} + + +// a good measure of relative error between two TR matrices, perhaps with a reasonable scale +inline float GetRelativeDifference(const matrix3x4_t& a, const matrix3x4_t& b) +{ + return sqrtf(Max(Max(GetRelativeDifferenceSqr(a.GetColumn(X_AXIS), b.GetColumn(X_AXIS)), + GetRelativeDifferenceSqr(a.GetColumn(Y_AXIS), b.GetColumn(Y_AXIS))), + Max(GetRelativeDifferenceSqr(a.GetColumn(Z_AXIS), b.GetColumn(Z_AXIS)), + GetRelativeDifferenceSqr(a.GetOrigin(), b.GetOrigin())) + ) + ); +} + + + +inline float matrix3x4_t::GetSylvestersCriterion()const +{ + // 
http://en.wikipedia.org/wiki/Sylvester%27s_criterion
+    float flDet1 = m_flMatVal[0][0];
+    float flDet2 = m_flMatVal[0][0] * m_flMatVal[1][1] - m_flMatVal[1][0] * m_flMatVal[0][1];
+    float flDet3 = GetDeterminant();
+    return MIN(MIN(flDet1, flDet2), flDet3);
+}
+
+
+
+// Generate the corner points of a box:
+//      +y  _+z
+//      ^   /|
+//      |  /
+//      | 3---7
+//     /| /|
+//    / | / |
+//   2---6  |
+//   |  1|--5
+//   | /  | /
+//   |/   |/
+//   0---4  --> +x
+//
+void PointsFromBox(const Vector3D& mins, const Vector3D& maxs, Vector3D* points);
+void BuildTransformedBox(Vector3D* v2, Vector3D const& bbmin, Vector3D const& bbmax, const matrix3x4_t& m);
+
+
+
 #endif // MATH_BASE_H
diff --git a/r5dev/mathlib/mathlib_base.cpp b/r5dev/mathlib/mathlib_base.cpp
index 25bf1462..0d4ce678 100644
--- a/r5dev/mathlib/mathlib_base.cpp
+++ b/r5dev/mathlib/mathlib_base.cpp
@@ -1,4 +1,4 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
+//===== Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ======//
 //
 // Purpose: Math primitives.
// @@ -7,30 +7,38 @@ /// FIXME: As soon as all references to mathlib.c are gone, include it in here #include "core/stdafx.h" -#include -#include // Needed for FLT_EPSILON #include "tier0/basetypes.h" -#include +//#include #include "tier0/dbg.h" +#include "tier0/cpu.h" +//#include "tier0/vprof.h" //#define _VPROF_MATHLIB +#if !defined(__SPU__) #pragma warning(disable:4244) // "conversion from 'const int' to 'float', possible loss of data" #pragma warning(disable:4730) // "mixing _m64 and floating point expressions may result in incorrect code" +#endif -#include "mathlib/bits.h" -#include "mathlib/vplane.h" -#include "mathlib/Vector.h" -#include "mathlib/Vector2d.h" #include "mathlib/mathlib.h" +#include "mathlib/vector.h" +#include "mathlib/vplane.h" +#if !defined(__SPU__) +#include "mathlib/vmatrix.h" +#endif + +#if !defined( _X360 ) +//#include "sse.h" +#endif #include "mathlib/ssemath.h" -#include "mathlib/math_pfns.h" -#include +#include "mathlib/ssequaternion.h" + +// memdbgon must be the last include file in a .cpp file!!! 
+//#include "tier0/memdbgon.h" bool s_bMathlibInitialized = false; - #ifdef PARANOID // User must provide an implementation of Sys_Error() void Sys_Error(char* error, ...); @@ -38,9 +46,17 @@ void Sys_Error(char* error, ...); const Vector3D vec3_origin(0, 0, 0); const QAngle vec3_angle(0, 0, 0); +const Quaternion quat_identity(0, 0, 0, 1); const Vector3D vec3_invalid(FLT_MAX, FLT_MAX, FLT_MAX); const int nanmask = 255 << 23; +const matrix3x4a_t g_MatrixIdentity( + 1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0 +); + +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Standard C implementations of optimized routines: //----------------------------------------------------------------------------- @@ -57,10 +73,11 @@ float _rsqrtf(float x) return 1.f / _sqrtf(x); } -float FASTCALL _VectorNormalize(Vector3D& vec) +#ifndef PLATFORM_PPC +float VectorNormalize(Vector3D& vec) { #ifdef _VPROF_MATHLIB - VPROF_BUDGET("_Vector3Normalize", "Mathlib"); + VPROF_BUDGET("_VectorNormalize", "Mathlib"); #endif Assert(s_bMathlibInitialized); float radius = sqrtf(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z); @@ -74,6 +91,8 @@ float FASTCALL _VectorNormalize(Vector3D& vec) return radius; } +#endif + // TODO: Add fast C VectorNormalizeFast. // Perhaps use approximate rsqrt trick, if the accuracy isn't too bad. @@ -97,17 +116,11 @@ float _InvRSquared(const float* v) return r2 < 1.f ? 
1.f : 1 / r2; } +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Function pointers selecting the appropriate implementation //----------------------------------------------------------------------------- -float (*pfSqrt)(float x) = _sqrtf; -float (*pfRSqrt)(float x) = _rsqrtf; -float (*pfRSqrtFast)(float x) = _rsqrtf; -float (FASTCALL* pfVectorNormalize)(Vector3D& v) = _VectorNormalize; void (FASTCALL* pfVectorNormalizeFast)(Vector3D& v) = _VectorNormalizeFast; -float (*pfInvRSquared)(const float* v) = _InvRSquared; -void (*pfFastSinCos)(float x, float* s, float* c) = SinCos; -float (*pfFastCos)(float x) = cosf; float SinCosTable[SIN_TABLE_SIZE]; void InitSinCosTable() @@ -117,6 +130,8 @@ void InitSinCosTable() SinCosTable[i] = sin(i * 2.0 * M_PI / SIN_TABLE_SIZE); } } +#endif // !defined(__SPU__) + qboolean VectorsEqual(const float* v1, const float* v2) { @@ -125,11 +140,11 @@ qboolean VectorsEqual(const float* v1, const float* v2) (v1[1] == v2[1]) && (v1[2] == v2[2])); } - +#endif // #if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Generates Euler angles given a left-handed orientation matrix. The -// columns of the matrix contain the forward, left, and up Vector3s. +// columns of the matrix contain the forward, left, and up vectors. // Input : matrix - Left-handed orientation matrix. // angles[PITCH, YAW, ROLL]. Receives right-handed counterclockwise // rotations in degrees around Y, Z, and X respectively. @@ -210,8 +225,8 @@ void MatrixAngles(const matrix3x4_t& matrix, float* angles) float up[3]; // - // Extract the basis Vector3s from the matrix. Since we only need the Z - // component of the up Vector3, we don't get X and Y. + // Extract the basis vectors from the matrix. Since we only need the Z + // component of the up vector, we don't get X and Y. 
// forward[0] = matrix[0][0]; forward[1] = matrix[1][0]; @@ -248,15 +263,39 @@ void MatrixAngles(const matrix3x4_t& matrix, float* angles) } } +Vector3D MatrixNormalize(const matrix3x4_t& in, matrix3x4_t& out) +{ + Vector3D vScale; + vScale.x = sqrt(in[0][0] * in[0][0] + in[1][0] * in[1][0] + in[2][0] * in[2][0]); + vScale.y = sqrt(in[0][1] * in[0][1] + in[1][1] * in[1][1] + in[2][1] * in[2][1]); + vScale.z = sqrt(in[0][2] * in[0][2] + in[1][2] * in[1][2] + in[2][2] * in[2][2]); + matrix3x4_t norm; + float flInvScaleX = 1.0f / vScale.x; + float flInvScaleY = 1.0f / vScale.y; + float flInvScaleZ = 1.0f / vScale.z; + out[0][0] = in[0][0] * flInvScaleX; out[1][0] = in[1][0] * flInvScaleX; out[2][0] = in[2][0] * flInvScaleX; + out[0][1] = in[0][1] * flInvScaleY; out[1][1] = in[1][1] * flInvScaleY; out[2][1] = in[2][1] * flInvScaleY; + out[0][2] = in[0][2] * flInvScaleZ; out[1][2] = in[1][2] * flInvScaleZ; out[2][2] = in[2][2] * flInvScaleZ; + out[0][3] = in[0][3]; out[1][3] = in[1][3]; out[2][3] = in[2][3]; + + return vScale; +} + + + +#if !defined(__SPU__) // transform in1 by the matrix in2 -void VectorTransform(const float* in1, const matrix3x4_t& in2, float* out) +void VectorTransform(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out) { Assert(s_bMathlibInitialized); - Assert(in1 != out); - out[0] = DotProduct(in1, in2[0]) + in2[0][3]; - out[1] = DotProduct(in1, in2[1]) + in2[1][3]; - out[2] = DotProduct(in1, in2[2]) + in2[2][3]; + float x = DotProduct(in1, in2[0]) + in2[0][3]; + float y = DotProduct(in1, in2[1]) + in2[1][3]; + float z = DotProduct(in1, in2[2]) + in2[2][3]; + + out[0] = x; + out[1] = y; + out[2] = z; } @@ -270,23 +309,31 @@ void VectorITransform(const float* in1, const matrix3x4_t& in2, float* out) in1t[1] = in1[1] - in2[1][3]; in1t[2] = in1[2] - in2[2][3]; - out[0] = in1t[0] * in2[0][0] + in1t[1] * in2[1][0] + in1t[2] * in2[2][0]; - out[1] = in1t[0] * in2[0][1] + in1t[1] * in2[1][1] + in1t[2] * in2[2][1]; - out[2] = in1t[0] * 
in2[0][2] + in1t[1] * in2[1][2] + in1t[2] * in2[2][2]; + float x = in1t[0] * in2[0][0] + in1t[1] * in2[1][0] + in1t[2] * in2[2][0]; + float y = in1t[0] * in2[0][1] + in1t[1] * in2[1][1] + in1t[2] * in2[2][1]; + float z = in1t[0] * in2[0][2] + in1t[1] * in2[1][2] + in1t[2] * in2[2][2]; + + out[0] = x; + out[1] = y; + out[2] = z; } +#endif // #if !defined(__SPU__) - -// assume in2 is a rotation and rotate the input Vector3D -void VectorRotate(const float* in1, const matrix3x4_t& in2, float* out) +// assume in2 is a rotation and rotate the input vector +void VectorRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out) { Assert(s_bMathlibInitialized); - Assert(in1 != out); - out[0] = DotProduct(in1, in2[0]); - out[1] = DotProduct(in1, in2[1]); - out[2] = DotProduct(in1, in2[2]); + float x = DotProduct(in1, in2[0]); + float y = DotProduct(in1, in2[1]); + float z = DotProduct(in1, in2[2]); + + out[0] = x; + out[1] = y; + out[2] = z; } -// assume in2 is a rotation and rotate the input Vector3D +#if !defined(__SPU__) +// assume in2 is a rotation and rotate the input vector void VectorRotate(const Vector3D& in1, const QAngle& in2, Vector3D& out) { matrix3x4_t matRotate; @@ -294,17 +341,38 @@ void VectorRotate(const Vector3D& in1, const QAngle& in2, Vector3D& out) VectorRotate(in1, matRotate, out); } -// assume in2 is a rotation and rotate the input Vector3D +// assume in2 is a rotation and rotate the input vector void VectorRotate(const Vector3D& in1, const Quaternion& in2, Vector3D& out) { +#if WE_WANT_OUR_CODE_TO_BE_POINTLESSLY_SLOW matrix3x4_t matRotate; QuaternionMatrix(in2, matRotate); VectorRotate(in1, matRotate, out); +#else + // rotation is q * v * q^-1 + + Quaternion conjugate = in2.Conjugate(); + + + // do the rotation as unrolled flop code ( QuaternionMult is a function call, which murders instruction scheduling ) + // first q*v + Quaternion temp; + temp.x = in2.y * in1.z - in2.z * in1.y + in2.w * in1.x; + temp.y = -in2.x * in1.z + in2.z * 
in1.x + in2.w * in1.y; + temp.z = in2.x * in1.y - in2.y * in1.x + in2.w * in1.z; + temp.w = -in2.x * in1.x - in2.y * in1.y - in2.z * in1.z; + + // now (qv)(q*) + out.x = temp.x * conjugate.w + temp.y * conjugate.z - temp.z * conjugate.y + temp.w * conjugate.x; + out.y = -temp.x * conjugate.z + temp.y * conjugate.w + temp.z * conjugate.x + temp.w * conjugate.y; + out.z = temp.x * conjugate.y - temp.y * conjugate.x + temp.z * conjugate.w + temp.w * conjugate.z; + Assert(fabs(-temp.x * conjugate.x - temp.y * conjugate.y - temp.z * conjugate.z + temp.w * conjugate.w) < 0.0001); +#endif } // rotate by the inverse of the matrix -void VectorIRotate(const float* in1, const matrix3x4_t& in2, float* out) +void VectorIRotate(const float* RESTRICT in1, const matrix3x4_t& in2, float* RESTRICT out) { Assert(s_bMathlibInitialized); Assert(in1 != out); @@ -313,7 +381,7 @@ void VectorIRotate(const float* in1, const matrix3x4_t& in2, float* out) out[2] = in1[0] * in2[0][2] + in1[1] * in2[1][2] + in1[2] * in2[2][2]; } -#ifndef Vector_NO_SLOW_OPERATIONS +#ifndef VECTOR_NO_SLOW_OPERATIONS // transform a set of angles in the output space of parentMatrix to the input space QAngle TransformAnglesToLocalSpace(const QAngle& angles, const matrix3x4_t& parentMatrix) { @@ -338,7 +406,7 @@ QAngle TransformAnglesToWorldSpace(const QAngle& angles, const matrix3x4_t& pare return out; } -#endif // Vector3D_NO_SLOW_OPERATIONS +#endif // VECTOR_NO_SLOW_OPERATIONS void MatrixInitialize(matrix3x4_t& mat, const Vector3D& vecOrigin, const Vector3D& vecXAxis, const Vector3D& vecYAxis, const Vector3D& vecZAxis) { @@ -369,6 +437,8 @@ bool MatricesAreEqual(const matrix3x4_t& src1, const matrix3x4_t& src2, float fl } return true; } +#endif // #if !defined(__SPU__) + // NOTE: This is just the transpose not a general inverse void MatrixInvert(const matrix3x4_t& in, matrix3x4_t& out) @@ -421,34 +491,7 @@ void MatrixSetColumn(const Vector3D& in, int column, matrix3x4_t& out) out[2][column] = in.z; } -void 
MatrixScaleBy(const float flScale, matrix3x4_t& out) -{ - out[0][0] *= flScale; - out[1][0] *= flScale; - out[2][0] *= flScale; - out[0][1] *= flScale; - out[1][1] *= flScale; - out[2][1] *= flScale; - out[0][2] *= flScale; - out[1][2] *= flScale; - out[2][2] *= flScale; -} - -void MatrixScaleByZero(matrix3x4_t& out) -{ - out[0][0] = 0.0f; - out[1][0] = 0.0f; - out[2][0] = 0.0f; - out[0][1] = 0.0f; - out[1][1] = 0.0f; - out[2][1] = 0.0f; - out[0][2] = 0.0f; - out[1][2] = 0.0f; - out[2][2] = 0.0f; -} - - - +#if !defined(__SPU__) int VectorCompare(const float* v1, const float* v2) { Assert(s_bMathlibInitialized); @@ -471,15 +514,28 @@ void CrossProduct(const float* v1, const float* v2, float* cross) cross[2] = v1[0] * v2[1] - v1[1] * v2[0]; } -int Q_log2(int val) +size_t Q_log2(unsigned int val) { +#ifdef _X360 // use hardware + // both zero and one return zero (per old implementation) + return (val == 0) ? 0 : 31 - _CountLeadingZeros(val); +#else // use N. Compoop's algorithm ( inherited from days of yore ) int answer = 0; while (val >>= 1) answer++; return answer; +#endif } -// Matrix is right-handed x=forward, y=left, z=up. We a left-handed convention for Vector3Ds in the game code (forward, right, up) +// Matrix is right-handed x=forward, y=left, z=up. We a left-handed convention for vectors in the game code (forward, right, up) +void MatrixVectorsFLU(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + MatrixGetColumn(matrix, FORWARD_AXIS, *pForward); + MatrixGetColumn(matrix, LEFT_AXIS, *pLeft); + MatrixGetColumn(matrix, UP_AXIS, *pUp); +} + +// Matrix is right-handed x=forward, y=left, z=up. 
We a left-handed convention for vectors in the game code (forward, right, up) void MatrixVectors(const matrix3x4_t& matrix, Vector3D* pForward, Vector3D* pRight, Vector3D* pUp) { MatrixGetColumn(matrix, 0, *pForward); @@ -494,7 +550,7 @@ void VectorVectors(const Vector3D& forward, Vector3D& right, Vector3D& up) Assert(s_bMathlibInitialized); Vector3D tmp; - if (forward[0] == 0 && forward[1] == 0) + if (fabs(forward[0]) < 1e-6 && fabs(forward[1]) < 1e-6) { // pitch 90 degrees up/down from identity right[0] = 0; @@ -525,6 +581,62 @@ void VectorMatrix(const Vector3D& forward, matrix3x4_t& matrix) MatrixSetColumn(up, 2, matrix); } +void VectorPerpendicularToVector(Vector3D const& in, Vector3D* pvecOut) +{ + float flY = in.y * in.y; + pvecOut->x = RemapVal(flY, 0, 1, in.z, 1); + pvecOut->y = 0; + pvecOut->z = -in.x; + pvecOut->NormalizeInPlace(); + float flDot = DotProduct(*pvecOut, in); + *pvecOut -= flDot * in; + pvecOut->NormalizeInPlace(); +} + +//----------------------------------------------------------------------------- +// Euler QAngle -> Basis Vectors. 
Each vector is optional +//----------------------------------------------------------------------------- +void AngleVectorsFLU(const QAngle& angles, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + Assert(s_bMathlibInitialized); + + float sr, sp, sy, cr, cp, cy; + +#ifdef _X360 + fltx4 radians, scale, sine, cosine; + radians = LoadUnaligned3SIMD(angles.Base()); + scale = ReplicateX4(M_PI_F / 180.f); + radians = MulSIMD(radians, scale); + SinCos3SIMD(sine, cosine, radians); + sp = SubFloat(sine, 0); sy = SubFloat(sine, 1); sr = SubFloat(sine, 2); + cp = SubFloat(cosine, 0); cy = SubFloat(cosine, 1); cr = SubFloat(cosine, 2); +#else + SinCos(DEG2RAD(angles[YAW]), &sy, &cy); + SinCos(DEG2RAD(angles[PITCH]), &sp, &cp); + SinCos(DEG2RAD(angles[ROLL]), &sr, &cr); +#endif + + if (pForward) + { + (*pForward)[FORWARD_AXIS] = cp * cy; + (*pForward)[LEFT_AXIS] = cp * sy; + (*pForward)[UP_AXIS] = -sp; + } + + if (pLeft) + { + (*pLeft)[FORWARD_AXIS] = (sr * sp * cy + cr * -sy); + (*pLeft)[LEFT_AXIS] = (sr * sp * sy + cr * cy); + (*pLeft)[UP_AXIS] = sr * cp; + } + + if (pUp) + { + (*pUp)[FORWARD_AXIS] = (cr * sp * cy + -sr * -sy); + (*pUp)[LEFT_AXIS] = (cr * sp * sy + -sr * cy); + (*pUp)[UP_AXIS] = cr * cp; + } +} void VectorAngles(const float* forward, float* angles) { @@ -562,7 +674,7 @@ void VectorAngles(const float* forward, float* angles) R_ConcatRotations ================ */ -void ConcatRotations(const float in1[3][3], const float in2[3][3], float out[3][3]) +void ConcatRotations(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out) { Assert(s_bMathlibInitialized); Assert(in1 != out); @@ -586,12 +698,14 @@ void ConcatRotations(const float in1[3][3], const float in2[3][3], float out[3][ out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] + in1[2][2] * in2[2][2]; } +#endif // #if !defined(__SPU__) -void ConcatTransforms_Aligned(const matrix3x4_t& m0, const matrix3x4_t& m1, matrix3x4_t& out) + +void ConcatTransforms_Aligned(const matrix3x4a_t& m0, 
const matrix3x4a_t& m1, matrix3x4a_t& out) { - Assert((((size_t)&m0) % 16) == 0); - Assert((((size_t)&m1) % 16) == 0); - Assert((((size_t)&out) % 16) == 0); + //AssertAligned(&m0); + //AssertAligned(&m1); + //AssertAligned(&out); fltx4 lastMask = *(fltx4*)(&g_SIMD_ComponentMask[3]); fltx4 rowA0 = LoadAlignedSIMD(m0.m_flMatVal[0]); @@ -630,7 +744,7 @@ void ConcatTransforms_Aligned(const matrix3x4_t& m0, const matrix3x4_t& m1, matr fltx4 mul22 = MulSIMD(A2, rowB2); fltx4 out2 = AddSIMD(mul20, AddSIMD(mul21, mul22)); - // add in translation Vector3D + // add in translation vector A0 = AndSIMD(rowA0, lastMask); A1 = AndSIMD(rowA1, lastMask); A2 = AndSIMD(rowA2, lastMask); @@ -697,7 +811,7 @@ void ConcatTransforms(const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_ fltx4 mul22 = MulSIMD(A2, rowB2); fltx4 out2 = AddSIMD(mul20, AddSIMD(mul21, mul22)); - // add in translation Vector3D + // add in translation vector A0 = AndSIMD(rowA0, lastMask); A1 = AndSIMD(rowA1, lastMask); A2 = AndSIMD(rowA2, lastMask); @@ -721,7 +835,7 @@ numer and denom, both of which should contain no fractional part. The quotient must fit in 32 bits. ==================== */ - +#if !defined(__SPU__) void FloorDivMod(double numer, double denom, int* quotient, int* rem) { @@ -889,7 +1003,7 @@ int __cdecl BoxOnPlaneSide(const float* emins, const float* emaxs, const cplane_ } //----------------------------------------------------------------------------- -// Euler QAngle -> Basis Vector3Ds +// Euler QAngle -> Basis Vectors //----------------------------------------------------------------------------- void AngleVectors(const QAngle& angles, Vector3D* forward) @@ -908,7 +1022,7 @@ void AngleVectors(const QAngle& angles, Vector3D* forward) } //----------------------------------------------------------------------------- -// Euler QAngle -> Basis Vector3Ds. Each Vector3D is optional +// Euler QAngle -> Basis Vectors. 
Each vector is optional //----------------------------------------------------------------------------- void AngleVectors(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up) { @@ -953,7 +1067,7 @@ void AngleVectors(const QAngle& angles, Vector3D* forward, Vector3D* right, Vect } //----------------------------------------------------------------------------- -// Euler QAngle -> Basis Vector3Ds transposed +// Euler QAngle -> Basis Vectors transposed //----------------------------------------------------------------------------- void AngleVectorsTranspose(const QAngle& angles, Vector3D* forward, Vector3D* right, Vector3D* up) @@ -988,7 +1102,7 @@ void AngleVectorsTranspose(const QAngle& angles, Vector3D* forward, Vector3D* ri } //----------------------------------------------------------------------------- -// Forward direction Vector3D -> Euler angles +// Forward direction vector -> Euler angles //----------------------------------------------------------------------------- void VectorAngles(const Vector3D& forward, QAngle& angles) @@ -1022,7 +1136,7 @@ void VectorAngles(const Vector3D& forward, QAngle& angles) } //----------------------------------------------------------------------------- -// Forward direction Vector3D with a reference up Vector3D -> Euler angles +// Forward direction vector with a reference up vector -> Euler angles //----------------------------------------------------------------------------- void VectorAngles(const Vector3D& forward, const Vector3D& pseudoup, QAngle& angles) @@ -1067,6 +1181,8 @@ void VectorAngles(const Vector3D& forward, const Vector3D& pseudoup, QAngle& ang } } +#endif // #if !defined(__SPU__) + void SetIdentityMatrix(matrix3x4_t& matrix) { memset(matrix.Base(), 0, sizeof(float) * 3 * 4); @@ -1076,6 +1192,7 @@ void SetIdentityMatrix(matrix3x4_t& matrix) } +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Builds a scale matrix 
//----------------------------------------------------------------------------- @@ -1154,13 +1271,13 @@ void MatrixTranspose(const matrix3x4_t& src, matrix3x4_t& dst) dst[1][0] = src[0][1]; dst[1][1] = src[1][1]; dst[1][2] = src[2][1]; dst[1][3] = 0.0f; dst[2][0] = src[0][2]; dst[2][1] = src[1][2]; dst[2][2] = src[2][2]; dst[2][3] = 0.0f; } - +#endif // #if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: converts engine euler angles into a matrix // Input : vec3_t angles - PITCH, YAW, ROLL // Output : *matrix - left-handed column matrix -// the basis Vector3Ds for the rotations will be in the columns as follows: +// the basis vectors for the rotations will be in the columns as follows: // matrix[][0] is forward // matrix[][1] is left // matrix[][2] is up @@ -1214,16 +1331,12 @@ void AngleMatrix(const QAngle& angles, matrix3x4_t& matrix) matrix[1][0] = cp * sy; matrix[2][0] = -sp; - float crcy = cr * cy; - float crsy = cr * sy; - float srcy = sr * cy; - float srsy = sr * sy; - matrix[0][1] = sp * srcy - crsy; - matrix[1][1] = sp * srsy + crcy; + // NOTE: Do not optimize this to reduce multiplies! optimizer bug will screw this up. 
+ matrix[0][1] = sr * sp * cy + cr * -sy; + matrix[1][1] = sr * sp * sy + cr * cy; matrix[2][1] = sr * cp; - - matrix[0][2] = (sp * crcy + srsy); - matrix[1][2] = (sp * crsy - srcy); + matrix[0][2] = (cr * sp * cy + -sr * -sy); + matrix[1][2] = (cr * sp * sy + -sr * cy); matrix[2][2] = cr * cp; matrix[0][3] = 0.0f; @@ -1231,6 +1344,7 @@ void AngleMatrix(const QAngle& angles, matrix3x4_t& matrix) matrix[2][3] = 0.0f; } +#if !defined(__SPU__) void AngleIMatrix(const RadianEuler& angles, matrix3x4_t& matrix) { QAngle quakeEuler(RAD2DEG(angles.y), RAD2DEG(angles.z), RAD2DEG(angles.x)); @@ -1271,8 +1385,9 @@ void AngleIMatrix(const QAngle& angles, const Vector3D& position, matrix3x4_t& m vecTranslation *= -1.0f; MatrixSetColumn(vecTranslation, 3, mat); } +#endif // #if !defined(__SPU__) - +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Bounding box construction methods //----------------------------------------------------------------------------- @@ -1280,8 +1395,8 @@ void AngleIMatrix(const QAngle& angles, const Vector3D& position, matrix3x4_t& m void ClearBounds(Vector3D& mins, Vector3D& maxs) { Assert(s_bMathlibInitialized); - mins[0] = mins[1] = mins[2] = 99999; - maxs[0] = maxs[1] = maxs[2] = -99999; + mins[0] = mins[1] = mins[2] = FLT_MAX; + maxs[0] = maxs[1] = maxs[2] = -FLT_MAX; } void AddPointToBounds(const Vector3D& v, Vector3D& mins, Vector3D& maxs) @@ -1300,6 +1415,32 @@ void AddPointToBounds(const Vector3D& v, Vector3D& mins, Vector3D& maxs) } } +bool AreBoundsValid(const Vector3D& vMin, const Vector3D& vMax) +{ + for (int i = 0; i < 3; ++i) + { + if (vMin[i] > vMax[i]) + { + return false; + } + } + + return true; +} + +bool IsPointInBounds(const Vector3D& vPoint, const Vector3D& vMin, const Vector3D& vMax) +{ + for (int i = 0; i < 3; ++i) + { + if (vPoint[i] < vMin[i] || vPoint[i] > vMax[i]) + { + return false; + } + } + + return true; +} + // solve a x^2 + b x + c = 0 bool SolveQuadratic(float a, 
float b, float c, float& root1, float& root2) { @@ -1423,7 +1564,7 @@ bool SolveInverseReciprocalQuadratic(float x1, float y1, float x2, float y2, flo } -// Rotate a Vector3D around the Z axis (YAW) +// Rotate a vector around the Z axis (YAW) void VectorYawRotate(const Vector3D& in, float flYaw, Vector3D& out) { Assert(s_bMathlibInitialized); @@ -1455,9 +1596,7 @@ float Bias(float x, float biasAmt) { lastExponent = log(biasAmt) * -1.4427f; // (-1.4427 = 1 / log(0.5)) } - float fRet = pow(x, lastExponent); - Assert(!IS_NAN(fRet)); - return fRet; + return pow(x, lastExponent); } @@ -1473,9 +1612,7 @@ float Gain(float x, float biasAmt) float SmoothCurve(float x) { - // Actual smooth curve. Visualization: - // http://www.wolframalpha.com/input/?i=plot%5B+0.5+*+%281+-+cos%5B2+*+pi+*+x%5D%29+for+x+%3D+%280%2C+1%29+%5D - return 0.5f * (1 - cos(2.0f * M_PI * x)); + return (1 - cos(x * M_PI)) * 0.5f; } @@ -1496,118 +1633,7 @@ float SmoothCurve_Tweak(float x, float flPeakPos, float flPeakSharpness) return SmoothCurve(flSharpened); } -void QuaternionExp(const Quaternion& p, Quaternion& q) -{ - float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); - float et = exp(p[3]); - float s = r >= 0.00001f ? et * sin(r) / r : 0.f; - q.Init(s * p[0], s * p[1], s * p[2], et * cos(r)); -} - -void QuaternionLn(const Quaternion& p, Quaternion& q) -{ - float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); - float t = r > 0.00001f ? atan2(r, p[3]) / r : 0.f; - float norm = p[0] * p[0] + p[1] * p[1] + p[2] * p[2] + p[3] * p[3]; - q.Init(t * p[0], t * p[1], t * p[2], 0.5 * log(norm)); -} - -// Average using exponential method -// Qave = exp( 1 / n * log( Q1 ) + ... 
+ 1 / n * log( Qn ) ) where -// if pflWeights passed in 1/n is replaced by normalized weighting -void QuaternionAverageExponential(Quaternion& q, int nCount, const Quaternion* pQuaternions, const float* pflWeights /*=NULL*/) -{ - Assert(nCount >= 1); - Assert(pQuaternions); - - // Nothing to do if only one input quaternions - if (nCount == 1) - { - q = pQuaternions[0]; - return; - } - - float ooWeightSum = 1.0f; - float flWeightSum = 0.0f; - for (int i = 0; i < nCount; ++i) - { - if (pflWeights) - { - flWeightSum += pflWeights[i]; - } - else - { - flWeightSum += 1.0f; - } - } - - if (flWeightSum > 0.0f) - { - ooWeightSum = 1.0f / flWeightSum; - } - - Quaternion sum(0, 0, 0, 0); - // Now sum the ln of the quaternions - for (int i = 0; i < nCount; ++i) - { - float weight = ooWeightSum; - if (pflWeights) - { - weight *= pflWeights[i]; - } - - // Make sure all quaternions are aligned with the - // first to avoid blending the wrong direction. - Quaternion alignedQuat; - QuaternionAlign(pQuaternions[0], pQuaternions[i], alignedQuat); - - Quaternion qLn; - QuaternionLn(alignedQuat, qLn); - for (int j = 0; j < 4; ++j) - { - sum[j] += (qLn[j] * weight); - } - } - - // then exponentiate to get final value - QuaternionExp(sum, q); -} - -// Given a vector and a pseudo-up reference vector, create a quaternion which represents -// the orientation of the forward vector. 
Note, will be unstable if vecForward is close -// to referenceUp -void QuaternionLookAt(const Vector3D& vecForward, const Vector3D& referenceUp, Quaternion& q) -{ - Vector3D forward = vecForward; - forward.NormalizeInPlace(); - float ratio = DotProduct(forward, referenceUp); - Vector3D up = referenceUp - (forward * ratio); - up.NormalizeInPlace(); - - Vector3D right = forward.Cross(up); - right.NormalizeInPlace(); - - const Vector3D& x = right; - const Vector3D& y = forward; - const Vector3D& z = up; - - float tr = x.x + y.y + z.z; - q.Init(y.z - z.y, z.x - x.z, x.y - y.x, tr + 1.0f); - QuaternionNormalize(q); - - /* - Vector z = vecForward; - z.NormalizeInPlace(); - Vector x = referenceUp.Cross( z ); - x.NormalizeInPlace(); - Vector y = z.Cross( x ); - y.NormalizeInPlace(); - - float tr = x.x + y.y + z.z; - q.Init( y.z - z.y , z.x - x.z, x.y - y.x, tr + 1.0f ); - QuaternionNormalize( q ); - */ -} +#endif // !defined(__SPU__) //----------------------------------------------------------------------------- // make sure quaternions are within 180 degrees of one another, if not, reverse q @@ -1764,7 +1790,7 @@ void QuaternionSlerpNoAlign(const Quaternion& p, const Quaternion& q, float t, Q Assert(qt.IsValid()); } - +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Returns the angular delta between the two normalized quaternions in degrees. 
//----------------------------------------------------------------------------- @@ -1836,6 +1862,18 @@ void QuaternionInvert(const Quaternion& p, Quaternion& q) } } +void QuaternionMultiply(const Quaternion& q, const Vector3D& v, Vector3D& result) +{ + Vector3D t, t2; + CrossProduct(q.ImaginaryPart(), v, t); + t *= 2.0f; + VectorMA(v, q.RealPart(), t, result); + CrossProduct(q.ImaginaryPart(), t, t2); + result += t2; +} + +#endif // #if !defined(__SPU__) + //----------------------------------------------------------------------------- // Make sure the quaternion is of unit length //----------------------------------------------------------------------------- @@ -1882,7 +1920,7 @@ void QuaternionScale(const Quaternion& p, float t, Quaternion& q) // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. float sinom = sqrt(DotProduct(&p.x, &p.x)); - sinom = min(sinom, 1.f); + sinom = MIN(sinom, 1.f); float sinsom = sin(asin(sinom) * t); @@ -1965,12 +2003,126 @@ void QuaternionMult(const Quaternion& p, const Quaternion& q, Quaternion& qt) } +#if !defined(__SPU__) + +void QuaternionExp(const Quaternion& p, Quaternion& q) +{ + float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); + float et = exp(p[3]); + float s = r >= 0.00001f ? et * sin(r) / r : 0.f; + q.Init(s * p[0], s * p[1], s * p[2], et * cos(r)); +} + +void QuaternionLn(const Quaternion& p, Quaternion& q) +{ + float r = sqrt(p[0] * p[0] + p[1] * p[1] + p[2] * p[2]); + float t = r > 0.00001f ? atan2(r, p[3]) / r : 0.f; + float norm = p[0] * p[0] + p[1] * p[1] + p[2] * p[2] + p[3] * p[3]; + q.Init(t * p[0], t * p[1], t * p[2], 0.5 * log(norm)); +} + +// Average using exponential method +// Qave = exp( 1 / n * log( Q1 ) + ... 
+ 1 / n * log( Qn ) ) where +// if pflWeights passed in 1/n is replaced by normalized weighting +void QuaternionAverageExponential(Quaternion& q, int nCount, const Quaternion* pQuaternions, const float* pflWeights /*=NULL*/) +{ + Assert(nCount >= 1); + Assert(pQuaternions); + + // Nothing to do if only one input quaternions + if (nCount == 1) + { + q = pQuaternions[0]; + return; + } + + float ooWeightSum = 1.0f; + float flWeightSum = 0.0f; + for (int i = 0; i < nCount; ++i) + { + if (pflWeights) + { + flWeightSum += pflWeights[i]; + } + else + { + flWeightSum += 1.0f; + } + } + + if (flWeightSum > 0.0f) + { + ooWeightSum = 1.0f / flWeightSum; + } + + Quaternion sum(0, 0, 0, 0); + // Now sum the ln of the quaternions + for (int i = 0; i < nCount; ++i) + { + float weight = ooWeightSum; + if (pflWeights) + { + weight *= pflWeights[i]; + } + + // Make sure all quaternions are aligned with the + // first to avoid blending the wrong direction. + Quaternion alignedQuat; + QuaternionAlign(pQuaternions[0], pQuaternions[i], alignedQuat); + + Quaternion qLn; + QuaternionLn(alignedQuat, qLn); + for (int j = 0; j < 4; ++j) + { + sum[j] += (qLn[j] * weight); + } + } + + // then exponentiate to get final value + QuaternionExp(sum, q); +} + +// Given a vector and a pseudo-up reference vector, create a quaternion which represents +// the orientation of the forward vector. 
Note, will be unstable if vecForward is close +// to referenceUp +void QuaternionLookAt(const Vector3D& vecForward, const Vector3D& referenceUp, Quaternion& q) +{ + Vector3D forward = vecForward; + forward.NormalizeInPlace(); + float ratio = DotProduct(forward, referenceUp); + Vector3D up = referenceUp - (forward * ratio); + up.NormalizeInPlace(); + + Vector3D right = forward.Cross(up); + right.NormalizeInPlace(); + + const Vector3D& x = right; + const Vector3D& y = forward; + const Vector3D& z = up; + + float tr = x.x + y.y + z.z; + q.Init(y.z - z.y, z.x - x.z, x.y - y.x, tr + 1.0f); + QuaternionNormalize(q); + + /* + Vector z = vecForward; + z.NormalizeInPlace(); + Vector x = referenceUp.Cross( z ); + x.NormalizeInPlace(); + Vector y = z.Cross( x ); + y.NormalizeInPlace(); + + float tr = x.x + y.y + z.z; + q.Init( y.z - z.y , z.x - x.z, x.y - y.x, tr + 1.0f ); + QuaternionNormalize( q ); + */ +} + +#endif // !defined(__SPU__) + void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, matrix3x4_t& matrix) { - if (!HushAsserts()) - { - Assert(pos.IsValid()); - } + Assert(pos.IsValid()); QuaternionMatrix(q, matrix); @@ -1979,13 +2131,25 @@ void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, matrix3x4_t& mat matrix[2][3] = pos.z; } +void QuaternionMatrix(const Quaternion& q, const Vector3D& pos, const Vector3D& vScale, matrix3x4_t& mat) +{ + Assert(pos.IsValid()); + Assert(q.IsValid()); + Assert(vScale.IsValid()); + + QuaternionMatrix(q, mat); + + mat[0][0] *= vScale.x; mat[1][0] *= vScale.x; mat[2][0] *= vScale.x; + mat[0][1] *= vScale.y; mat[1][1] *= vScale.y; mat[2][1] *= vScale.y; + mat[0][2] *= vScale.z; mat[1][2] *= vScale.z; mat[2][2] *= vScale.z; + mat[0][3] = pos.x; mat[1][3] = pos.y; mat[2][3] = pos.z; +} + + void QuaternionMatrix(const Quaternion& q, matrix3x4_t& matrix) { Assert(s_bMathlibInitialized); - if (!HushAsserts()) - { - Assert(q.IsValid()); - } + Assert(q.IsValid()); #ifdef _VPROF_MATHLIB VPROF_BUDGET("QuaternionMatrix", 
"Mathlib"); @@ -2045,6 +2209,109 @@ void QuaternionMatrix(const Quaternion& q, matrix3x4_t& matrix) } +const Vector3D Quaternion::GetForward()const +{ + Vector3D vAxisX; + vAxisX.x = 1.0 - 2.0 * y * y - 2.0 * z * z; + vAxisX.y = 2.0 * x * y + 2.0 * w * z; + vAxisX.z = 2.0 * x * z - 2.0 * w * y; + return vAxisX; +} + + +const Vector3D Quaternion::GetLeft()const +{ + Vector3D vAxisY; + vAxisY.x = 2.0f * x * y - 2.0f * w * z; + vAxisY.y = 1.0f - 2.0f * x * x - 2.0f * z * z; + vAxisY.z = 2.0f * y * z + 2.0f * w * x; + return vAxisY; +} + + + +const Vector3D Quaternion::GetUp()const +{ + Vector3D vAxisZ; + vAxisZ.x = 2.0f * x * z + 2.0f * w * y; + vAxisZ.y = 2.0f * y * z - 2.0f * w * x; + vAxisZ.z = 1.0f - 2.0f * x * x - 2.0f * y * y; + return vAxisZ; +} + + + +const Quaternion RotateBetween(const Vector3D& v1, const Vector3D& v2) +{ + // Find quaternion that rotates v1 into v2 + Quaternion qOut; + + Vector3D vBisector = 0.5f * (v1 + v2); + if (vBisector.LengthSqr() > 1e-9f) + { + qOut.Init(CrossProduct(v1, vBisector), DotProduct(v1, vBisector)); + } + else + { + // Anti-parallel: Use a perpendicular vector + if (fabsf(v1.x) > 0.5f) + { + qOut.x = v1.y; + qOut.y = -v1.x; + qOut.z = 0.0f; + } + else + { + qOut.x = 0.0f; + qOut.y = v1.z; + qOut.z = -v1.y; + } + + qOut.w = 0.0f; + } + + // The algorithm is simplified and made more accurate by normalizing at the end + QuaternionNormalize(qOut); + + Assert((VectorTransform(v1, QuaternionMatrix(qOut)) - v2).Length() < 2e-3f); + + return qOut; +} + + +void UnitTestQuatExpLog() +{ + for (int i = 0; i < 300000; ++i) + { + Quaternion q = RandomQuaternion(); + Vector3D l = QuaternionLog(q); + Quaternion q2 = Exp(l); + Assert(QuaternionLength(q - q2) < 0.0001f); + } +} + + +void UnitTestRotateBetween() +{ + RandomSeed(1); + float flMaxError = 0; + int nMaxError; + for (int i = 0; i < 3000000; ++i) + { + Vector3D u = RandomVectorOnUnitSphere(), v = RandomVectorOnUnitSphere(); + Quaternion q = RotateBetween(u, v); + + float flError = 
(VectorTransform(u, QuaternionMatrix(q)) - v).Length(); + if (flMaxError < flError) + { + flMaxError = flError; + nMaxError = i; + } + } + Assert(flMaxError < 0.001f); +} + + //----------------------------------------------------------------------------- // Purpose: Converts a quaternion into engine angles // Input : *quaternion - q3 + q0.i + q1.j + q2.k @@ -2082,6 +2349,97 @@ void QuaternionAngles(const Quaternion& q, QAngle& angles) Assert(angles.IsValid()); } + +float QuaternionionGetYaw(const Quaternion& q) +{ + // FIXME: doing it this way calculates too much data, need to do an optimized version... + QAngle angles; + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + MatrixAngles(matrix, angles); + return angles[YAW]; +} + +float QuaternionionGetPitch(const Quaternion& q) +{ + // FIXME: doing it this way calculates too much data, need to do an optimized version... + QAngle angles; + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + MatrixAngles(matrix, angles); + return angles[PITCH]; +} + +float QuaternionionGetRoll(const Quaternion& q) +{ + // FIXME: doing it this way calculates too much data, need to do an optimized version... 
+ QAngle angles; + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + MatrixAngles(matrix, angles); + return angles[ROLL]; +} + + +//----------------------------------------------------------------------------- +// Purpose: Converts a quaternion into FLU vectors +// Input : *quaternion - q3 + q0.i + q1.j + q2.k +// basis vectors, each vector is optional +//----------------------------------------------------------------------------- +void QuaternionVectorsFLU(Quaternion const& q, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + Assert(s_bMathlibInitialized); + Assert(q.IsValid()); + +#ifdef _VPROF_MATHLIB + // @TODO: VPROF_BUDGET( "QuaternionVectorsFLU", "Mathlib" ); +#endif + + // Note: it's pretty much identical to just computing the quaternion matrix and assigning its columns to the vectors + * pForward = q.GetForward(); + *pLeft = q.GetLeft(); + *pUp = q.GetUp(); +#ifdef DBGFLAG_ASSERT + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + Vector3D forward, left, up; + MatrixVectorsFLU(matrix, &forward, &left, &up); + Assert((forward - *pForward).Length() + (left - *pLeft).Length() + (up - *pUp).Length() < 1e-4f); +#endif +} + +void QuaternionVectorsForward(const Quaternion& q, Vector3D* pForward) +{ + Assert(s_bMathlibInitialized); + Assert(q.IsValid()); + +#ifdef _VPROF_MATHLIB + // @TODO: VPROF_BUDGET( "QuaternionVectorsForward", "Mathlib" ); +#endif + + * pForward = q.GetForward(); +#ifdef DBGFLAG_ASSERT + matrix3x4_t matrix; + QuaternionMatrix(q, matrix); + Assert((MatrixGetColumn(matrix, FORWARD_AXIS) - *pForward).Length() < 1e-4f); +#endif +} + + +void UnitTestVectorFLU() +{ + for (int i = 0; i < 100000; ++i) + { + Quaternion q = RandomQuaternion(); + Vector3D forward, left, up; + QuaternionVectorsForward(q, &forward); + QuaternionVectorsFLU(q, &forward, &left, &up); + } +} + + + +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Converts a quaternion to an axis / angle in degrees 
// (exponential map) @@ -2113,7 +2471,7 @@ void AxisAngleQuaternion(const Vector3D& axis, float angle, Quaternion& q) q.z = axis.z * sa; q.w = ca; } - +#endif // #if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Converts radian-euler axis aligned angles to a quaternion @@ -2158,6 +2516,72 @@ void AngleQuaternion(const RadianEuler& angles, Quaternion& outQuat) outQuat.w = crXcp * cy + srXsp * sy; // W (real component) } +#ifdef _X360 +//----------------------------------------------------------------------------- +// Purpose: Converts radian-euler axis aligned angles to a quaternion, returning +// it on a vector register. +// Input : *vAngles - Right-handed Euler angles in radians (roll pitch yaw) +// +// Algorithm based on that found in the XDK (which really uses RPY order, as +// opposed to this which takes the parameters in RPY order but catenates them +// in PYR order). +//----------------------------------------------------------------------------- +fltx4 AngleQuaternionSIMD(FLTX4 vAngles) +{ + Assert(s_bMathlibInitialized); + // Assert( angles.IsValid() ); + +#ifdef _VPROF_MATHLIB + VPROF_BUDGET("AngleQuaternion", "Mathlib"); +#endif + + // we compute the sin and cos of half all the angles. + // in the comments I'll call these components + // sr = sin(r/2), cp = cos(p/2), sy = sin(y/2), etc. 
+ + fltx4 OneHalf = __vspltisw(1); + OneHalf = __vcfsx(OneHalf, 1); + + fltx4 HalfAngles = MulSIMD(vAngles, OneHalf); + fltx4 sine, cosine; + SinCos3SIMD(sine, cosine, HalfAngles); + + fltx4 SignMask = __vspltisw(-1); + fltx4 Zero = __vspltisw(0); + SignMask = __vslw(SignMask, SignMask); // shift left so 1 is only in the sign bit + SignMask = __vrlimi(SignMask, Zero, 0x5, 0); // { -1, 0, -1, 0 } + + fltx4 Rc, Pc, Yc, Rs, Ps, Ys, retsum, retval; + + Rc = __vspltw(cosine, 0); // cr cr cr cr + Pc = __vspltw(cosine, 1); // cp cp cp cp + Yc = __vspltw(cosine, 2); // cy cy cy cy + Rs = __vspltw(sine, 0); // sr sr sr sr + Ps = __vspltw(sine, 1); // sp sp sp sp + Ys = __vspltw(sine, 2); // sy sy sy sy + + Rc = __vrlimi(Rc, sine, 0x8, 0); // sr cr cr cr + Rs = __vrlimi(Rs, cosine, 0x8, 0); // cr sr sr sr + Pc = __vrlimi(Pc, sine, 0x4, 0); // cp sp cp cp + Ps = __vrlimi(Ps, cosine, 0x4, 0); // sp cp sp sp + Yc = __vrlimi(Yc, sine, 0x2, 0); // cy cy sy cy + Ys = __vrlimi(Ys, cosine, 0x2, 0); // sy sy cy sy + + retsum = __vxor(Rs, SignMask); // -cr sr -sr sr + retval = __vmulfp(Pc, Yc); // cp*cy sp*cy cp*sy cp*cy + retsum = __vmulfp(retsum, Ys); // -cr*sy sr*sy -sr*cy sr*sy + retval = __vmulfp(retval, Rc); // cp*cy*sr sp*cy*cr cp*sy*cr cp*cy*cr + retval = __vmaddfp(retsum, Ps, retval); // cp*cy*sr + -cr*sy*sp ... 
+ + return retval; +} + +inline fltx4 AngleQuaternionSIMD(const RadianEuler& angles) +{ + return AngleQuaternionSIMD(LoadUnaligned3SIMD(angles.Base())); +} +#endif + //----------------------------------------------------------------------------- // Purpose: Converts engine-format euler angles to a quaternion @@ -2202,7 +2626,7 @@ void AngleQuaternion(const QAngle& angles, Quaternion& outQuat) outQuat.w = crXcp * cy + srXsp * sy; // W (real component) } - +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: Converts a basis to a quaternion //----------------------------------------------------------------------------- @@ -2288,7 +2712,116 @@ void MatrixQuaternion(const matrix3x4_t& mat, Quaternion& q) MatrixAngles(mat, angles); AngleQuaternion(angles, q); } +#endif // #if !defined(__SPU__) +void MatrixQuaternionFast(const matrix3x4_t& mat, Quaternion& q) +{ + float t; + if (mat[2][2] < 0) + { + if (mat[0][0] > mat[1][1]) + { + t = 1 + mat[0][0] - mat[1][1] - mat[2][2]; + q.Init(t, mat[0][1] + mat[1][0], mat[2][0] + mat[0][2], mat[2][1] - mat[1][2]); + } + else + { + t = 1 - mat[0][0] + mat[1][1] - mat[2][2]; + q.Init(mat[0][1] + mat[1][0], t, mat[1][2] + mat[2][1], mat[0][2] - mat[2][0]); + } + } + else + { + if (mat[0][0] < -mat[1][1]) + { + t = 1 - mat[0][0] - mat[1][1] + mat[2][2]; + q.Init(mat[2][0] + mat[0][2], mat[1][2] + mat[2][1], t, mat[1][0] - mat[0][1]); + } + else + { + t = 1 + mat[0][0] + mat[1][1] + mat[2][2]; + q.Init(mat[2][1] - mat[1][2], mat[0][2] - mat[2][0], mat[1][0] - mat[0][1], t); + } + } + q = q * (0.5f / sqrtf(t)); +} + + +float MatrixQuaternionTest(uint nCount) +{ + float flMaxError = 0, flSumError = 0; + for (uint i = 0; i < nCount; ++i) + { + Quaternion q = RandomQuaternion(), r; + Assert(fabsf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w - 1) < 1e-5f); + matrix3x4_t mat; + QuaternionMatrix(q, mat); + MatrixQuaternion(mat, r); + if (QuaternionDotProduct(q, r) < 0) + { + r = 
-r; + } + float flError = Sqr(q.x - r.x) + Sqr(q.y - r.y) + Sqr(q.z - r.z) + Sqr(q.w - r.w); + flSumError += flError; + if (flError > flMaxError) + { + flMaxError = flError; + } + } + NOTE_UNUSED(flMaxError); NOTE_UNUSED(flSumError); + return flSumError / nCount; +} + +float MatrixQuaternionFastTest(uint nCount) +{ + float flMaxError = 0, flSumError = 0; + for (uint i = 0; i < nCount; ++i) + { + Quaternion q = RandomQuaternion(), r; + Assert(fabsf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w - 1) < 1e-5f); + matrix3x4_t mat; + QuaternionMatrix(q, mat); + MatrixQuaternionFast(mat, r); + if (QuaternionDotProduct(q, r) < 0) + { + r = -r; + } + float flError = Sqr(q.x - r.x) + Sqr(q.y - r.y) + Sqr(q.z - r.z) + Sqr(q.w - r.w); + flSumError += flError; + if (flError > flMaxError) + { + flMaxError = flError; + } + } + NOTE_UNUSED(flMaxError); NOTE_UNUSED(flSumError); + return flSumError / nCount; +} + +// the same as MatrixQuaternionTest, but uses inline helper functions that return matrix and quaternion instead of using return-by-reference versions +// on MSVC10, this generates the same code as MatrixQuaternionTest, but it's easier to read, write and maintain code +float MatrixQuaternionTest2(uint nCount) +{ + float flMaxError = 0, flSumError = 0; + for (uint i = 0; i < nCount; ++i) + { + Quaternion q = RandomQuaternion(), r; + Assert(fabsf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w - 1) < 1e-5f); + matrix3x4_t mat = QuaternionMatrix(q); + r = MatrixQuaternion(mat); + if (QuaternionDotProduct(q, r) < 0) + { + r = -r; + } + float flError = Sqr(q.x - r.x) + Sqr(q.y - r.y) + Sqr(q.z - r.z) + Sqr(q.w - r.w); + flSumError += flError; + if (flError > flMaxError) + { + flMaxError = flError; + } + } + NOTE_UNUSED(flMaxError); NOTE_UNUSED(flSumError); + return flSumError / nCount; +} //----------------------------------------------------------------------------- // Purpose: Converts a quaternion into engine angles @@ -2308,6 +2841,7 @@ void QuaternionAngles(const Quaternion& 
q, RadianEuler& angles) Assert(angles.IsValid()); } +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: A helper function to normalize p2.x->p1.x and p3.x->p4.x to // be the same length as p2.x->p3.x @@ -2342,7 +2876,9 @@ void Spline_Normalize( } } } +#endif // #if !defined(__SPU__) +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: // Input : @@ -2546,6 +3082,7 @@ void Catmull_Rom_Spline_NormalizeX( Catmull_Rom_Spline(p1n, p2, p3, p4n, t, output); } +#endif // !defined(__SPU__) //----------------------------------------------------------------------------- // Purpose: basic hermite spline. t = 0 returns p1, t = 1 returns p2, @@ -2626,8 +3163,10 @@ void Hermite_SplineBasis(float t, float basis[4]) // Input : //----------------------------------------------------------------------------- -// BUG: the Vector3DSubtract()'s calls go away if the global optimizer is enabled +// BUG: the VectorSubtract()'s calls go away if the global optimizer is enabled +#if !defined(__SPU__) #pragma optimize( "g", off ) +#endif void Hermite_Spline(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, float t, Vector3D& output) { @@ -2637,7 +3176,9 @@ void Hermite_Spline(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, Hermite_Spline(p1, p2, e10, e21, t, output); } +#if !defined(__SPU__) #pragma optimize( "", on ) +#endif float Hermite_Spline(float p0, float p1, float p2, float t) { @@ -2662,6 +3203,8 @@ void Hermite_Spline(const Quaternion& q0, const Quaternion& q1, const Quaternion QuaternionNormalize(output); } + +#if !defined(__SPU__) // See http://en.wikipedia.org/wiki/Kochanek-Bartels_curves // // Tension: -1 = Round -> 1 = Tight @@ -2961,6 +3504,33 @@ void Parabolic_Spline_NormalizeX( Parabolic_Spline(p1n, p2, p3, p4n, t, output); } +//----------------------------------------------------------------------------- +// Cubic Bernstein basis 
functions +// http://mathworld.wolfram.com/BernsteinPolynomial.html +// +// Purpose: Evaluate the cubic Bernstein basis for the input parametric coordinate. +// Output is the coefficient for that basis polynomial. +//----------------------------------------------------------------------------- +float CubicBasis0(float t) +{ + float invT = 1.0f - t; + return invT * invT * invT; +} +float CubicBasis1(float t) +{ + float invT = 1.0f - t; + return 3.0f * t * invT * invT; +} +float CubicBasis2(float t) +{ + float invT = 1.0f - t; + return 3.0f * t * t * invT; +} +float CubicBasis3(float t) +{ + return t * t * t; +} + //----------------------------------------------------------------------------- // Purpose: Compress the input values for a ranged result such that from 75% to 200% smoothly of the range maps //----------------------------------------------------------------------------- @@ -3033,6 +3603,8 @@ void TransformAABB(const matrix3x4_t& transform, const Vector3D& vecMinsIn, cons VectorSubtract(worldCenter, worldExtents, vecMinsOut); VectorAdd(worldCenter, worldExtents, vecMaxsOut); + // sanity chec + Assert(vecMinsOut.LengthSqr() + vecMaxsOut.LengthSqr() < 1e+12); } @@ -3246,7 +3818,7 @@ void CalcClosestPointOnLineSegment(const Vector3D& P, const Vector3D& vLineA, co { Vector3D vDir; float t = CalcClosestPointToLineT(P, vLineA, vLineB, vDir); - t = clamp(t, 0.f, 1.f); + t = clamp(static_cast(t), 0, 1); if (outT) { *outT = t; @@ -3318,7 +3890,7 @@ void CalcClosestPointOnLineSegment2D(const Vector2D& P, const Vector2D& vLineA, { Vector2D vDir; float t = CalcClosestPointToLineT2D(P, vLineA, vLineB, vDir); - t = clamp(t, 0.f, 1.f); + t = clamp(static_cast(t), 0, 1); if (outT) { *outT = t; @@ -3393,12 +3965,15 @@ bool CalcLineToLineIntersectionSegment( *t1 = numer / denom; *t2 = (d1343 + d4321 * (*t1)) / d4343; - s1->x = p1.x + *t1 * p21.x; - s1->y = p1.y + *t1 * p21.y; - s1->z = p1.z + *t1 * p21.z; - s2->x = p3.x + *t2 * p43.x; - s2->y = p3.y + *t2 * p43.y; - s2->z = 
p3.z + *t2 * p43.z; + if (s1 != NULL && s2 != NULL) + { + s1->x = p1.x + *t1 * p21.x; + s1->y = p1.y + *t1 * p21.y; + s1->z = p1.z + *t1 * p21.z; + s2->x = p3.x + *t2 * p43.x; + s2->y = p3.y + *t2 * p43.y; + s2->z = p3.z + *t2 * p43.z; + } return true; } @@ -3411,132 +3986,67 @@ bool CalcLineToLineIntersectionSegment( #pragma optimize( "", on ) -static bool s_b3DNowEnabled = false; -static bool s_bMMXEnabled = false; -static bool s_bSSEEnabled = false; -static bool s_bSSE2Enabled = false; + +#ifndef NDEBUG +volatile static char const* pDebugString; +#endif void MathLib_Init(float gamma, float texGamma, float brightness, int overbright, bool bAllow3DNow, bool bAllowSSE, bool bAllowSSE2, bool bAllowMMX) { if (s_bMathlibInitialized) return; +#ifdef _WIN32 + Assert(_rotl(0xC7654321, 1) == 0x8ECA8643); + Assert(_rotl64(0xC7654321ABCDEF00ull, 1) == 0x8ECA8643579BDE01ull); +#endif +#ifndef NDEBUG + pDebugString = "mathlib.lib built debug!"; +#endif - // FIXME: Hook SSE into Vector3DAligned + Vector3D4DAligned + // FIXME: Hook SSE into VectorAligned + Vector4DAligned -#if !defined( _X360 ) +#if !defined( _GAMECONSOLE ) // Grab the processor information: const CPUInformation& pi = GetCPUInformation(); - // Select the default generic routines. 
- pfSqrt = _sqrtf; - pfRSqrt = _rsqrtf; - pfRSqrtFast = _rsqrtf; - pfVectorNormalize = _VectorNormalize; - pfVectorNormalizeFast = _VectorNormalizeFast; - pfInvRSquared = _InvRSquared; - pfFastSinCos = SinCos; - pfFastCos = cosf; + if (!(pi.m_bSSE && pi.m_bSSE2)) + { + Assert(0); + if (MessageBoxA(NULL, "SSE and SSE2 are required.", "Unsupported CPU", MB_ICONERROR | MB_OK)) + { + TerminateProcess(GetCurrentProcess(), 0xBAD0C0DE); + } + } +#endif //!360 - if (bAllowMMX && pi.m_bMMX) - { - // Select the MMX specific routines if available - // (MMX routines were used by SW span fillers - not currently used for HW) - s_bMMXEnabled = true; - } - else - { - s_bMMXEnabled = false; - } - - // SSE Generally performs better than 3DNow when present, so this is placed - // first to allow SSE to override these settings. -#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX) - if (bAllow3DNow && pi.m_b3DNow) - { - s_b3DNowEnabled = true; - - // Select the 3DNow specific routines if available; - pfVector3DNormalize = _3DNow_Vector3DNormalize; - pfVector3DNormalizeFast = _3DNow_Vector3DNormalizeFast; - pfInvRSquared = _3DNow_InvRSquared; - pfSqrt = _3DNow_Sqrt; - pfRSqrt = _3DNow_RSqrt; - pfRSqrtFast = _3DNow_RSqrt; - } - else -#endif - { - s_b3DNowEnabled = false; - } - - if (bAllowSSE && pi.m_bSSE) - { - s_bSSEEnabled = true; - -#ifndef PLATFORM_WINDOWS_PC64 - // These are not yet available. 
- // Select the SSE specific routines if available - pfVector3DNormalize = _Vector3DNormalize; - pfVector3DNormalizeFast = _SSE_Vector3DNormalizeFast; - pfInvRSquared = _SSE_InvRSquared; - pfSqrt = _SSE_Sqrt; - pfRSqrt = _SSE_RSqrtAccurate; - pfRSqrtFast = _SSE_RSqrtFast; -#endif -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE_SinCos; - pfFastCos = _SSE_cos; -#endif - } - else - { - s_bSSEEnabled = false; - } - - if (bAllowSSE2 && pi.m_bSSE2) - { - s_bSSE2Enabled = true; -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE2_SinCos; - pfFastCos = _SSE2_cos; -#endif - } - else - { - s_bSSE2Enabled = false; - } -#endif // !_X360 s_bMathlibInitialized = true; InitSinCosTable(); BuildGammaTable(gamma, texGamma, brightness, overbright); + SeedRandSIMD(0x31415926); } -bool MathLib_3DNowEnabled(void) -{ - Assert(s_bMathlibInitialized); - return s_b3DNowEnabled; -} bool MathLib_MMXEnabled(void) { Assert(s_bMathlibInitialized); - return s_bMMXEnabled; + return true; } bool MathLib_SSEEnabled(void) { Assert(s_bMathlibInitialized); - return s_bSSEEnabled; + return true; } bool MathLib_SSE2Enabled(void) { Assert(s_bMathlibInitialized); - return s_bSSE2Enabled; + return true; } + // BUGBUG: Why doesn't this call angle diff?!?!? 
float ApproachAngle(float target, float value, float speed) { @@ -3662,6 +4172,34 @@ void RotationDelta(const QAngle& srcAngles, const QAngle& destAngles, QAngle* ou } } +void ClipLineSegmentToPlane(const Vector3D& vNormal, const Vector3D& vPlanePoint, Vector3D* p1, Vector3D* p2, float flBias) +{ + float flDot1, flDot2; + flDot1 = (*p1 - vPlanePoint).Dot(vNormal) + flBias; + flDot2 = (*p2 - vPlanePoint).Dot(vNormal) + flBias; + + if (flDot1 >= 0 && flDot2 >= 0) + { + return; + } + + if (flDot1 >= 0) + { + Vector3D vRay = *p2 - *p1; + *p2 = *p1 + vRay * flDot1 / (flDot1 - flDot2); + } + else if (flDot2 >= 0) + { + Vector3D vRay = *p1 - *p2; + *p1 = *p2 + vRay * flDot2 / (flDot2 - flDot1); + } + else + { + *p1 = vec3_invalid; + *p2 = vec3_invalid; + } +} + //----------------------------------------------------------------------------- // Purpose: Computes a triangle normal //----------------------------------------------------------------------------- @@ -3675,14 +4213,49 @@ void ComputeTrianglePlane(const Vector3D& v1, const Vector3D& v2, const Vector3D intercept = DotProduct(normal, v1); } +//----------------------------------------------------------------------------- +// Purpose: Calculate the volume of a tetrahedron with these vertices +// Input : p0 - points of tetrahedron +// p1 - +// p2 - +// p3 - +// Output : float (volume in units^3) +//----------------------------------------------------------------------------- +float TetrahedronVolume(const Vector3D& p0, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3) +{ + Vector3D a, b, c, cross; + float volume = 1.0f / 6.0f; + + a = p1 - p0; + b = p2 - p0; + c = p3 - p0; + cross = CrossProduct(b, c); + + volume *= DotProduct(a, cross); + if (volume < 0) + return -volume; + return volume; +} + + +// computes the area of a triangle given three verts +float TriangleArea(const Vector3D& v0, const Vector3D& v1, const Vector3D& v2) +{ + Vector3D vecEdge0, vecEdge1, vecCross; + VectorSubtract(v1, v0, vecEdge0); + 
VectorSubtract(v2, v0, vecEdge1); + CrossProduct(vecEdge0, vecEdge1, vecCross); + return (VectorLength(vecCross) * 0.5f); +} + //----------------------------------------------------------------------------- // Purpose: This is a clone of BaseWindingForPlane() -// Input : *outVerts - an array of preallocated verts to build the polygon in +// Input : *pOutVerts - an array of preallocated verts to build the polygon in // normal - the plane normal // dist - the plane constant // Output : int - vert count (always 4) //----------------------------------------------------------------------------- -int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float fHalfScale) +int PolyFromPlane(Vector3D* pOutVerts, const Vector3D& normal, float dist, float fHalfScale) { int i, x; vec_t max, v; @@ -3705,7 +4278,7 @@ int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float if (x == -1) return 0; - // Build a unit Vector3D along something other than the major axis + // Build a unit vector along something other than the major axis VectorCopy(vec3_origin, vup); switch (x) { @@ -3718,7 +4291,7 @@ int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float break; } - // Remove the component of this Vector3D along the normal + // Remove the component of this vector along the normal v = DotProduct(vup, normal); VectorMA(vup, -v, normal, vup); // Make it a unit (perpendicular) @@ -3726,30 +4299,80 @@ int PolyFromPlane(Vector3D* outVerts, const Vector3D& normal, float dist, float // Center of the poly is at normal * dist VectorScale(normal, dist, org); - // Calculate the third orthonormal basis Vector3D for our plane space (this one and vup are in the plane) + // Calculate the third orthonormal basis vector for our plane space (this one and vup are in the plane) CrossProduct(vup, normal, vright); - // Make the plane's basis Vector3Ds big (these are the half-sides of the polygon we're making) + // Make the plane's basis vectors 
big (these are the half-sides of the polygon we're making) VectorScale(vup, fHalfScale, vup); VectorScale(vright, fHalfScale, vright); // Move diagonally away from org to create the corner verts - VectorSubtract(org, vright, outVerts[0]); // left - VectorAdd(outVerts[0], vup, outVerts[0]); // up + VectorSubtract(org, vright, pOutVerts[0]); // left + VectorAdd(pOutVerts[0], vup, pOutVerts[0]); // up - VectorAdd(org, vright, outVerts[1]); // right - VectorAdd(outVerts[1], vup, outVerts[1]); // up + VectorAdd(org, vright, pOutVerts[1]); // right + VectorAdd(pOutVerts[1], vup, pOutVerts[1]); // up - VectorAdd(org, vright, outVerts[2]); // right - VectorSubtract(outVerts[2], vup, outVerts[2]); // down + VectorAdd(org, vright, pOutVerts[2]); // right + VectorSubtract(pOutVerts[2], vup, pOutVerts[2]); // down - VectorSubtract(org, vright, outVerts[3]); // left - VectorSubtract(outVerts[3], vup, outVerts[3]); // down + VectorSubtract(org, vright, pOutVerts[3]); // left + VectorSubtract(pOutVerts[3], vup, pOutVerts[3]); // down // The four corners form a planar quadrilateral normal to "normal" return 4; } +// Returns void as it was impossible for the function to returns anything other than 4. +// Any absolute of a floating value will always return a number greater than -16384. That test seemed bogus. +void PolyFromPlane_SIMD(fltx4* pOutVerts, const fltx4& plane, float fHalfScale) +{ + // So we need to find the biggest component of all three, + // And depending of the value, we need to build a unit vector along something that is not the major axis. + + fltx4 f4Abs = AbsSIMD(plane); + fltx4 x = SplatXSIMD(f4Abs); + fltx4 y = SplatYSIMD(f4Abs); + fltx4 z = SplatZSIMD(f4Abs); + fltx4 max = MaxSIMD(x, y); + max = MaxSIMD(max, z); + + // Simplify the code, if Z is the biggest component, we will use 1 0 0. + // If X or Y are the biggest, we will use 0 0 1. 
+ bi32x4 fIsMax = CmpEqSIMD(max, f4Abs); // isMax will be set for the components that are the max + fltx4 fIsZMax = SplatZSIMD((fltx4)fIsMax); // 0 if Z is not the max, 0xffffffff is Z is the max + // And depending if Z is max or not, we are going to select one unit vector or the other + fltx4 vup = MaskedAssign((bi32x4)fIsZMax, g_SIMD_Identity[0], g_SIMD_Identity[2]); + + fltx4 normal = SetWToZeroSIMD(plane); + fltx4 dist = SplatWSIMD(plane); + + // Remove the component of this vector along the normal + fltx4 v = Dot3SIMD(vup, normal); + vup = MaddSIMD(-v, normal, vup); + // Make it a unit (perpendicular) + vup = Normalized3SIMD(vup); + + // Center of the poly is at normal * dist + fltx4 org = MulSIMD(dist, normal); + // Calculate the third orthonormal basis vector for our plane space (this one and vup are in the plane) + fltx4 vright = CrossProductSIMD(vup, normal); + + // Make the plane's basis vectors big (these are the half-sides of the polygon we're making) + fltx4 f4HalfScale = ReplicateX4(fHalfScale); + vup = MulSIMD(f4HalfScale, vup); + vright = MulSIMD(f4HalfScale, vright); + + // Move diagonally away from org to create the corner verts + fltx4 vleft = SubSIMD(org, vright); + vright = AddSIMD(org, vright); + + pOutVerts[0] = AddSIMD(vleft, vup); // left + up + pOutVerts[1] = AddSIMD(vright, vup); // right + up + pOutVerts[2] = SubSIMD(vright, vup); // right + down + pOutVerts[3] = SubSIMD(vleft, vup); // left + down +} + //----------------------------------------------------------------------------- // Purpose: clip a poly to the plane and return the poly on the front side of the plane // Input : *inVerts - input polygon @@ -3849,6 +4472,119 @@ int ClipPolyToPlane(Vector3D* inVerts, int vertCount, Vector3D* outVerts, const return outCount; } +int ClipPolyToPlane_SIMD(fltx4* pInVerts, int nVertCount, fltx4* pOutVerts, const fltx4& plane, float fOnPlaneEpsilon) +{ + vec_t* dists = (vec_t*)stackalloc(sizeof(vec_t) * nVertCount * 4); //4* nVertCount should 
cover all cases + uint8* sides = (uint8*)stackalloc(sizeof(uint8) * nVertCount * 4); + int i; + + /* + * It seems something could be done here... Especially in relation with the code below i, i + 1, etc... + fltx4 f4OnPlaneEpsilonP = ReplicateX4( fOnPlaneEpsilon ); + fltx4 f4OnPlaneEpsilonM = -f4OnPlaneEpsilonP; + Also we could store the full fltx4 instead of a single float. It would avoid doing a SubFloat() here, + and a ReplicateX4() later. Trading off potential LHS against L2 cache misses? + */ + // determine sides for each point + int nAllSides = 0; + fltx4 f4Dist = SplatWSIMD(plane); + for (i = 0; i < nVertCount; i++) + { + // dot = DotProduct( pInVerts[i], normal) - dist; + fltx4 dot = Dot3SIMD(pInVerts[i], plane); + dot = SubSIMD(dot, f4Dist); + float fDot = SubFloat(dot, 0); + dists[i] = fDot; + // Look how to update sides with a branch-less version + int nSide = OR_SIDE_ON; + if (fDot > fOnPlaneEpsilon) + { + nSide = OR_SIDE_FRONT; + } + else if (fDot < -fOnPlaneEpsilon) + { + nSide = OR_SIDE_BACK; + } + sides[i] = nSide; + nAllSides |= nSide; + } + sides[i] = sides[0]; + dists[i] = dists[0]; + + // Shortcuts (either completely clipped or not clipped at all) + if ((nAllSides & OR_SIDE_FRONT) == 0) + { + return 0; // Completely clipped + } + + if ((nAllSides & OR_SIDE_BACK) == 0) + { + // Not clipped at all, copy to output verts + Assert(i == nVertCount); + int nIndex = 0; + while (i >= 4) + { + pOutVerts[nIndex] = pInVerts[nIndex]; + pOutVerts[nIndex + 1] = pInVerts[nIndex + 1]; + pOutVerts[nIndex + 2] = pInVerts[nIndex + 2]; + pOutVerts[nIndex + 3] = pInVerts[nIndex + 3]; + nIndex += 4; + i -= 4; + } + while (i > 0) + { + pOutVerts[nIndex] = pInVerts[nIndex]; + ++nIndex; + --i; + } + return nVertCount; + } + + fltx4 f4one = Four_Ones; + fltx4 f4MOne = -f4one; + + fltx4 f4OneMask = (fltx4)CmpEqSIMD(plane, f4one); + fltx4 f4mOneMask = (fltx4)CmpEqSIMD(plane, f4MOne); + fltx4 f4AllMask = OrSIMD(f4OneMask, f4mOneMask); // 0xffffffff where normal was 1 or -1, 
0 otherwise + f4OneMask = AndSIMD(f4OneMask, f4Dist); // Dist where normal.* was 1 + f4mOneMask = AndSIMD(f4mOneMask, -f4Dist); // -Dist where normal.* was -1 + fltx4 f4AllValue = OrSIMD(f4OneMask, f4mOneMask); // Dist and -Dist where normal.* was 1 and -1 + // f4AllMask and f4AllValue will be used together (to override the default calculation). + + int nOutCount = 0; + for (i = 0; i < nVertCount; i++) + { + const fltx4& p1 = pInVerts[i]; + + if (sides[i] == OR_SIDE_ON) + { + pOutVerts[nOutCount++] = p1; + continue; + } + + if (sides[i] == OR_SIDE_FRONT) + { + pOutVerts[nOutCount++] = p1; + } + + if (sides[i + 1] == OR_SIDE_ON || sides[i + 1] == sides[i]) + continue; + + // generate a split point + fltx4& p2 = pInVerts[(i + 1) % nVertCount]; + + float fDot = dists[i] / (dists[i] - dists[i + 1]); + fltx4 f4Dot = ReplicateX4(fDot); + + // mid[j] = v1[j] + dot*(v2[j]-v1[j]); - For j=0...2 + fltx4 f4Result = MaddSIMD(f4Dot, SubSIMD(p2, p1), p1); + // If normal.* is 1, it should be dist, if -1, it should be -dist, otherwise it should be mid[j] = v1[j] + dot*(v2[j]-v1[j]); + fltx4 mid = MaskedAssign((bi32x4)f4AllMask, f4AllValue, f4Result); + pOutVerts[nOutCount++] = mid; + } + + return nOutCount; +} int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, const double* normal, double dist, double fOnPlaneEpsilon) { @@ -3857,7 +4593,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co int counts[3]; double dot; int i, j; - //Vector3D mid = vec3_origin; + //Vector mid = vec3_origin; double mid[3]; mid[0] = 0.0; mid[1] = 0.0; @@ -3898,7 +4634,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co //for ( i = 0; i < vertCount; i++ ) for (i = 0; i < vertCount * 3; i++) { - //Vector3DCopy( inVerts[i], outVerts[i] ); + //VectorCopy( inVerts[i], outVerts[i] ); outVerts[i] = inVerts[i]; } return vertCount; @@ -3907,7 +4643,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* 
outVerts, co outCount = 0; for (i = 0; i < vertCount; i++) { - //Vector3D& p1 = inVerts[i]; + //Vector& p1 = inVerts[i]; double* p1 = &inVerts[i * 3]; //p1[0] = inVerts[i*3 + 0]; //p1[1] = inVerts[i*3 + 1]; @@ -3915,7 +4651,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co if (sides[i] == SIDE_ON) { - //Vector3DCopy( p1, outVerts[outCount]); + //VectorCopy( p1, outVerts[outCount]); outVerts[outCount * 3 + 0] = p1[0]; outVerts[outCount * 3 + 1] = p1[1]; outVerts[outCount * 3 + 2] = p1[2]; @@ -3925,7 +4661,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co if (sides[i] == SIDE_FRONT) { - //Vector3DCopy( p1, outVerts[outCount]); + //VectorCopy( p1, outVerts[outCount]); outVerts[outCount * 3 + 0] = p1[0]; outVerts[outCount * 3 + 1] = p1[1]; outVerts[outCount * 3 + 2] = p1[2]; @@ -3936,7 +4672,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co continue; // generate a split point - //Vector3D& p2 = inVerts[(i+1)%vertCount]; + //Vector& p2 = inVerts[(i+1)%vertCount]; int wrappedindex = (i + 1) % vertCount; double* p2 = &inVerts[wrappedindex * 3]; //p2[0] = inVerts[wrappedindex*3 + 0]; @@ -3949,7 +4685,7 @@ int ClipPolyToPlane_Precise(double* inVerts, int vertCount, double* outVerts, co mid[j] = (double)p1[j] + dot * ((double)p2[j] - (double)p1[j]); } - //Vector3DCopy (mid, outVerts[outCount]); + //VectorCopy (mid, outVerts[outCount]); outVerts[outCount * 3 + 0] = mid[0]; outVerts[outCount * 3 + 1] = mid[1]; outVerts[outCount * 3 + 2] = mid[2]; @@ -4009,6 +4745,9 @@ float CalcFovX(float flFovY, float flAspect) return RAD2DEG(atan(tan(DEG2RAD(flFovY) * 0.5f) * flAspect)) * 2.0f; } +#endif // !defined(__SPU__) + +#if !defined(__SPU__) //----------------------------------------------------------------------------- // Generate a frustum based on perspective view parameters //----------------------------------------------------------------------------- @@ -4071,36 +4810,533 
@@ void GenerateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const pPlanesOut[FRUSTUM_TOP].Init(-up, -flTop - flIntercept); } +//----------------------------------------------------------------------------- +// Version that accepts angles instead of vectors +//----------------------------------------------------------------------------- +void GeneratePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t& frustum) +{ + VPlane planes[FRUSTUM_NUMPLANES]; + Vector3D vecForward, vecRight, vecUp; + AngleVectors(angles, &vecForward, &vecRight, &vecUp); + float flFovY = CalcFovY(flFovX, flAspectRatio); + GeneratePerspectiveFrustum(origin, vecForward, vecRight, vecUp, flZNear, flZFar, flFovX, flFovY, planes); + frustum.SetPlanes(planes); +} + +void fourplanes_t::ComputeSignbits() +{ + xSign = CmpLtSIMD(nX, Four_Zeros); + ySign = CmpLtSIMD(nY, Four_Zeros); + zSign = CmpLtSIMD(nZ, Four_Zeros); + nXAbs = fabs(nX); + nYAbs = fabs(nY); + nZAbs = fabs(nZ); +} + +void fourplanes_t::GetPlane(int index, Vector3D* pNormalOut, float* pDistOut) const +{ + pNormalOut->x = SubFloat(nX, index); + pNormalOut->y = SubFloat(nY, index); + pNormalOut->z = SubFloat(nZ, index); + *pDistOut = SubFloat(dist, index); +} +void fourplanes_t::SetPlane(int index, const Vector3D& vecNormal, float planeDist) +{ + SubFloat(nX, index) = vecNormal.x; + SubFloat(nY, index) = vecNormal.y; + SubFloat(nZ, index) = vecNormal.z; + SubFloat(dist, index) = planeDist; + ComputeSignbits(); +} + +void fourplanes_t::Set4Planes(const VPlane* pPlanes) +{ + nX = LoadUnalignedSIMD(&pPlanes[0].m_Normal.x); + nY = LoadUnalignedSIMD(&pPlanes[1].m_Normal.x); + nZ = LoadUnalignedSIMD(&pPlanes[2].m_Normal.x); + dist = LoadUnalignedSIMD(&pPlanes[3].m_Normal.x); + TransposeSIMD(nX, nY, nZ, dist); + ComputeSignbits(); +} + +void fourplanes_t::Set2Planes(const VPlane* pPlanes) +{ + nX = LoadUnalignedSIMD(&pPlanes[0].m_Normal.x); + 
nY = LoadUnalignedSIMD(&pPlanes[1].m_Normal.x); + nZ = Four_Zeros; + dist = Four_Zeros; + TransposeSIMD(nX, nY, nZ, dist); + ComputeSignbits(); +} + +void fourplanes_t::Get4Planes(VPlane* pPlanesOut) const +{ + fltx4 p0 = nX; + fltx4 p1 = nY; + fltx4 p2 = nZ; + fltx4 p3 = dist; + TransposeSIMD(p0, p1, p2, p3); + StoreUnalignedSIMD(&pPlanesOut[0].m_Normal.x, p0); + StoreUnalignedSIMD(&pPlanesOut[1].m_Normal.x, p1); + StoreUnalignedSIMD(&pPlanesOut[2].m_Normal.x, p2); + StoreUnalignedSIMD(&pPlanesOut[3].m_Normal.x, p3); +} + +void fourplanes_t::Get2Planes(VPlane* pPlanesOut) const +{ + fltx4 p0 = nX; + fltx4 p1 = nY; + fltx4 p2 = nZ; + fltx4 p3 = dist; + TransposeSIMD(p0, p1, p2, p3); + StoreUnalignedSIMD(&pPlanesOut[0].m_Normal.x, p0); + StoreUnalignedSIMD(&pPlanesOut[1].m_Normal.x, p1); +} + + +Frustum_t::Frustum_t() +{ + memset(this, 0, sizeof(*this)); +} + +void Frustum_t::SetPlane(int i, const Vector3D& vecNormal, float dist) +{ + if (i < 4) + { + planes[0].SetPlane(i, vecNormal, dist); + } + else + { + planes[1].SetPlane(i - 4, vecNormal, dist); + } +} + +void Frustum_t::GetPlane(int i, Vector3D* pNormalOut, float* pDistOut) const +{ + if (i < 4) + { + planes[0].GetPlane(i, pNormalOut, pDistOut); + } + else + { + planes[1].GetPlane(i - 4, pNormalOut, pDistOut); + } +} + +void Frustum_t::SetPlanes(const VPlane* pPlanes) +{ + planes[0].Set4Planes(pPlanes); + planes[1].Set2Planes(pPlanes + 4); +} + +void Frustum_t::GetPlanes(VPlane* pPlanesOut) const +{ + planes[0].Get4Planes(pPlanesOut); + planes[1].Get2Planes(pPlanesOut + 4); +} + + +bool Frustum_t::CullBox(const Vector3D& mins, const Vector3D& maxs) const +{ + fltx4 mins4 = LoadUnalignedSIMD(&mins.x); + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxs4 = LoadUnalignedSIMD(&maxs.x); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest 
corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + +bool Frustum_t::CullBox(const fltx4& mins4, const fltx4& maxs4) const +{ + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? 
mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + +bool Frustum_t::CullBoxCenterExtents(const Vector3D& center, const Vector3D& extents) const +{ + fltx4 center4 = LoadUnalignedSIMD(¢er.x); + fltx4 centerx = SplatXSIMD(center4); + fltx4 centery = SplatYSIMD(center4); + fltx4 centerz = SplatZSIMD(center4); + fltx4 extents4 = LoadUnalignedSIMD(&extents.x); + fltx4 extx = SplatXSIMD(extents4); + fltx4 exty = SplatYSIMD(extents4); + fltx4 extz = SplatZSIMD(extents4); + + // compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + + +bool Frustum_t::CullBoxCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const +{ + fltx4 centerx = SplatXSIMD(fl4Center); + fltx4 centery = SplatYSIMD(fl4Center); + fltx4 centerz = SplatZSIMD(fl4Center); + fltx4 extx = SplatXSIMD(fl4Extents); + fltx4 exty = SplatYSIMD(fl4Extents); + fltx4 extz = SplatZSIMD(fl4Extents); + + // 
compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane + if (IsVector4LessThan(dotBack, planes[i].dist)) + return true; + } + return false; +} + +// Return true if this bounding volume is contained in the frustum, false if it is not +// TODO SIMDIFY +bool Frustum_t::Contains(const Vector3D& mins, const Vector3D& maxs) const +{ + // Get box corners + Vector3D vCorners[8]; + vCorners[0] = mins; + vCorners[1] = Vector3D(mins.x, mins.y, maxs.z); + vCorners[2] = Vector3D(mins.x, maxs.y, mins.z); + vCorners[3] = Vector3D(mins.x, maxs.y, maxs.z); + + vCorners[4] = Vector3D(maxs.x, mins.y, mins.z); + vCorners[5] = Vector3D(maxs.x, mins.y, maxs.z); + vCorners[6] = Vector3D(maxs.x, maxs.y, mins.z); + vCorners[7] = maxs; + + + // if we are in with all points, then we are fully in + for (int j = 0; j < FRUSTUM_NUMPLANES; ++j) + { + for (int i = 0; i < 8; ++i) + { + // compute the dot product of the normal and the corner + Vector3D vNormal; + float dist; + GetPlane(i, &vNormal, &dist); + if (DotProduct(vCorners[j], vNormal) <= 0) + { + return false; + } + } + } + + return true; // all pts were inside +} + +// Brute force SAT frustum intersection between two frustums +bool Frustum_t::Intersects(Frustum_t& otherFrustum) const +{ + Vector3D pPointsA[8]; + bool bResult = false; + bResult = GetCorners(pPointsA); + Assert(bResult); + VPlane pPlanesA[FRUSTUM_NUMPLANES]; + GetPlanes(pPlanesA); + + Vector3D pPointsB[8]; + bResult = otherFrustum.GetCorners(pPointsB); + Assert(bResult); + VPlane 
pPlanesB[FRUSTUM_NUMPLANES]; + otherFrustum.GetPlanes(pPlanesB); + + // See if all points in B are on one side of any plane in A + for (int p = 0; p < 6; ++p) + { + bool bPointsOnOutside = true; + for (int i = 0; i < 8; ++i) + { + float flDist = pPlanesA[p].DistTo(pPointsB[i]); + + // If dist is pos, we are not on the outside + if (flDist > 0) + { + bPointsOnOutside = false; + break; + } + } + + // We never hit a negative case, we have a separating axis + if (bPointsOnOutside) + { + return false; + } + } + + // See if all points in A are on one side of any plane in B + for (int p = 0; p < 6; ++p) + { + bool bPointsOnOutside = true; + for (int i = 0; i < 8; ++i) + { + float flDist = pPlanesB[p].DistTo(pPointsA[i]); + + // If dist is pos, we are not on the outside + if (flDist > 0) + { + bPointsOnOutside = false; + break; + } + } + + // We never hit a negative case, we have a separating axis + if (bPointsOnOutside) + { + return false; + } + } + + // They intersect + return true; +} + +// Return true if this bounding volume intersects the frustum, false if it is outside +bool Frustum_t::Intersects(const Vector3D& mins, const Vector3D& maxs) const +{ + fltx4 mins4 = LoadUnalignedSIMD(&mins.x); + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxs4 = LoadUnalignedSIMD(&maxs.x); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? 
mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector3GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + +bool Frustum_t::Intersects(const fltx4& mins4, const fltx4& maxs4) const +{ + fltx4 minx = SplatXSIMD(mins4); + fltx4 miny = SplatYSIMD(mins4); + fltx4 minz = SplatZSIMD(mins4); + fltx4 maxx = SplatXSIMD(maxs4); + fltx4 maxy = SplatYSIMD(maxs4); + fltx4 maxz = SplatZSIMD(maxs4); + + // compute the dot product of the normal and the farthest corner + // dotBack0 = DotProduct( normal, normals.x < 0 ? 
mins.x : maxs.x ); + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = MulSIMD(planes[i].nX, MaskedAssign(planes[i].xSign, minx, maxx)); + fltx4 yTotalBack = MulSIMD(planes[i].nY, MaskedAssign(planes[i].ySign, miny, maxy)); + fltx4 zTotalBack = MulSIMD(planes[i].nZ, MaskedAssign(planes[i].zSign, minz, maxz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector4GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + +bool Frustum_t::IntersectsCenterExtents(const Vector3D& center, const Vector3D& extents) const +{ + fltx4 center4 = LoadUnalignedSIMD(¢er.x); + fltx4 centerx = SplatXSIMD(center4); + fltx4 centery = SplatYSIMD(center4); + fltx4 centerz = SplatZSIMD(center4); + fltx4 extents4 = LoadUnalignedSIMD(&extents.x); + fltx4 extx = SplatXSIMD(extents4); + fltx4 exty = SplatYSIMD(extents4); + fltx4 extz = SplatZSIMD(extents4); + + // compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector4GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) 
+ return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + + +bool Frustum_t::IntersectsCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const +{ + fltx4 centerx = SplatXSIMD(fl4Center); + fltx4 centery = SplatYSIMD(fl4Center); + fltx4 centerz = SplatZSIMD(fl4Center); + fltx4 extx = SplatXSIMD(fl4Extents); + fltx4 exty = SplatYSIMD(fl4Extents); + fltx4 extz = SplatZSIMD(fl4Extents); + + // compute the dot product of the normal and the farthest corner + for (int i = 0; i < 2; i++) + { + fltx4 xTotalBack = AddSIMD(MulSIMD(planes[i].nX, centerx), MulSIMD(planes[i].nXAbs, extx)); + fltx4 yTotalBack = AddSIMD(MulSIMD(planes[i].nY, centery), MulSIMD(planes[i].nYAbs, exty)); + fltx4 zTotalBack = AddSIMD(MulSIMD(planes[i].nZ, centerz), MulSIMD(planes[i].nZAbs, extz)); + fltx4 dotBack = AddSIMD(xTotalBack, AddSIMD(yTotalBack, zTotalBack)); + // if plane of the farthest corner is behind the plane, then the box is completely outside this plane +#if _X360 + if (!XMVector3GreaterOrEqual(dotBack, planes[i].dist)) + return false; +#elif defined( _PS3 ) + bi32x4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#else + fltx4 isOut = CmpLtSIMD(dotBack, planes[i].dist); + if (IsAnyNegative(isOut)) + return false; +#endif + } + return true; +} + +//----------------------------------------------------------------------------- +// Generate a frustum based on orthographic parameters +//----------------------------------------------------------------------------- +void GenerateOrthoFrustumFLU(const Vector3D& origin, const Vector3D& forward, const Vector3D& vLeft, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar, VPlane* pPlanesOut) +{ + // YUP_ACTIVE: FIXME : This is actually producing incorrect planes (see the VectorMA below) + Vector3D vRight = vLeft; + vRight *= -1.0f; + + float 
flIntercept = DotProduct(origin, forward); + + pPlanesOut[FRUSTUM_NEARZ].Init(forward, flZNear + flIntercept); + pPlanesOut[FRUSTUM_FARZ].Init(-forward, -flZFar - flIntercept); + + flIntercept = DotProduct(origin, vRight); + + pPlanesOut[FRUSTUM_RIGHT].Init(-vRight, -flRight - flIntercept); + pPlanesOut[FRUSTUM_LEFT].Init(vRight, flLeft + flIntercept); + + flIntercept = DotProduct(origin, up); + + pPlanesOut[FRUSTUM_BOTTOM].Init(up, flBottom + flIntercept); + pPlanesOut[FRUSTUM_TOP].Init(-up, -flTop - flIntercept); +} + //----------------------------------------------------------------------------- // Generate a frustum based on perspective view parameters //----------------------------------------------------------------------------- -void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, - const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, - float flFovX, float flFovY, Frustum_t& frustum) +void GeneratePerspectiveFrustumFLU(const Vector3D& origin, const Vector3D& forward, + const Vector3D& vLeft, const Vector3D& up, float flZNear, float flZFar, + float flFovX, float flAspect, VPlane* pPlanesOut) { + // YUP_ACTIVE: FIXME : This is actually producing incorrect planes (see the VectorMA below) + Vector3D vRight = vLeft; + vRight *= -1.0f; + float flIntercept = DotProduct(origin, forward); // Setup the near and far planes. 
- frustum.SetPlane(FRUSTUM_FARZ, PLANE_ANYZ, -forward, -flZFar - flIntercept); - frustum.SetPlane(FRUSTUM_NEARZ, PLANE_ANYZ, forward, flZNear + flIntercept); + pPlanesOut[FRUSTUM_FARZ].Init(-forward, -flZFar - flIntercept); + pPlanesOut[FRUSTUM_NEARZ].Init(forward, flZNear + flIntercept); flFovX *= 0.5f; - flFovY *= 0.5f; float flTanX = tan(DEG2RAD(flFovX)); - float flTanY = tan(DEG2RAD(flFovY)); + float flTanY = flTanX / flAspect; // OPTIMIZE: Normalizing these planes is not necessary for culling Vector3D normalPos, normalNeg; - VectorMA(right, flTanX, forward, normalPos); - VectorMA(normalPos, -2.0f, right, normalNeg); + // NOTE: This should be using left and not right to produce correct planes, not changing it quite yet + // because I'm not able to test whether fixing this breaks anything. + VectorMA(vRight, flTanX, forward, normalPos); + VectorMA(normalPos, -2.0f, vRight, normalNeg); VectorNormalize(normalPos); VectorNormalize(normalNeg); - frustum.SetPlane(FRUSTUM_LEFT, PLANE_ANYZ, normalPos, normalPos.Dot(origin)); - frustum.SetPlane(FRUSTUM_RIGHT, PLANE_ANYZ, normalNeg, normalNeg.Dot(origin)); + pPlanesOut[FRUSTUM_LEFT].Init(normalPos, normalPos.Dot(origin)); + pPlanesOut[FRUSTUM_RIGHT].Init(normalNeg, normalNeg.Dot(origin)); VectorMA(up, flTanY, forward, normalPos); VectorMA(normalPos, -2.0f, up, normalNeg); @@ -4108,44 +5344,109 @@ void GeneratePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, VectorNormalize(normalPos); VectorNormalize(normalNeg); - frustum.SetPlane(FRUSTUM_BOTTOM, PLANE_ANYZ, normalPos, normalPos.Dot(origin)); - frustum.SetPlane(FRUSTUM_TOP, PLANE_ANYZ, normalNeg, normalNeg.Dot(origin)); + pPlanesOut[FRUSTUM_BOTTOM].Init(normalPos, normalPos.Dot(origin)); + pPlanesOut[FRUSTUM_TOP].Init(normalNeg, normalNeg.Dot(origin)); } - -//----------------------------------------------------------------------------- -// Version that accepts angles instead of Vector3Ds 
-//----------------------------------------------------------------------------- -void GeneratePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t& frustum) +// Generate a frustum based on perspective view parameters +void Frustum_t::CreatePerspectiveFrustumFLU(const Vector3D& vOrigin, const Vector3D& vForward, + const Vector3D& vLeft, const Vector3D& vUp, float flZNear, float flZFar, + float flFovX, float flAspect) { - Vector3D vecForward, vecRight, vecUp; - AngleVectors(angles, &vecForward, &vecRight, &vecUp); - float flFovY = CalcFovY(flFovX, flAspectRatio); - GeneratePerspectiveFrustum(origin, vecForward, vecRight, vecUp, flZNear, flZFar, flFovX, flFovY, frustum); + VPlane planes[FRUSTUM_NUMPLANES]; + GeneratePerspectiveFrustumFLU(vOrigin, vForward, vLeft, vUp, flZNear, flZFar, flFovX, flAspect, planes); + SetPlanes(planes); } -bool R_CullBox(const Vector3D& mins, const Vector3D& maxs, const Frustum_t& frustum) +//#ifndef YUP_ACTIVE +void Frustum_t::CreatePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, + const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, + float flFovX, float flAspect) { - return ((BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_LEFT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_TOP)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_NEARZ)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_FARZ)) == 2)); + Vector3D vLeft = right; + vLeft *= -1.0f; + CreatePerspectiveFrustumFLU(origin, forward, vLeft, up, flZNear, flZFar, flFovX, flAspect); } +//#endif -bool R_CullBoxSkipNear(const Vector3D& mins, const Vector3D& maxs, const Frustum_t& frustum) +// Version that accepts angles instead of vectors +void 
Frustum_t::CreatePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, float flZFar, float flFovX, float flAspectRatio) { - return ((BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_LEFT)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_TOP)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM)) == 2) || - (BoxOnPlaneSide(mins, maxs, frustum.GetPlane(FRUSTUM_FARZ)) == 2)); + VPlane planes[FRUSTUM_NUMPLANES]; + Vector3D vecForward, vecLeft, vecUp; + AngleVectorsFLU(angles, &vecForward, &vecLeft, &vecUp); + GeneratePerspectiveFrustumFLU(origin, vecForward, vecLeft, vecUp, flZNear, flZFar, flFovX, flAspectRatio, planes); + SetPlanes(planes); } +// Generate a frustum based on orthographic parameters +void Frustum_t::CreateOrthoFrustumFLU(const Vector3D& origin, const Vector3D& forward, const Vector3D& vLeft, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar) +{ + VPlane planes[FRUSTUM_NUMPLANES]; + GenerateOrthoFrustumFLU(origin, forward, vLeft, up, flLeft, flRight, flBottom, flTop, flZNear, flZFar, planes); + SetPlanes(planes); +} + +//#ifndef YUP_ACTIVE +void Frustum_t::CreateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar) +{ + Vector3D vLeft = right; + vLeft *= -1.0f; + CreateOrthoFrustumFLU(origin, forward, vLeft, up, flLeft, flRight, flBottom, flTop, flZNear, flZFar); +} + +// The points returned correspond to the corners of the frustum faces +// Points 0 to 3 correspond to the near face +// Points 4 to 7 correspond to the far face +// Returns points in a face in this order: +// 2--3 +// | | +// 0--1 +bool Frustum_t::GetCorners(Vector3D* pPoints) const +{ + VPlane planes[FRUSTUM_NUMPLANES]; + GetPlanes(planes); + + // Near face + // 
Bottom Left + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_BOTTOM], pPoints[0])) + return false; + + // Bottom right + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_BOTTOM], pPoints[1])) + return false; + + // Upper Left + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_TOP], pPoints[2])) + return false; + + // Upper right + if (!PlaneIntersection(planes[FRUSTUM_NEARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_TOP], pPoints[3])) + return false; + + // Far face + // Bottom Left + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_BOTTOM], pPoints[4])) + return false; + + // Bottom right + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_BOTTOM], pPoints[5])) + return false; + + // Upper Left + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_LEFT], planes[FRUSTUM_TOP], pPoints[6])) + return false; + + // Upper right + if (!PlaneIntersection(planes[FRUSTUM_FARZ], planes[FRUSTUM_RIGHT], planes[FRUSTUM_TOP], pPoints[7])) + return false; + + + return true; +} // NOTE: This routine was taken (and modified) from NVidia's BlinnReflection demo -// Creates basis Vector3Ds, based on a vertex and index list. +// Creates basis vectors, based on a vertex and index list. 
// See the NVidia white paper 'GDC2K PerPixel Lighting' for a description // of how this computation works #define SMALL_FLOAT 1e-12 @@ -4203,10 +5504,10 @@ void CalcTriangleTangentSpace(const Vector3D& p0, const Vector3D& p1, const Vect //----------------------------------------------------------------------------- void RGBtoHSV(const Vector3D& rgb, Vector3D& hsv) { - float flMax = max(rgb.x, rgb.y); - flMax = max(flMax, rgb.z); - float flMin = min(rgb.x, rgb.y); - flMin = min(flMin, rgb.z); + float flMax = MAX(rgb.x, rgb.y); + flMax = MAX(flMax, rgb.z); + float flMin = MIN(rgb.x, rgb.y); + flMin = MIN(flMin, rgb.z); // hsv.z is the value hsv.z = flMax; @@ -4267,7 +5568,7 @@ void HSVtoRGB(const Vector3D& hsv, Vector3D& rgb) hue = 0.0F; } hue /= 60.0F; - int i = hue; // integer part + int i = Float2Int(hue); // integer part float32 f = hue - i; // fractional part float32 p = hsv.z * (1.0F - hsv.y); float32 q = hsv.z * (1.0F - hsv.y * f); @@ -4354,7 +5655,37 @@ void GetInterpolationData(float const* pKnotPositions, return; } -float RandomVector3DInUnitSphere(Vector3D* pVector3D) + +static Vector3D RandomVectorOnUnitSphere(float u, float v) +{ + float flPhi = acos(1 - 2 * u); + float flTheta = 2 * M_PI * v; + + float flSinPhi, flCosPhi; + float flSinTheta, flCosTheta; + SinCos(flPhi, &flSinPhi, &flCosPhi); + SinCos(flTheta, &flSinTheta, &flCosTheta); + + return Vector3D(flSinPhi * flCosTheta, flSinPhi * flSinTheta, flCosPhi); +} + + +Vector3D RandomVectorOnUnitSphere() +{ + // Guarantee uniform random distribution on a sphere + // Graphics gems III contains this algorithm ("Nonuniform random point sets via warping") + float u = RandomFloat(0., 1.); + float v = RandomFloat(0., 1.); + return RandomVectorOnUnitSphere(u, v); +} + + +Vector3D RandomVectorOnUnitSphere(IUniformRandomStream* pRnd) +{ + return RandomVectorOnUnitSphere(pRnd->RandomFloat(), pRnd->RandomFloat()); +} + +float RandomVectorInUnitSphere(Vector3D* pVector) { // Guarantee uniform random distribution 
within a sphere // Graphics gems III contains this algorithm ("Nonuniform random point sets via warping") @@ -4371,13 +5702,34 @@ float RandomVector3DInUnitSphere(Vector3D* pVector3D) SinCos(flPhi, &flSinPhi, &flCosPhi); SinCos(flTheta, &flSinTheta, &flCosTheta); - pVector3D->x = flRadius * flSinPhi * flCosTheta; - pVector3D->y = flRadius * flSinPhi * flSinTheta; - pVector3D->z = flRadius * flCosPhi; + pVector->x = flRadius * flSinPhi * flCosTheta; + pVector->y = flRadius * flSinPhi * flSinTheta; + pVector->z = flRadius * flCosPhi; return flRadius; } -float RandomVector3DInUnitCircle(Vector2D* pVector3D) + +Vector3D RandomVectorInUnitSphere() +{ + Vector3D vOut; + RandomVectorInUnitSphere(&vOut); + return vOut; +} + +Vector3D RandomVectorInUnitSphere(IUniformRandomStream* pRnd) +{ + float w = pRnd->RandomFloat(); + float flRadius = powf(w, 1.0f / 3.0f); + + Vector3D v = RandomVectorOnUnitSphere(pRnd) * flRadius; + + return v; +} + + + + +float RandomVectorInUnitCircle(Vector2D* pVector) { // Guarantee uniform random distribution within a sphere // Graphics gems III contains this algorithm ("Nonuniform random point sets via warping") @@ -4390,68 +5742,96 @@ float RandomVector3DInUnitCircle(Vector2D* pVector3D) float flSinTheta, flCosTheta; SinCos(flTheta, &flSinTheta, &flCosTheta); - pVector3D->x = flRadius * flCosTheta; - pVector3D->y = flRadius * flSinTheta; + pVector->x = flRadius * flCosTheta; + pVector->y = flRadius * flSinTheta; return flRadius; } -#ifdef FP_EXCEPTIONS_ENABLED -#include // For _clearfp and _controlfp_s -#endif -// FPExceptionDisable and FPExceptionEnabler taken from my blog post -// at http://www.altdevblogaday.com/2012/04/20/exceptional-floating-point/ -#ifdef FP_EXCEPTIONS_ENABLED -// These functions are all inlined NOPs if FP_EXCEPTIONS_ENABLED is not defined. -FPExceptionDisabler::FPExceptionDisabler() +const Quaternion RandomQuaternion() { - // Retrieve the current state of the exception flags. This - // must be done before changing them. 
_MCW_EM is a bit - // mask representing all available exception masks. - _controlfp_s(&mOldValues, 0, 0); - // Set all of the exception flags, which suppresses FP - // exceptions on the x87 and SSE units. - _controlfp_s(0, _MCW_EM, _MCW_EM); + // Guarantee uniform distribution within S^3. Found on the internet, looked through the proof very briefly, looks sound enough to tentatively trust it before testing or checking the proof for real. + // http://mathproofs.blogspot.com/2005/05/uniformly-distributed-random-unit.html + float u = RandomFloat(0, 2 * M_PI), flSinU = sinf(u); + float v = acosf(RandomFloat(-1, 1)), flSinV = sinf(v); + float w = 0.5f * (RandomFloat(0, M_PI) + acosf(RandomFloat(0, 1)) + M_PI / 2), flSinW = sinf(w); + return Quaternion(cosf(u), flSinU * cosf(v), flSinU * flSinV * cosf(w), flSinU * flSinV * flSinW); } -FPExceptionDisabler::~FPExceptionDisabler() +const Quaternion RandomQuaternion(IUniformRandomStream* pRnd) { - // Clear any pending FP exceptions. This must be done - // prior to enabling FP exceptions since otherwise there - // may be a 'deferred crash' as soon the exceptions are - // enabled. - _clearfp(); - - // Reset (possibly enabling) the exception status. - _controlfp_s(0, mOldValues, _MCW_EM); + // Guarantee uniform distribution within S^3. Found on the internet, looked through the proof very briefly, looks sound enough to tentatively trust it before testing or checking the proof for real. + // http://mathproofs.blogspot.com/2005/05/uniformly-distributed-random-unit.html + float u = pRnd->RandomFloat(0, 2 * M_PI), flSinU = sinf(u); + float v = acosf(pRnd->RandomFloat(-1, 1)), flSinV = sinf(v); + float w = 0.5f * (pRnd->RandomFloat(0, M_PI) + acosf(pRnd->RandomFloat(0, 1)) + M_PI / 2), flSinW = sinf(w); + return Quaternion(cosf(u), flSinU * cosf(v), flSinU * flSinV * cosf(w), flSinU * flSinV * flSinW); } -// Overflow, divide-by-zero, and invalid-operation are the FP -// exceptions most frequently associated with bugs. 
-FPExceptionEnabler::FPExceptionEnabler(unsigned int enableBits /*= _EM_OVERFLOW | _EM_ZERODIVIDE | _EM_INVALID*/) +// Originally from hammer_mathlib.cpp +// +// Generate the corner points of a box: +// +y _+z +// ^ /| +// | / +// | 3---7 +// /| /| +// / | / | +// 2---6 | +// | 1|--5 +// | / | / +// |/ |/ +// 0---4 --> +x +// +void PointsFromBox(const Vector3D& mins, const Vector3D& maxs, Vector3D* points) { - // Retrieve the current state of the exception flags. This - // must be done before changing them. _MCW_EM is a bit - // mask representing all available exception masks. - _controlfp_s(&mOldValues, 0, 0); + points[0][0] = mins[0]; + points[0][1] = mins[1]; + points[0][2] = mins[2]; - // Make sure no non-exception flags have been specified, - // to avoid accidental changing of rounding modes, etc. - enableBits &= _MCW_EM; + points[1][0] = mins[0]; + points[1][1] = mins[1]; + points[1][2] = maxs[2]; - // Clear any pending FP exceptions. This must be done - // prior to enabling FP exceptions since otherwise there - // may be a 'deferred crash' as soon the exceptions are - // enabled. - _clearfp(); + points[2][0] = mins[0]; + points[2][1] = maxs[1]; + points[2][2] = mins[2]; - // Zero out the specified bits, leaving other bits alone. - _controlfp_s(0, ~enableBits, enableBits); + points[3][0] = mins[0]; + points[3][1] = maxs[1]; + points[3][2] = maxs[2]; + + points[4][0] = maxs[0]; + points[4][1] = mins[1]; + points[4][2] = mins[2]; + + points[5][0] = maxs[0]; + points[5][1] = mins[1]; + points[5][2] = maxs[2]; + + points[6][0] = maxs[0]; + points[6][1] = maxs[1]; + points[6][2] = mins[2]; + + points[7][0] = maxs[0]; + points[7][1] = maxs[1]; + points[7][2] = maxs[2]; } -FPExceptionEnabler::~FPExceptionEnabler() +void BuildTransformedBox(Vector3D* v2, Vector3D const& bbmin, Vector3D const& bbmax, const matrix3x4_t& m) { - // Reset the exception state. 
- _controlfp_s(0, mOldValues, _MCW_EM); + Vector3D v[8]; + PointsFromBox(bbmin, bbmax, v); + + VectorTransform(v[0], m, v2[0]); + VectorTransform(v[1], m, v2[1]); + VectorTransform(v[2], m, v2[2]); + VectorTransform(v[3], m, v2[3]); + VectorTransform(v[4], m, v2[4]); + VectorTransform(v[5], m, v2[5]); + VectorTransform(v[6], m, v2[6]); + VectorTransform(v[7], m, v2[7]); } -#endif + + +#endif // !defined(__SPU__) diff --git a/r5dev/mathlib/noisedata.h b/r5dev/mathlib/noisedata.h index d93b10c1..e1bb0ba6 100644 --- a/r5dev/mathlib/noisedata.h +++ b/r5dev/mathlib/noisedata.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//====== Copyright � 1996-2006, Valve Corporation, All rights reserved. =======// // // Purpose: static data for noise() primitives. // @@ -178,3 +178,132 @@ static float impulse_zcoords[] = { 0.796078,0.615686,0.878431,0.921569,0.631373,0.200000,0.403922,0.462745 }; +static float s_randomGradients[] = { + -0.460087, -0.887463, -0.058594 ,-0.458151, 0.861646, -0.430176 , + -0.930437, 0.316048, -0.195496 ,-0.883558, -0.393287, -0.276550 , + 0.171025, -0.983455, -0.329712 ,-0.033573, -0.941867, -0.994995 , + -0.476492, 0.014764, 0.879150 ,0.834786, -0.454571, 0.348755 ,-0.585801, + -0.782531, -0.338745 ,0.973990, -0.023774, 0.225403 ,-0.989659, + -0.011313, -0.143005 ,0.507109, -0.838016, -0.369141 ,-0.609995, + -0.766277, 0.314087 ,0.429987, 0.599850, -0.843323 ,0.089587, + -0.904071, -0.977783 ,-0.306997, -0.901432, 0.705078 ,0.031606, + 0.994782, -0.950806 ,0.797663, -0.161508, -0.588806 ,0.811569, + -0.505360, 0.339783 ,0.936130, -0.114223, 0.334778 ,0.217280, + -0.970264, 0.440674 ,0.600976, -0.712375, -0.516418 ,0.197935, + 0.979260, 0.213501 ,0.002956, 0.999995, -0.268127 ,-0.912763, 0.084651, + -0.401062 ,-0.193271, -0.945607, -0.804382 ,0.662480, 0.640156, + -0.506348 ,0.363459, -0.884439, 0.627197 ,-0.433415, 0.685363, + 0.803589 ,-0.721652, 0.416952, -0.607971 ,0.647676, 0.296700, + 0.734863 
,0.723040, -0.444294, 0.590454 ,-0.716318, -0.420435, + -0.613770 ,-0.039076, -0.996459, 0.885437 ,0.175225, -0.969092, + 0.703918 ,0.116952, -0.991832, -0.399048 ,-0.504674, -0.013997, + 0.863281 ,-0.436364, -0.817916, 0.651733 ,0.098030, -0.995090, + 0.137573 ,0.637157, -0.766031, -0.132263 ,-0.594718, 0.583153, + -0.681213 ,-0.625632, 0.419913, -0.724426 ,-0.607341, -0.394521, + 0.750427 ,-0.312161, 0.698925, 0.899719 ,0.101228, -0.927363, + -0.962708 ,-0.934241, 0.041214, -0.354553 ,-0.826005, -0.284775, + -0.507446 ,-0.363751, -0.929287, -0.173584 ,-0.141266, 0.983869, + -0.613525 ,-0.436139, -0.074329, 0.899292 ,-0.875355, -0.480839, + 0.057556 ,0.250714, 0.071270, 0.967896 ,0.182131, 0.811467, 0.950195 , + -0.687696, -0.668570, -0.380554 ,0.785175, -0.540171, -0.359863 , + 0.399774, 0.848526, 0.655151 ,-0.412243, -0.004602, 0.911072 ,-0.132187, + -0.990485, 0.278198 ,0.212421, 0.764179, 0.944214 ,-0.694878, 0.234042, + -0.699402 ,0.404273, 0.904644, -0.316406 ,0.358393, 0.087135, + 0.933044 ,-0.473398, 0.820774, -0.559692 ,0.044667, -0.997938, + 0.718201 ,0.603896, -0.046386, 0.796570 ,-0.968822, 0.180966, + 0.172058 ,-0.458206, 0.886932, -0.126221 ,-0.656709, -0.410319, + 0.693848 ,0.999495, -0.018023, 0.026184 ,-0.486069, -0.740178, + -0.690979 ,0.942399, -0.333819, 0.022461 ,-0.294545, 0.867619, + 0.805664 ,0.886791, -0.416081, -0.221252 ,-0.797187, 0.587661, + -0.171021 ,-0.617708, -0.762817, -0.295654 ,0.449351, -0.853660, + -0.505615 ,0.065153, -0.995535, 0.723572 ,0.996518, 0.000000, + 0.083374 ,0.263346, 0.088663, -0.964417 ,-0.221316, -0.970864, + 0.383423 ,-0.512560, 0.718804, 0.675598 ,0.588859, 0.406293, + -0.764648 ,-0.803841, -0.592769, -0.061646 ,0.860199, 0.492898, + -0.150330 ,-0.351871, 0.858024, 0.728455 ,0.515724, -0.815149, + 0.455322 ,-0.122322, -0.960484, 0.898254 ,-0.529020, 0.844443, + -0.156799 ,0.530671, -0.725304, 0.637024 ,-0.748915, -0.248928, + -0.634094 ,-0.188099, 0.584087, 0.972778 ,0.974165, 0.222094, + -0.041992 ,0.595326, 
-0.701663, -0.549438 ,-0.060279, -0.998047, + -0.262451 ,-0.191682, -0.782292, -0.951477 ,0.528851, -0.596315, + 0.752319 ,0.612134, 0.639567, -0.604919 ,0.882803, 0.200541, 0.433594 , + -0.936278, -0.039490, 0.349304 ,0.940848, -0.121649, 0.318604 , + -0.115022, 0.048685, -0.993347 ,-0.324162, -0.935726, -0.394226 , + -0.937457, -0.294685, 0.193909 ,0.894463, -0.437237, 0.104065 , + -0.861852, -0.165102, -0.486206 ,-0.980480, -0.139899, 0.139526 , + -0.024496, 0.960750, -0.996094 ,-0.699760, 0.714256, -0.018860 , + 0.538575, -0.792107, 0.470581 ,0.309926, -0.943720, 0.349182 ,0.525671, + -0.772280, 0.561523 ,-0.793079, 0.268745, 0.567505 ,0.697504, + -0.421131, 0.639221 ,-0.737871, 0.672553, -0.076660 ,-0.390769, + -0.894942, -0.482666 ,-0.593469, 0.191892, 0.796448 ,0.439379, + -0.896646, 0.123108 ,0.337698, -0.703709, -0.879822 ,-0.654687, + 0.749517, 0.148071 ,-0.482070, -0.700569, 0.737305 ,0.626971, 0.761948, + -0.250610 ,0.616585, 0.015339, -0.787231 ,-0.175877, -0.982000, + 0.364624 ,0.891483, -0.324585, -0.334167 ,0.858029, 0.438272, + -0.297913 ,0.949369, 0.258757, 0.184448 ,0.105948, -0.901183, + 0.969666 ,-0.261581, 0.943276, -0.615845 ,-0.682063, -0.528339, + -0.595520 ,-0.810856, 0.514103, -0.326050 ,-0.163757, 0.986118, + 0.165527 ,-0.595927, -0.221907, 0.791504 ,-0.160374, -0.977354, + 0.652405 ,-0.428837, 0.641628, -0.829102 ,-0.634149, -0.486378, + -0.687927 ,-0.093271, -0.995222, -0.295654 ,0.988659, -0.150144, + -0.003357 ,0.730821, -0.497396, -0.538818 ,-0.781913, -0.621260, + -0.065674 ,-0.655884, -0.753313, -0.073486 ,0.845542, -0.409094, + 0.375977 ,-0.630041, -0.514925, -0.678101 ,0.205571, 0.978634, + -0.019531 ,0.582841, 0.763684, -0.430054 ,0.685084, -0.728464, + 0.000000 ,-0.241437, -0.958430, -0.532898 ,0.741884, 0.020899, + -0.670349 ,0.740273, -0.318412, 0.624634 ,-0.738068, -0.539041, + 0.481812 ,-0.965798, -0.034508, -0.257141 ,0.495184, 0.805372, + 0.549683 ,-0.572524, 0.809558, -0.221008 ,-0.537181, 0.834652, + 0.220825 
,-0.899741, 0.097826, -0.427368 ,-0.370148, 0.494066, + 0.904846 ,0.711387, 0.577688, 0.490356 ,0.183324, -0.722791, + -0.964172 ,0.552815, -0.807753, -0.347351 ,-0.096050, 0.994565, + -0.386047 ,-0.884907, 0.369536, 0.305115 ,-0.832976, -0.551898, + 0.047363 ,0.338883, 0.641922, 0.897034 ,0.805354, 0.506187, 0.357727 , + -0.040128, 0.998805, -0.570923 ,0.466918, -0.602455, 0.811035 ,0.139166, + -0.983697, 0.633362 ,-0.253765, -0.340498, -0.962891 ,-0.448806, + 0.843929, 0.547791 ,-0.859087, -0.434649, -0.300110 ,0.287570, + 0.957661, 0.047729 ,0.379100, 0.795023, 0.780640 ,0.154245, -0.987903, + -0.103088 ,-0.538067, 0.794791, -0.462524 ,-0.466455, -0.180966, + 0.880371 ,-0.175736, -0.983766, 0.202576 ,-0.891655, 0.192080, + -0.417725 ,-0.688716, -0.619004, 0.480652 ,0.120790, -0.987844, + -0.629456 ,-0.075080, 0.983385, 0.910461 ,0.147032, -0.960431, + -0.849304 ,0.732309, 0.671559, 0.152283 ,0.804657, 0.273913, + -0.547729 ,0.391462, -0.913976, 0.263184 ,-0.567300, 0.783128, + 0.409607 ,0.214917, 0.167182, -0.975952 ,0.367428, -0.789995, + -0.800537 ,-0.320112, 0.912727, -0.621399 ,0.659247, -0.647346, + -0.501892 ,0.222842, -0.696452, -0.950562 ,-0.697513, -0.576278, + 0.521118 ,0.602260, -0.756081, 0.391418 ,-0.116043, 0.992942, + 0.206665 ,0.220693, -0.968855, -0.453552 ,0.737991, 0.670137, + 0.106812 ,0.198419, -0.696590, 0.960999 ,-0.391866, -0.883543, + 0.547668 ,0.082067, -0.996213, 0.330200 ,-0.806059, 0.491897, + -0.377991 ,-0.992265, 0.120698, 0.029236 ,0.406622, -0.867524, + 0.575928 ,0.789945, 0.608406, 0.096191 ,-0.531904, -0.004218, + -0.846802 ,0.558298, -0.089427, 0.828125 ,-0.783155, 0.363828, + -0.541382 ,0.981706, -0.183228, 0.052673 ,-0.388642, 0.920618, + -0.096497 ,-0.506403, -0.044662, -0.862000 ,-0.512421, -0.852059, + -0.204163 ,0.559542, 0.339777, 0.803772 ,0.527502, -0.846389, + 0.137573 ,-0.184315, -0.952725, 0.794983 ,0.125024, -0.977110, + -0.809082 ,-0.643507, 0.678632, 0.482056 ,-0.277474, 0.954056, + 0.377380 ,-0.622333, 
-0.717603, 0.448914 ,0.366846, -0.110794, + -0.929382 ,0.120402, 0.992596, 0.131653 ,-0.982921, 0.103550, + -0.152954 ,-0.058333, -0.997913, -0.428894 ,0.132631, 0.979299, + 0.755432 ,0.326398, 0.937806, 0.340637 ,0.211720, 0.976659, 0.168640 , + 0.957557, -0.019174, -0.287659 ,-0.016554, 0.999650, 0.780090 , + -0.271222, 0.827292, -0.875732 ,0.850790, -0.448069, 0.307129 ,0.115949, + 0.600003, -0.989441 ,0.285877, -0.940896, -0.536255 ,-0.321317, + -0.278336, -0.942383 ,-0.422133, 0.754447, 0.765747 ,0.669674, + -0.741852, -0.051514 ,0.213604, -0.949888, 0.730103 ,0.619681, + -0.751798, -0.341797 ,-0.223762, 0.438616, -0.968506 ,-0.302925, + -0.945732, 0.361877 ,0.121093, -0.977151, -0.821838 ,0.127125, + 0.758710, -0.980774 ,0.691682, 0.695626, 0.270203 ,0.241114, 0.967463, + -0.303040 ,-0.829705, 0.422869, 0.402100 ,-0.484170, -0.741723, + 0.692017 ,-0.431259, -0.777492, -0.727844 ,0.835756, -0.211986, + 0.518311 ,0.297724, 0.932993, 0.561829 ,0.633475, -0.764920, + -0.181091 ,-0.833849, -0.453546, -0.353027 ,-0.369433, 0.839581, + -0.733154 ,0.555847, 0.392934, -0.796631 ,-0.856065, 0.028375, + 0.516296 ,0.067161, 0.997565, 0.269409 ,-0.962279, -0.051749, + 0.267456 ,-0.738893, 0.080065, -0.671204 ,-0.764325, 0.462240, + 0.507019 ,0.148758, 0.751545, 0.974243 ,-0.153430, -0.318230, + 0.986816 ,-0.439372, 0.776405, 0.716919 +}; + diff --git a/r5dev/mathlib/powsse.cpp b/r5dev/mathlib/powsse.cpp index 2144f549..3c217c6e 100644 --- a/r5dev/mathlib/powsse.cpp +++ b/r5dev/mathlib/powsse.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. 
============// // // Purpose: // @@ -45,6 +45,7 @@ fltx4 Pow_FixedPoint_Exponent_SIMD(const fltx4& x, int exponent) +#ifndef _PS3 // these aren't fast (or correct) on the PS3 /* * (c) Ian Stephenson * @@ -94,4 +95,7 @@ float FastPow10(float i) { return FastPow2(i * 3.321928f); } +#else +#pragma message("TODO: revisit fast logs on all PPC hardware") +#endif diff --git a/r5dev/mathlib/randsse.cpp b/r5dev/mathlib/randsse.cpp index 85199d58..5469e32b 100644 --- a/r5dev/mathlib/randsse.cpp +++ b/r5dev/mathlib/randsse.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2006, Valve Corporation, All rights reserved. ============// // // Purpose: generates 4 randum numbers in the range 0..1 quickly, using SIMD // @@ -6,7 +6,7 @@ #include "core/stdafx.h" #include "tier0/dbg.h" -#include "tier0/basetypes.h" +#include "tier0/threadtools.h" #include "mathlib/mathlib.h" #include "mathlib/vector.h" #include "mathlib/ssemath.h" @@ -43,7 +43,7 @@ public: fltx4 retval = AddSIMD(*m_pRand_K, *m_pRand_J); // if ( ret>=1.0) ret-=1.0 - fltx4 overflow_mask = CmpGeSIMD(retval, Four_Ones); + bi32x4 overflow_mask = CmpGeSIMD(retval, Four_Ones); retval = SubSIMD(retval, AndSIMD(Four_Ones, overflow_mask)); *m_pRand_K = retval; @@ -86,6 +86,7 @@ int GetSIMDRandContext(void) // try to take it! if (ThreadInterlockedAssignIf(&(s_nRandContextsInUse[i]), 1, 0)) { + ThreadMemoryBarrier(); return i; // done! } } @@ -97,6 +98,7 @@ int GetSIMDRandContext(void) void ReleaseSIMDRandContext(int nContext) { + ThreadMemoryBarrier(); s_nRandContextsInUse[nContext] = 0; } diff --git a/r5dev/mathlib/sseconst.cpp b/r5dev/mathlib/sseconst.cpp index 9305eefa..c3b2b006 100644 --- a/r5dev/mathlib/sseconst.cpp +++ b/r5dev/mathlib/sseconst.cpp @@ -1,13 +1,27 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//===== Copyright � 1996-2005, Valve Corporation, All rights reserved. 
======// // // Purpose: // //===========================================================================// +#if defined(__SPU__) +#include "platform.h" +#include "basetypes.h" +#include "mathlib/mathlib.h" +#include "mathlib/math_pfns.h" +// #include "mathlib/fltx4.h" +#include "ps3/spu_job_shared.h" +#endif + #include "core/stdafx.h" #include "mathlib/ssemath.h" #include "mathlib/ssequaternion.h" +//#include "mathlib/compressed_vector.h" +// NOTE: This has to be the last file included! +//#include "tier0/memdbgon.h" + +#if !defined(__SPU__) const fltx4 Four_PointFives = { 0.5,0.5,0.5,0.5 }; #ifndef _X360 const fltx4 Four_Zeros = { 0.0,0.0,0.0,0.0 }; @@ -23,14 +37,27 @@ const fltx4 Four_2ToThe21s = { (float)(1 << 21), (float)(1 << 21), (float)(1 << const fltx4 Four_2ToThe22s = { (float)(1 << 22), (float)(1 << 22), (float)(1 << 22), (float)(1 << 22) }; const fltx4 Four_2ToThe23s = { (float)(1 << 23), (float)(1 << 23), (float)(1 << 23), (float)(1 << 23) }; const fltx4 Four_2ToThe24s = { (float)(1 << 24), (float)(1 << 24), (float)(1 << 24), (float)(1 << 24) }; - +const fltx4 Four_Thirds = { 0.33333333, 0.33333333, 0.33333333, 0.33333333 }; +const fltx4 Four_TwoThirds = { 0.66666666, 0.66666666, 0.66666666, 0.66666666 }; const fltx4 Four_Point225s = { .225, .225, .225, .225 }; const fltx4 Four_Epsilons = { FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON }; +const fltx4 Four_DegToRad = { ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)) }; const fltx4 Four_FLT_MAX = { FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX }; const fltx4 Four_Negative_FLT_MAX = { -FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX }; const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. 
}; +const fltx4 Four_LinearToGammaCoefficients_A = { -3.7295, -3.7295, -3.7295, -3.7295 }; +const fltx4 Four_LinearToGammaCoefficients_B = { 8.9635, 8.9635, 8.9635, 8.9635 }; +const fltx4 Four_LinearToGammaCoefficients_C = { -7.7397, -7.7397, -7.7397, -7.7397 }; +const fltx4 Four_LinearToGammaCoefficients_D = { 3.443, 3.443, 3.443, 3.443 }; +const fltx4 Four_LinearToGammaCoefficients_E = { 0.048, 0.048, 0.048, 0.048 }; + +const fltx4 Four_GammaToLinearCoefficients_A = { .1731, .1731, .1731, .1731 }; +const fltx4 Four_GammaToLinearCoefficients_B = { .8717, .8717, .8717, .8717 }; +const fltx4 Four_GammaToLinearCoefficients_C = { -.0452, -.0452, -.0452, -.0452 }; +const fltx4 Four_GammaToLinearCoefficients_D = { .0012, .0012, .0012, .0012 }; + const fltx4 g_QuatMultRowSign[4] = { { 1.0f, 1.0f, -1.0f, 1.0f }, @@ -38,20 +65,28 @@ const fltx4 g_QuatMultRowSign[4] = { 1.0f, -1.0f, 1.0f, 1.0f }, { -1.0f, -1.0f, -1.0f, 1.0f } }; +#endif -const uint32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = { 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff }; -const uint32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; -const uint32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }; -const uint32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; -const uint32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0 -const uint32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4 -const uint32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST = +const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = { 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff }; +const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }; 
+const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; +const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0 +const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4 + + +const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST = { { 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF } }; -const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = +const fltx4 g_SIMD_Identity[4] = +{ + { 1.0, 0, 0, 0 }, { 0, 1.0, 0, 0 }, { 0, 0, 1.0, 0 }, { 0, 0, 0, 1.0 } +}; + +const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = { { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 }, @@ -59,6 +94,114 @@ const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 }, }; +const int32 ALIGN16 g_SIMD_EveryOtherMask[4] = { 0, ~0, 0, ~0 }; + + + +#ifdef PLATFORM_PPC + +/// Passed as a parameter to vslh, shuffles the z component of a quat48 stored in the zw words left by one bit. +const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[] = { + 0x00, 0x00, // x word + 0x00, 0x00, // y word + 0x00, 0x01, // z word + 0x00, 0x00 }; // w word + +// this permutes uint16's x,y,z packed in the most significant four halfwords of a fltx4 +// so that each gets its own word in the output. expected use is // __vperm( XX, Four_Threes, permute ) +// -- that way each int is represented as 3.0 + n * 2^-22 , which we can pull into the +// appropriate range with a single madd! +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16] = +{ + 16, 17, 0, 1, // word one: 00XX + 16, 17, 2, 3, // word two: 00YY + 16, 17, 4, 5, // word three: 00ZZ + 16, 17, 6, 7 // word four: 00WW +}; + +// the other permutes are a little trickier. note: I'm defining them out of order. 
+// 2 and 5 blend together prior results, rather than a source with 3.0f + +// out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16] = +{ + 16, 17, 6, 7, // word one: 00XX + 16, 17, 8, 9, // word two: 00YY + 16, 17, 10, 11, // word three: 00ZZ + 16, 17, 12, 13 // word four: 00WW +}; + +// out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2 +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16] = +{ + 16, 17, 2, 3, + 16, 17, 4, 5, + 16, 17, 6, 7, + 16, 17, 0, 1 +}; + +// out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5 +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16] = +{ + 16, 17, 8, 9, + 16, 17, 10, 11, + 16, 17, 12, 13, + 16, 17, 14, 15 +}; + +// out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16] = +{ + 16, 17, 4, 5, // word one + 16, 17, 6, 7, // word two + 16, 17, 8, 9, // word three + 16, 17, 10, 11 // word four (garbage) +}; + +// out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16] = +{ + 16, 17, 10, 11, // word one + 16, 17, 12, 13, // word two + 16, 17, 14, 15, // word three + 16, 17, 16, 17 // word four (garbage) +}; + +// these last two are tricky because we mix old output with source input. we get the 3.0f +// from the old output. 
+// out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16] = +{ + 16, 17, 12, 13, // 3.x2 + 16, 17, 14, 15, // 3.y2 + 16, 17, 30, 31, // 3.z2 (from out2) + 16, 17, 16, 17 +}; + +// out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast(g_SIMD_Quat48_Unpack_Permute5) ) // __x5__y5__z5____ +const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16] = +{ + 16, 17, 30, 31, // 3.x5 (from out5) + 16, 17, 0, 1, // 3.y5 + 16, 17, 2, 3, // 3.z5 + 16, 17, 16, 17 // garbage +}; + + +// magic constants that we use to convert the unpacked q48 components from 2 + n * 2^-22 (where n = 0 .. 65535) +// to -1.0 .. 1 +#define UnpackMul16s ( (1 << 22) / 32767.5 ) +#define UnpackAdd16s ( ( -UnpackMul16s * 3.0 ) - 1 ) +// we put the constants all into one word to save a little memory bandwidth +// but otherwise it would look like this: +// static const fltx4 vUpkMul = { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s }; +// static const fltx4 vUpkAdd = { UnpackAdd16s , UnpackAdd16s , UnpackAdd16s , UnpackAdd16s }; +const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants = { UnpackMul16s , UnpackAdd16s, 0, 0 }; +#undef UnpackMul16s +#undef UnpackAdd16s + +#endif + // FUNCTIONS // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE @@ -82,7 +225,7 @@ const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = // function is more than one screen long, yours is probably not one // of those occasions. - +#if !defined(__SPU__) /// You can use this to rotate a long array of FourVectors all by the same /// matrix. The first parameter is the head of the array. The second is the @@ -122,7 +265,7 @@ void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numV matSplat22 = SplatZSIMD(matCol2); } -#ifdef _X360 +#if defined(_X360) || defined(_PS3) // Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies // and simplify prefetching. 
Named variables are deliberately used instead of arrays to
 // ensure that the variables live on the registers instead of the stack (stack load/store
@@ -216,6 +359,172 @@ void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numV
 #endif
 }
 
+// Get the closest point from P to the (infinite) line through vLineA and vLineB and
+// calculate the shortest distance from P to the line.
+// If you pass in a value for t, it will tell you the t for (A + (B-A)t) to get the closest point.
+// If the closest point lies on the segment between A and B, then 0 <= t <= 1.
+void FourVectors::CalcClosestPointOnLineSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vClosest, fltx4* outT)
+{
+	FourVectors vDir;
+	fltx4 t = CalcClosestPointToLineTSIMD(P, vLineA, vLineB, vDir);
+	if (outT) *outT = t;
+	vClosest = vDir;
+	vClosest *= t;
+	vClosest += vLineA;
+}
+
+fltx4 FourVectors::CalcClosestPointToLineTSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vDir)
+{
+	Assert(s_bMathlibInitialized);
+	vDir = vLineB;
+	vDir -= vLineA;
+
+	fltx4 div = vDir * vDir;
+	bi32x4 Mask;
+	fltx4 Compare = ReplicateX4(0.00001f);
+	fltx4 result;
+	Mask = CmpLtSIMD(div, Compare);
+
+	result = DivSIMD(SubSIMD(vDir * P, vDir * vLineA), div);
+
+	result = MaskedAssign(Mask, Four_Zeros, result);
+	return result;
+}
+
+void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut)
+{
+	Assert(numVectors > 0);
+	if (numVectors == 0)
+		return;
+
+	// Splat out each of the entries in the matrix to a fltx4. Do this
+	// in the order that we will need them, to hide latency. I'm
+	// avoiding making an array of them, so that they'll remain in
+	// registers.
+	fltx4 matSplat00, matSplat01, matSplat02,
+		matSplat10, matSplat11, matSplat12,
+		matSplat20, matSplat21, matSplat22;
+
+	{
+		// Load the matrix into local vectors.
Sadly, matrix3x4_ts are
+		// often unaligned. The w components will be the transpose row of
+		// the matrix, but we don't really care about that.
+		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+		matSplat00 = SplatXSIMD(matCol0);
+		matSplat01 = SplatYSIMD(matCol0);
+		matSplat02 = SplatZSIMD(matCol0);
+
+		matSplat10 = SplatXSIMD(matCol1);
+		matSplat11 = SplatYSIMD(matCol1);
+		matSplat12 = SplatZSIMD(matCol1);
+
+		matSplat20 = SplatXSIMD(matCol2);
+		matSplat21 = SplatYSIMD(matCol2);
+		matSplat22 = SplatZSIMD(matCol2);
+	}
+
+#if defined(_X360) || defined(_PS3)
+	// Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
+	// and simplify prefetching. Named variables are deliberately used instead of arrays to
+	// ensure that the variables live on the registers instead of the stack (stack load/store
+	// is a serious penalty on 360). Nb: for prefetching to be most efficient here, the
+	// loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is
+	// 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line.
+	// That way you can fetch the next 3 cache lines while you work on these three.
+	// If you do go this route, be sure to disassemble and make sure it doesn't spill
+	// registers to stack as you do this; the cost of that will be excessive. Unroll the loop
+	// a little and just live with the fact that you'll be doing a couple of redundant dcbts
+	// (they don't cost you anything). Be aware that all three cores share L2 and it can only
+	// have eight cache lines fetching at a time.
+	fltx4 outX0, outY0, outZ0; // bank one of outputs
+	fltx4 outX1, outY1, outZ1; // bank two of outputs
+
+
+	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
+	// rather than madds. (Empirically determined by timing.)
+ const FourVectors* stop = pVectors + numVectors; + FourVectors* RESTRICT pVectNext; + FourVectors* RESTRICT pOutNext; + // prime the pump. + if (numVectors & 0x01) + { + // odd number of vectors to process + // prime the 1 group of registers + pVectNext = pVectors++; + pOutNext = pOut++; + outX1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01)), MulSIMD(pVectNext->z, matSplat02)); + outY1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11)), MulSIMD(pVectNext->z, matSplat12)); + outZ1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21)), MulSIMD(pVectNext->z, matSplat22)); + } + else + { + // even number of total vectors to process; + // prime the zero group and jump into the middle of the loop + outX0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)), MulSIMD(pVectors->z, matSplat02)); + outY0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)), MulSIMD(pVectors->z, matSplat12)); + outZ0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)), MulSIMD(pVectors->z, matSplat22)); + goto EVEN_CASE; + } + + // perform an even number of iterations through this loop. 
+ while (pVectors < stop) + { + outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01))); + outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11))); + outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21))); + + pOutNext->x = outX1; + pOutNext->y = outY1; + pOutNext->z = outZ1; + + EVEN_CASE: + pVectNext = pVectors + 1; + pOutNext = pOut + 1; + + outX1 = MaddSIMD(pVectNext->z, matSplat02, AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01))); + outY1 = MaddSIMD(pVectNext->z, matSplat12, AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11))); + outZ1 = MaddSIMD(pVectNext->z, matSplat22, AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21))); + + pOut->x = outX0; + pOut->y = outY0; + pOut->z = outZ0; + + pVectors += 2; + pOut += 2; + } + + // flush the last round of output + pVectNext->x = outX1; + pVectNext->y = outY1; + pVectNext->z = outZ1; +#else + // PC does not benefit from the unroll/scheduling above + fltx4 outX0, outY0, outZ0; // bank one of outputs + + + // Because of instruction latencies and scheduling, it's actually faster to use adds and muls + // rather than madds. (Empirically determined by timing.) + const FourVectors* stop = pVectors + numVectors; + + // perform an even number of iterations through this loop. 
+ while (pVectors < stop) + { + outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01))); + outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11))); + outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21))); + + pOut->x = outX0; + pOut->y = outY0; + pOut->z = outZ0; + pVectors++; + pOut++; + } +#endif +} + #ifdef _X360 // Loop-scheduled code to process FourVectors in groups of eight quite efficiently. void FourVectors_TransformManyGroupsOfEightBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut) @@ -1162,4 +1471,9 @@ void TransformManyPointsBy(VectorAligned* RESTRICT pVectors, unsigned int numVec } + +#endif // #if !defined(__SPU__) + + + #endif diff --git a/r5dev/mathlib/ssemath.h b/r5dev/mathlib/ssemath.h index aa4186bd..5d27f38e 100644 --- a/r5dev/mathlib/ssemath.h +++ b/r5dev/mathlib/ssemath.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======// // // Purpose: - defines SIMD "structure of arrays" classes and functions. 
// @@ -8,60 +8,24 @@ #if defined( _X360 ) #include +#elif defined ( _PS3 ) +#include +#include #else #include +#ifndef _LINUX +#include +#endif #endif -#include -#include - -#if defined(GNUC) -#define USE_STDC_FOR_SIMD 0 +#ifndef SPU +#include "mathlib/vector.h" +#include "mathlib/mathlib.h" #else -#define USE_STDC_FOR_SIMD 0 +#include "mathlib/math_pfns.h" #endif -#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0)) -#define _SSE1 1 -#endif - -// I thought about defining a class/union for the SIMD packed floats instead of using fltx4, -// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur -// the relationship between packed floats and packed integer types and (b) not sure that the -// compiler would handle generating good code for the intrinsics. - -#if USE_STDC_FOR_SIMD - -typedef union -{ - float m128_f32[4]; - uint32 m128_u32[4]; -} fltx4; - -typedef fltx4 i32x4; -typedef fltx4 u32x4; - -#elif ( defined( _X360 ) ) - -typedef union -{ - // This union allows float/int access (which generally shouldn't be done in inner loops) - __vector4 vmx; - float m128_f32[4]; - uint32 m128_u32[4]; -} fltx4_union; - -typedef __vector4 fltx4; -typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops. -typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops. - -#else - -typedef __m128 fltx4; -typedef __m128 i32x4; -typedef __m128 u32x4; - -#endif +#include "mathlib/fltx4.h" // The FLTX4 type is a fltx4 used as a parameter to a function. // On the 360, the best way to do this is pass-by-copy on the registers. @@ -71,6 +35,8 @@ typedef __m128 u32x4; // explicitly use a FLTX4 as the parameter type. 
#ifdef _X360 typedef __vector4 FLTX4; +#elif defined( _PS3 ) +typedef vec_float4 FLTX4; #else typedef const fltx4& FLTX4; #endif @@ -101,7 +67,7 @@ struct ALIGN16 intx4 return m_i32; } - inline const bool operator==(const intx4& other) const + inline bool operator==(const intx4& other) const { return m_i32[0] == other.m_i32[0] && m_i32[1] == other.m_i32[1] && @@ -134,7 +100,33 @@ FORCEINLINE void TestVPUFlags() {} // but are manufactured directly in one or two // instructions, saving a load and possible L2 // miss.) -#ifndef _X360 + +#ifdef _X360 +// Shouldn't the PS3 have something similar? +#define Four_Zeros XMVectorZero() // 0 0 0 0 +#define Four_Ones XMVectorSplatOne() // 1 1 1 1 +extern const fltx4 Four_Twos; // 2 2 2 2 +extern const fltx4 Four_Threes; // 3 3 3 3 +extern const fltx4 Four_Fours; // guess. +extern const fltx4 Four_Point225s; // .225 .225 .225 .225 +extern const fltx4 Four_PointFives; // .5 .5 .5 .5 +extern const fltx4 Four_Thirds; // 1/3 +extern const fltx4 Four_TwoThirds; // 2/3 +extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four +#elif defined(SPU) +#define Four_Zeros spu_splats( 0.0f ) // 0 0 0 0 +#define Four_Ones spu_splats( 1.0f ) // 1 1 1 1 +#define Four_Twos spu_splats( 2.0f ) // 2 2 2 2 +#define Four_Threes spu_splats( 3.0f ) // 3 3 3 3 +#define Four_Fours spu_splats( 4.0f ) // guess. 
+#define Four_Point225s spu_splats( 0.225f ) // .225 .225 .225 .225 +#define Four_PointFives spu_splats( 0.5f ) // .5 .5 .5 .5 +#define Four_Thirds spu_splats( 0.33333333 ); // 1/3 +#define Four_TwoThirds spu_splats( 0.66666666 ); // 2/3 +#define Four_NegativeOnes spu_splats( -1.0f ) // -1 -1 -1 -1 +#define Four_DegToRad spu_splats((float)(M_PI_F / 180.f)) +#else extern const fltx4 Four_Zeros; // 0 0 0 0 extern const fltx4 Four_Ones; // 1 1 1 1 extern const fltx4 Four_Twos; // 2 2 2 2 @@ -142,46 +134,56 @@ extern const fltx4 Four_Threes; // 3 3 3 3 extern const fltx4 Four_Fours; // guess. extern const fltx4 Four_Point225s; // .225 .225 .225 .225 extern const fltx4 Four_PointFives; // .5 .5 .5 .5 -extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON -extern const fltx4 Four_2ToThe21s; // (1<<21).. -extern const fltx4 Four_2ToThe22s; // (1<<22).. -extern const fltx4 Four_2ToThe23s; // (1<<23).. -extern const fltx4 Four_2ToThe24s; // (1<<24).. -extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) -extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 -#else -#define Four_Zeros XMVectorZero() // 0 0 0 0 -#define Four_Ones XMVectorSplatOne() // 1 1 1 1 -extern const fltx4 Four_Twos; // 2 2 2 2 -extern const fltx4 Four_Threes; // 3 3 3 3 -extern const fltx4 Four_Fours; // guess. -extern const fltx4 Four_Point225s; // .225 .225 .225 .225 -extern const fltx4 Four_PointFives; // .5 .5 .5 .5 -extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON -extern const fltx4 Four_2ToThe21s; // (1<<21).. -extern const fltx4 Four_2ToThe22s; // (1<<22).. -extern const fltx4 Four_2ToThe23s; // (1<<23).. -extern const fltx4 Four_2ToThe24s; // (1<<24).. 
-extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) +extern const fltx4 Four_Thirds; // 1/3 +extern const fltx4 Four_TwoThirds; // 2/3 extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four #endif +extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON +extern const fltx4 Four_2ToThe21s; // (1<<21).. +extern const fltx4 Four_2ToThe22s; // (1<<22).. +extern const fltx4 Four_2ToThe23s; // (1<<23).. +extern const fltx4 Four_2ToThe24s; // (1<<24).. +extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX extern const fltx4 g_SIMD_0123; // 0 1 2 3 as float + +// coefficients for polynomial approximation of srgb conversions + +// 4th order polynomial for x^(1/2.2), x in 0..1 +extern const fltx4 Four_LinearToGammaCoefficients_A; // *x^4 +extern const fltx4 Four_LinearToGammaCoefficients_B; // *x^3 +extern const fltx4 Four_LinearToGammaCoefficients_C; // *x^2 +extern const fltx4 Four_LinearToGammaCoefficients_D; // *x^1 +extern const fltx4 Four_LinearToGammaCoefficients_E; // *x^0 + +// 3rd order polynomial for x^2.2 x in 0..1 +extern const fltx4 Four_GammaToLinearCoefficients_A; // *x^3 +extern const fltx4 Four_GammaToLinearCoefficients_B; // *x^2 +extern const fltx4 Four_GammaToLinearCoefficients_C; // *x^1 +extern const fltx4 Four_GammaToLinearCoefficients_D; // *x^0 + + // external aligned integer constants -extern const ALIGN16 uint32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4 -extern const ALIGN16 uint32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4 -extern const ALIGN16 uint32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4 -extern const ALIGN16 uint32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0 -extern const ALIGN16 uint32 
g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF] -extern const ALIGN16 uint32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0 -extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 +#ifndef ALIGN16_POST +#define ALIGN16_POST +#endif +extern const ALIGN16 int32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4 +extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4 +extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4 +extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0 +extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF] +extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0 +extern const fltx4 g_SIMD_Identity[4]; // [1 0 0 0], [0 1 0 0], [0 0 1 0], [0 0 0 1] +extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 // this mask is used for skipping the tail of things. If you have N elements in an array, and wish // to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you want to use for the last iteration. -extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; +extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; +extern const int32 ALIGN16 g_SIMD_EveryOtherMask[]; // 0, ~0, 0, ~0 // Define prefetch macros. // The characteristics of cache and prefetch are completely // different between the different platforms, so you DO NOT @@ -191,12 +193,62 @@ extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; // a higher level code change. // On the other hand, I'm tired of typing #ifdef _X360 // all over the place, so this is just a nop on Intel, PS3. 
-#ifdef _X360 +#ifdef PLATFORM_PPC +#if defined(_X360) #define PREFETCH360(address, offset) __dcbt(offset,address) +#elif defined(_PS3) +#define PREFETCH360(address, offset) __dcbt( reinterpret_cast< const char * >(address) + offset ) +#else +#error Prefetch not defined for this platform! +#endif #else #define PREFETCH360(x,y) // nothing #endif +// Here's a handy function to align a pointer to the next +// sixteen byte boundary -- it'll round it up to the nearest +// multiple of 16. This is useful if you're subdividing +// big swaths of allocated memory, but in that case, remember +// to leave yourself the necessary slack in the allocation. +template +inline T* AlignPointer(void* ptr) +{ +#if defined( __clang__ ) + uintp temp = (uintp)ptr; +#else + unsigned temp = ptr; +#endif + temp = ALIGN_VALUE(temp, sizeof(T)); + return (T*)temp; +} + +#ifdef _PS3 + +// Note that similar defines exist in math_pfns.h +// Maybe we should consolidate in one place for all platforms. + +#define _VEC_CLEAR_SIGNMASK (__vector unsigned int) {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff} +#define _VEC_SIGNMASK (__vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } +#define _VEC_LSBMASK (__vector unsigned int) { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe } +#define _VEC_CLEAR_WMASK (__vector unsigned int) {0xffffffff, 0xffffffff, 0xffffffff, 0} +#define _VEC_COMPONENT_MASK_0 (__vector unsigned int) {0xffffffff, 0, 0, 0} +#define _VEC_COMPONENT_MASK_1 (__vector unsigned int) {0, 0xffffffff, 0, 0} +#define _VEC_COMPONENT_MASK_2 (__vector unsigned int) {0, 0, 0xffffffff, 0} +#define _VEC_COMPONENT_MASK_3 (__vector unsigned int) {0, 0, 0, 0xffffffff} + +#define _VEC_SWIZZLE_WZYX (__vector unsigned char) { 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b, 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03 } +#define _VEC_SWIZZLE_ZWXY (__vector unsigned char) { 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07 } +#define _VEC_SWIZZLE_YXWZ (__vector 
unsigned char) { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b } + +#define _VEC_ZERO (__vector unsigned int) {0,0,0,0} + +#define _VEC_FLTMAX (__vector float) {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX} +#define _VEC_FLTMIN (__vector float) {FLT_MIN,FLT_MIN,FLT_MIN,FLT_MIN} + +#define _VEC_ORIGIN (__vector unsigned int) { 0x00000000, 0x00000000, 0x00000000, 0xffffffff } + +#endif + #if USE_STDC_FOR_SIMD //--------------------------------------------------------------------- @@ -310,6 +362,7 @@ FORCEINLINE fltx4 SetComponentSIMD(const fltx4& a, int nComponent, float flValue return result; } + // a b c d -> b c d a FORCEINLINE fltx4 RotateLeft(const fltx4& a) { @@ -368,6 +421,10 @@ FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b BINOP(/ ); } +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // a/b +{ + BINOP(/ ); +} FORCEINLINE fltx4 MaddSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // a*b + c { @@ -528,6 +585,15 @@ FORCEINLINE bool IsAllEqual(const fltx4& a, const fltx4& b) SubFloat(a, 3) == SubFloat(b, 3); } +// For branching if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w +FORCEINLINE bool IsAnyEqual(const fltx4& a, const fltx4& b) +{ + return SubFloat(a, 0) == SubFloat(b, 0) || + SubFloat(a, 1) == SubFloat(b, 1) || + SubFloat(a, 2) == SubFloat(b, 2) || + SubFloat(a, 3) == SubFloat(b, 3); +} + FORCEINLINE int TestSignSIMD(const fltx4& a) // mask of which floats have the high bit set { int nRet = 0; @@ -545,6 +611,11 @@ FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) | return (0 != TestSignSIMD(a)); } +FORCEINLINE bool IsAnyTrue(const fltx4& a) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD(a)); +} + FORCEINLINE fltx4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? 
~0:0 { fltx4 retVal; @@ -806,6 +877,14 @@ FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) return *(reinterpret_cast (pSIMD)); } +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + fltx4 retval; + SubFloat(retval, 0) = *pFlt; + return retval; +} + FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) { return *(reinterpret_cast (pSIMD)); @@ -820,6 +899,14 @@ FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned& pSIMD) return retval; } + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + fltx4 retval = { x, y, z, w }; + return retval; +} + FORCEINLINE void StoreAlignedSIMD(float* pSIMD, const fltx4& a) { *(reinterpret_cast (pSIMD)) = a; @@ -830,6 +917,11 @@ FORCEINLINE void StoreUnalignedSIMD(float* pSIMD, const fltx4& a) *(reinterpret_cast (pSIMD)) = a; } +FORCEINLINE void StoreUnalignedFloat(float* pSingleFloat, const fltx4& a) +{ + *pSingleFloat = SubFloat(a, 0); +} + FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) { *pSIMD = SubFloat(a, 0); @@ -837,12 +929,41 @@ FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) *(pSIMD + 2) = SubFloat(a, 2); } + // strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) { StoreAlignedSIMD(pSIMD->Base(), a); } +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. 
+FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (though obviously pDestination+1 is not). +FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + + FORCEINLINE void TransposeSIMD(fltx4& x, fltx4& y, fltx4& z, fltx4& w) { #define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; } @@ -924,6 +1045,19 @@ FORCEINLINE void StoreUnalignedIntSIMD(int32* pSIMD, const fltx4& a) *(reinterpret_cast<i32x4*> (pSIMD)) = a; } +// Load four consecutive uint16's, and turn them into floating point numbers. +// This function isn't especially fast and could be made faster if anyone is +// using it heavily. +FORCEINLINE fltx4 LoadAndConvertUint16SIMD(const uint16* pInts) +{ + fltx4 retval; + SubFloat(retval, 0) = pInts[0]; + SubFloat(retval, 1) = pInts[1]; + SubFloat(retval, 2) = pInts[2]; + SubFloat(retval, 3) = pInts[3]; + return retval; +} + + // Take a fltx4 containing fixed-point uints and // return them as single precision floats. No // fixed point conversion is done.
@@ -931,10 +1065,10 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const u32x4& vSrcA) { Assert(0); /* pc has no such operation */ fltx4 retval; - SubFloat(retval, 0) = ((float)SubInt(retval, 0)); - SubFloat(retval, 1) = ((float)SubInt(retval, 1)); - SubFloat(retval, 2) = ((float)SubInt(retval, 2)); - SubFloat(retval, 3) = ((float)SubInt(retval, 3)); + SubFloat(retval, 0) = ((float)SubInt(vSrcA, 0)); + SubFloat(retval, 1) = ((float)SubInt(vSrcA, 1)); + SubFloat(retval, 2) = ((float)SubInt(vSrcA, 2)); + SubFloat(retval, 3) = ((float)SubInt(vSrcA, 3)); return retval; } @@ -974,14 +1108,1449 @@ FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4& vSrcA, const i32x4& vSrcB) return retval; } + #endif +#elif ( defined( _PS3 ) ) +#define SN_IMPROVED_INTRINSICS ( (( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )) ||\ + (defined(__SN_VER__) && (__SN_VER__ > 25002)) ) + +//--------------------------------------------------------------------- +// PS3 implementation +//--------------------------------------------------------------------- + +FORCEINLINE float FloatSIMD(fltx4& a, int idx) +{ +#if SN_IMPROVED_INTRINSICS + return vec_extract(a, idx); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + return a_union.m128_f32[idx]; +#endif +} + +FORCEINLINE unsigned int UIntSIMD(u32x4& a, int idx) +{ +#if SN_IMPROVED_INTRINSICS + return vec_extract(a, idx); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxui); + return a_union.m128_u32[idx]; +#endif +} + +FORCEINLINE fltx4 AddSIMD(const fltx4& a, const fltx4& b) +{ + return vec_add(a, b); +} + +FORCEINLINE fltx4 SubSIMD(const fltx4& a, const fltx4& b) // a-b +{ + return vec_sub(a, b); +} + +FORCEINLINE fltx4 MulSIMD(const fltx4& a, const fltx4& b) // a*b +{ + return vec_madd(a, b, _VEC_ZEROF); +} + +FORCEINLINE fltx4 MaddSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // a*b + c +{ + return vec_madd(a, b, c); +} + +FORCEINLINE fltx4 MsubSIMD(const fltx4& a, const fltx4& b, const 
fltx4& c) // c - a*b +{ + return vec_nmsub(a, b, c); +}; + +FORCEINLINE fltx4 Dot3SIMD(const fltx4& a, const fltx4& b) +{ + // oliviern: it seems that this code could be optimized + // (or maybe the latency will slow down if there is nothing to put in between) + // Something like that (to verify on PS3 and SPU): + // result2 = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3 + // result = vec_add(vec_sld(result2, result2, 4), result2); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0) + // result = vec_add(vec_sld(result2, result2, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2), (a1 * b1) + (a2 * b2) + (a3 * b3), (a2 * b2) + (a3 * b3) + (a0 * b0), (a3 * b3) + (a0 * b0) + ... + // result = vec_splat(result, 0); // DotProduct3... + // 6 SIMD instructions instead of 8 (but again with potentially one more latency - it depends if other stuff can be interleaved in between). + // It may still be a bit faster in the worst case. + + fltx4 result; + + result = vec_madd(a, b, _VEC_ZEROF); + result = vec_madd(vec_sld(a, a, 4), vec_sld(b, b, 4), result); + result = vec_madd(vec_sld(a, a, 8), vec_sld(b, b, 8), result); + + // replicate across all + result = vec_splat(result, 0); + + return result; +} + +FORCEINLINE fltx4 Dot4SIMD(const fltx4& a, const fltx4& b) +{ + // See comment in Dot3SIMD, we could reduce to 6 SIMD instructions instead of 7 (but again with potentially one more latency). + // result = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3 + // result = vec_add(vec_sld(result, result, 4), result); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0) + // result = vec_add(vec_sld(result, result, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2) + (a3 * b3), ... + // result = vec_splat(result, 0); // DotProduct3... 
+ // 6 SIMD instructions instead of 7 (but again with potentially one more latency - it depends if other stuff can be interleaved in between). + // It may be a wash in the worst case. + + fltx4 result; + + result = vec_madd(a, b, _VEC_ZEROF); + result = vec_madd(vec_sld(a, a, 4), vec_sld(b, b, 4), result); + result = vec_add(vec_sld(result, result, 8), result); + + // replicate across all + result = vec_splat(result, 0); + + return result; +} + +FORCEINLINE fltx4 SinSIMD(const fltx4& radians) +{ + return sinf4(radians); +} + +FORCEINLINE void SinCos3SIMD(fltx4& sine, fltx4& cosine, const fltx4& radians) +{ + sincosf4(radians, &sine, &cosine); +} + +FORCEINLINE void SinCosSIMD(fltx4& sine, fltx4& cosine, const fltx4& radians) // a*b + c +{ + sincosf4(radians, &sine, &cosine); +} + +FORCEINLINE fltx4 ArcCosSIMD(const fltx4& cs) +{ + return acosf4(cs); +} + +FORCEINLINE fltx4 ArcTan2SIMD(const fltx4& a, const fltx4& b) +{ + return atan2f4(a, b); +} + +FORCEINLINE fltx4 ArcSinSIMD(const fltx4& sine) +{ + return asinf4(sine); +} + +// DivSIMD defined further down, since it uses ReciprocalSIMD + +FORCEINLINE fltx4 MaxSIMD(const fltx4& a, const fltx4& b) // max(a,b) +{ + return vec_max(a, b); +} +FORCEINLINE fltx4 MinSIMD(const fltx4& a, const fltx4& b) // min(a,b) +{ + return vec_min(a, b); +} + +FORCEINLINE fltx4 AndSIMD(const fltx4& a, const fltx4& b) // a & b +{ + return vec_and(a, b); +} +FORCEINLINE fltx4 AndSIMD(const bi32x4& a, const fltx4& b) // a & b +{ + return vec_and((fltx4)a, b); +} +FORCEINLINE fltx4 AndSIMD(const fltx4& a, const bi32x4& b) // a & b +{ + return vec_and(a, (fltx4)b); +} +FORCEINLINE bi32x4 AndSIMD(const bi32x4& a, const bi32x4& b) // a & b +{ + return vec_and(a, b); +} + +#if 0 +FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const fltx4& b) // ~a & b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return vec_andc(b, a); +} +FORCEINLINE fltx4 AndNotSIMD(const bi32x4& a, const fltx4& b) // ~a & 
b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return vec_andc(b, (fltx4)a); +} +FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const bi32x4& b) // ~a & b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return (fltx4)vec_andc(b, (bi32x4)a); +} +FORCEINLINE bi32x4 AndNotSIMD(const bi32x4& a, const bi32x4& b) // ~a & b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return vec_andc(b, a); +} +#else +template< typename T, typename U > +FORCEINLINE T AndNotSIMD(const T& a, const U& b) // ~a & b +{ + return vec_andc(b, (T)a); +} + +// specialize for the case of bi, flt +FORCEINLINE fltx4 AndNotSIMD(const bi32x4& a, const fltx4& b) // ~a & b +{ + return vec_andc(b, (fltx4)a); +} +#endif + +FORCEINLINE fltx4 XorSIMD(const fltx4& a, const fltx4& b) // a ^ b +{ + return vec_xor(a, b); +} +FORCEINLINE fltx4 XorSIMD(const bi32x4& a, const fltx4& b) // a ^ b +{ + return vec_xor((fltx4)a, b); +} +FORCEINLINE fltx4 XorSIMD(const fltx4& a, const bi32x4& b) // a ^ b +{ + return vec_xor(a, (fltx4)b); +} +FORCEINLINE bi32x4 XorSIMD(const bi32x4& a, const bi32x4& b) // a ^ b +{ + return vec_xor(a, b); +} + +FORCEINLINE fltx4 OrSIMD(const fltx4& a, const fltx4& b) // a | b +{ + return vec_or(a, b); +} +FORCEINLINE fltx4 OrSIMD(const bi32x4& a, const fltx4& b) // a | b +{ + return vec_or((fltx4)a, b); +} +FORCEINLINE fltx4 OrSIMD(const fltx4& a, const bi32x4& b) // a | b +{ + return vec_or(a, (fltx4)b); +} +FORCEINLINE i32x4 OrSIMD(const i32x4& a, const i32x4& b) // a | b +{ + return vec_or(a, b); +} +FORCEINLINE u32x4 OrSIMD(const u32x4& a, const u32x4& b) // a | b +{ + return vec_or(a, b); +} + +#if !defined(__SPU__) // bi32x4 typedef to same as u32x4 on SPU +FORCEINLINE bi32x4 OrSIMD(const bi32x4& a, const bi32x4& b) // a | b +{ + return vec_or(a, b); +} +#endif + +FORCEINLINE fltx4 NegSIMD(const fltx4& a) // negate: -a +{ + 
return(SubSIMD(_VEC_ZEROF, a)); + + // untested + // vec_float4 signMask; + // vec_float4 result; + // signMask = vec_splat_s32(-1); + // signMask = vec_sll(signMask, signMask); + // result = vec_xor(a, signMask); + // return result; +} + +FORCEINLINE bool IsAnyZeros(const fltx4& a) // any floats are zero? +{ + return vec_any_eq(a, _VEC_ZEROF); +} + +FORCEINLINE bool IsAnyZeros(const bi32x4& a) // any floats are zero? +{ + return vec_any_eq((u32x4)a, _VEC_ZERO); +} + +FORCEINLINE bool IsAllZeros(const bi32x4& a) // all floats of a zero? +{ + return vec_all_eq((u32x4)a, _VEC_ZERO); +} + +FORCEINLINE bool IsAnyXYZZero(const fltx4& a) // are any of x,y,z zero? +{ +#if SN_IMPROVED_INTRINSICS + + // push 1.0 into w (NON-ZERO) + fltx4 b = vec_insert(1.0f, a, 3); + + return vec_any_eq(b, _VEC_ZEROF); +#else + fltx4 b = vec_perm(a, _VEC_ONEF, _VEC_PERMUTE_XYZ0W1); + return vec_any_eq(b, _VEC_ZEROF); +#endif +} + +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAllGreaterThan(const fltx4& a, const fltx4& b) +{ + return vec_all_gt(a, b); +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAllGreaterThanOrEq(const fltx4& a, const fltx4& b) +{ + return vec_all_ge(a, b); +} + +FORCEINLINE bool IsAllEqual(const fltx4& a, const fltx4& b) +{ + return vec_all_eq(a, b); +} + + +FORCEINLINE int TestSignSIMD(const fltx4& a) // mask of which floats have the high bit set +{ + // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though) + int nRet = 0; + + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + + nRet |= (a_union.m128_u32[0] & 0x80000000) >> 31; // sign(x) -> bit 0 + nRet |= (a_union.m128_u32[1] & 0x80000000) >> 30; // sign(y) -> bit 1 + nRet |= (a_union.m128_u32[2] & 0x80000000) >> 29; // sign(z) -> bit 2 + nRet |= (a_union.m128_u32[3] & 0x80000000) >> 28; // sign(w) -> bit 3 + + return nRet; +} +FORCEINLINE int TestSignSIMD(const bi32x4& a) // mask of which floats have the high bit set +{ + // NOTE: this maps to 
SSE way better than it does to VMX (most code uses IsAnyNegative(), though) + int nRet = 0; + + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxbi); + + nRet |= (a_union.m128_u32[0] & 0x80000000) >> 31; // sign(x) -> bit 0 + nRet |= (a_union.m128_u32[1] & 0x80000000) >> 30; // sign(y) -> bit 1 + nRet |= (a_union.m128_u32[2] & 0x80000000) >> 29; // sign(z) -> bit 2 + nRet |= (a_union.m128_u32[3] & 0x80000000) >> 28; // sign(w) -> bit 3 + + return nRet; +} + +FORCEINLINE bool IsAnyNegative(const bi32x4& a) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD(a)); +} + +// Squelch the w component of a vector to +0.0. +// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) +FORCEINLINE fltx4 SetWToZeroSIMD(const fltx4& a) +{ + return (fltx4)vec_and((u32x4)a, _VEC_CLEAR_WMASK); +} +FORCEINLINE bi32x4 SetWToZeroSIMD(const bi32x4& a) +{ + return (bi32x4)vec_and((u32x4)a, _VEC_CLEAR_WMASK); +} + +FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + // NOTE: this tests the top bits of each vector element using integer math + // (so it ignores NaNs - it will return true for "-NaN") + return vec_any_lt(a, _VEC_ZEROF); +} + +FORCEINLINE bool IsAnyTrue(const fltx4& a) +{ + return vec_any_ne(a, _VEC_ZEROF); +} + +#ifdef DIFFERENT_NATIVE_VECTOR_TYPES + +FORCEINLINE bool IsAnyTrue(const bi32x4& a) +{ + return vec_any_ne((vector unsigned int) a, _VEC_0L); +} + +#endif + +FORCEINLINE bi32x4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? ~0:0 +{ + return (bi32x4)vec_cmpeq(a, b); +} +FORCEINLINE bi32x4 CmpEqSIMD(const i32x4& a, const i32x4& b) // (a==b) ? ~0:0 +{ + return (bi32x4)vec_cmpeq(a, b); +} +FORCEINLINE bi32x4 CmpEqSIMD(const u32x4& a, const u32x4& b) // (a==b) ? ~0:0 +{ + return (bi32x4)vec_cmpeq(a, b); +} + +FORCEINLINE bi32x4 CmpGtSIMD(const fltx4& a, const fltx4& b) // (a>b) ? 
~0:0 +{ + return (bi32x4)vec_cmpgt(a, b); +} +FORCEINLINE bi32x4 CmpGtSIMD(const i32x4& a, const i32x4& b) // (a>b) ? ~0:0 +{ + return (bi32x4)vec_cmpgt(a, b); +} +FORCEINLINE bi32x4 CmpGtSIMD(const u32x4& a, const u32x4& b) // (a>b) ? ~0:0 +{ + return (bi32x4)vec_cmpgt(a, b); +} + +FORCEINLINE bi32x4 CmpGeSIMD(const fltx4& a, const fltx4& b) // (a>=b) ? ~0:0 +{ + return (bi32x4)vec_cmpge(a, b); +} + + +FORCEINLINE bi32x4 CmpLtSIMD(const fltx4& a, const fltx4& b) // (a<b) ? ~0:0 +{ + return (bi32x4)vec_cmplt(a, b); +} + +FORCEINLINE bi32x4 CmpLeSIMD(const fltx4& a, const fltx4& b) // (a<=b) ? ~0:0 +{ + return (bi32x4)vec_cmple(a, b); +} + +FORCEINLINE bi32x4 CmpInBoundsSIMD(const fltx4& a, const fltx4& b) // (a <= b && a >= -b) ? ~0 : 0 +{ + i32x4 control; + control = vec_cmpb(a, b); + return (bi32x4)vec_cmpeq((u32x4)control, _VEC_ZERO); +} + +FORCEINLINE int CmpAnyLeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_le(a, b); +} + +FORCEINLINE int CmpAnyGeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_ge(a, b); +} + +FORCEINLINE int CmpAnyLtSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_lt(a, b); +} +FORCEINLINE int CmpAnyLtSIMD(const bi32x4& a, const i32x4& b) +{ + return vec_any_lt((i32x4)a, b); +} + +FORCEINLINE int CmpAnyGtSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_gt(a, b); +} + +FORCEINLINE int CmpAnyNeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_any_ne(a, b); +} +FORCEINLINE int CmpAnyNeSIMD(const bi32x4& a, const bi32x4& b) +{ + return vec_any_ne(a, b); +} +FORCEINLINE int CmpAnyNeSIMD(const bi32x4& a, const i32x4& b) +{ + return vec_any_ne(a, (bi32x4)b); +} + +FORCEINLINE int CmpAllLeSIMD(const fltx4& a, const fltx4& b) +{ + return vec_all_le(a, b); +} + +FORCEINLINE fltx4 MaskedAssign(const bi32x4& ReplacementMask, const fltx4& NewValue, const fltx4& OldValue) +{ + return vec_sel(OldValue, NewValue, ReplacementMask); +} + +FORCEINLINE fltx4 MaskedAssign(const fltx4& ReplacementMask, const fltx4& NewValue, const fltx4& OldValue) +{ + return vec_sel(OldValue, NewValue, (const bi32x4)ReplacementMask); +} + +FORCEINLINE vector signed short MaskedAssign(const vector unsigned short& ReplacementMask, const vector signed short& NewValue, const vector signed short&
OldValue) +{ + return vec_sel(OldValue, NewValue, ReplacementMask); +} + +// AKA "Broadcast", "Splat" +FORCEINLINE fltx4 ReplicateX4(float flValue) // a,a,a,a +{ +#if SN_IMPROVED_INTRINSICS + return vec_splats(flValue); +#else + // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) + float* pValue = &flValue; + Assert(pValue); + Assert(((unsigned int)pValue & 3) == 0); + + fltx4 result; + + result = vec_ld(0, pValue); + result = vec_splat(vec_perm(result, result, vec_lvsl(0, pValue)), 0); + + return result; +#endif +} + +FORCEINLINE fltx4 ReplicateX4(const float* pValue) // a,a,a,a +{ +#if SN_IMPROVED_INTRINSICS + return vec_splats(*pValue); +#else + Assert(pValue); + fltx4 result; + + result = vec_ld(0, pValue); + result = vec_splat(vec_perm(result, result, vec_lvsl(0, pValue)), 0); + + return result; +#endif +} + +/// replicate a single 32 bit integer value to all 4 components of an m128 +FORCEINLINE i32x4 ReplicateIX4(int nValue) +{ +#if SN_IMPROVED_INTRINSICS + return vec_splats(nValue); +#else + // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!) 
+ int* pValue = &nValue; + Assert(pValue); + Assert(((unsigned int)pValue & 3) == 0); + i32x4 result; + + result = vec_ld(0, pValue); + result = vec_splat(vec_perm(result, result, vec_lvsl(0, pValue)), 0); + + return result; +#endif +} + +FORCEINLINE fltx4 SqrtSIMD(const fltx4& a) // sqrt(a) +{ + return sqrtf4(a); +} + +FORCEINLINE fltx4 SqrtEstSIMD(const fltx4& a) // sqrt(a), more or less +{ +#if defined( _PS3 ) && !defined( SPU ) + // This is exactly what the Xbox 360 does in XMVectorSqrtEst + fltx4 vRecipSquareRoot = vec_rsqrte(a); + i32x4 vOne = vec_splat_s32(1); + i32x4 vAllOnes = vec_splat_s32(-1); + i32x4 vShiftLeft24 = vec_splat_s32(-8); // -8 is the same bit pattern as 24 with a 5-bit mask + fltx4 vZero = (fltx4)vec_splat_s32(0); + u32x4 vInputShifted = vec_sl((u32x4)a, (u32x4)vOne); + u32x4 vInfinityShifted = vec_sl((u32x4)vAllOnes, (u32x4)vShiftLeft24); + bi32x4 vEqualsZero = vec_vcmpeqfp(a, vZero); + bi32x4 vEqualsInfinity = vec_vcmpequw(vInputShifted, vInfinityShifted); + fltx4 vSquareRoot = vec_madd(a, vRecipSquareRoot, _VEC_ZEROF); + bi32x4 vResultMask = vec_vcmpequw((u32x4)vEqualsInfinity, (u32x4)vEqualsZero); // mask has 1s wherever the square root is valid + fltx4 vCorrectedSquareRoot = vec_sel(a, vSquareRoot, vResultMask); + + return vCorrectedSquareRoot; +#else + return SqrtSIMD(a); +#endif +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSIMD(const fltx4& a) // 1/sqrt(a), more or less +{ + return vec_rsqrte(a); +} + +FORCEINLINE fltx4 ReciprocalSqrtSIMD(const fltx4& a) // 1/sqrt(a) +{ + // This matches standard library function rsqrtf4 + fltx4 result; + vmathV4RsqrtPerElem((VmathVector4*)&result, (const VmathVector4*)&a); + + return result; +} + +FORCEINLINE fltx4 ReciprocalEstSIMD(const fltx4& a) // 1/a, more or less +{ + return vec_re(a); +} + +/// 1/x for all 4 values, more or less +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalEstSaturateSIMD(const fltx4& a) +{ + bi32x4 zero_mask = CmpEqSIMD(a, Four_Zeros); + 
fltx4 ret = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask)); + ret = ReciprocalEstSIMD(ret); + return ret; +} + + +/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. +/// No error checking! +FORCEINLINE fltx4 ReciprocalSIMD(const fltx4& a) // 1/a +{ + // This matches standard library function recipf4 + fltx4 result; + vmathV4RecipPerElem((VmathVector4*)&result, (const VmathVector4*)&a); + + return result; +} + +FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b +{ + return MulSIMD(ReciprocalSIMD(b), a); +} + +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // Est(a/b) +{ + return MulSIMD(ReciprocalEstSIMD(b), a); +} + +/// 1/x for all 4 values. +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalSaturateSIMD(const fltx4& a) +{ + // Convert zeros to epsilons + bi32x4 zero_mask = CmpEqSIMD(a, _VEC_ZEROF); + fltx4 a_safe = OrSIMD(a, AndSIMD(_VEC_EPSILONF, zero_mask)); + return ReciprocalSIMD(a_safe); + + // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does) + // fltx4 zeroMask = CmpEqSIMD( gFour_Zeros, a ); + // fltx4 a_safe = XMVectorSelect( a, gFour_Epsilons, zeroMask ); + // return ReciprocalSIMD( a_safe ); +} + + +// CHRISG: is it worth doing integer bitfiddling for this? +// 2^x for all values (the antilog) +FORCEINLINE fltx4 ExpSIMD(const fltx4& toPower) +{ + return exp2f4(toPower); +} + +// a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point", +// which is accurate to four bits of mantissa. +FORCEINLINE fltx4 Exp2EstSIMD(const fltx4& f) +{ + return exp2f4fast(f); +} + + +// Clamps the components of a vector to a specified minimum and maximum range. 
+FORCEINLINE fltx4 ClampVectorSIMD(FLTX4 in, FLTX4 min, FLTX4 max) +{ + fltx4 result = vec_max(min, in); + return vec_min(max, result); +} + + +FORCEINLINE fltx4 LoadUnalignedSIMD(const void* pSIMD) +{ +#if SN_IMPROVED_INTRINSICS + + fltx4 v0, v1; + + Assert(pSIMD); + + + v0 = (fltx4)vec_lvlx(0, (float*)pSIMD); + v1 = (fltx4)vec_lvrx(16, (float*)pSIMD); + return vec_or(v0, v1); + +#else + + fltx4 v0, v1; + vector unsigned char permMask; + + Assert(pSIMD); + + v0 = vec_ld(0, pSIMD); + permMask = vec_lvsl(0, pSIMD); + v1 = vec_ld(15, pSIMD); + + return vec_perm(v0, v1, permMask); + +#endif +} + +FORCEINLINE fltx4 LoadUnsignedByte4SIMD(unsigned char* pBytes) // unpack contiguous 4 bytes into vec float 4 +{ + +#if SN_IMPROVED_INTRINSICS + + __vector unsigned char res_uc; + __vector unsigned short res_us; + + __vector unsigned char vZero8 = (__vector unsigned char)vec_splat_u8(0); + __vector unsigned short vZero16 = (__vector unsigned short)vec_splat_u16(0); + + res_uc = (__vector unsigned char)vec_lvlx(0, pBytes); + res_uc = vec_mergeh(vZero8, res_uc); + res_us = vec_mergeh(vZero16, (__vector unsigned short)res_uc); + return vec_ctf((__vector unsigned int)res_us, 0); + +#else + + vector unsigned char v0, v1; + vector bool char res_uc; + vector unsigned char permMask; + vector bool short res_us; + + vector bool char vZero8 = (vector bool char)vec_splat_u8(0); + vector bool short vZero16 = (vector bool short)vec_splat_u16(0); + + v0 = vec_ld(0, pBytes); + permMask = vec_lvsl(0, pBytes); + v1 = vec_ld(3, pBytes); + res_uc = (vector bool char)vec_perm(v0, v1, permMask); + res_uc = vec_mergeh(vZero8, res_uc); + res_us = vec_mergeh(vZero16, (vector bool short)res_uc); + return vec_ctf((vector unsigned int)res_us, 0); + +#endif + +} + +FORCEINLINE fltx4 LoadSignedByte4SIMD(signed char* pBytes) // unpack contiguous 4 bytes into vec float 4 +{ + +#if SN_IMPROVED_INTRINSICS + + vector signed char res_uc; + vector signed short res_us; + vector signed int res_ui; + + res_uc = 
(vector signed char)vec_lvlx(0, pBytes); + res_us = vec_unpackh(res_uc); + res_ui = vec_unpackh(res_us); + return vec_ctf(res_ui, 0); + +#else + + vector signed char v0, v1, res_uc; + vector unsigned char permMask; + vector signed short res_us; + vector signed int res_ui; + + v0 = vec_ld(0, pBytes); + permMask = vec_lvsl(0, pBytes); + v1 = vec_ld(3, pBytes); + res_uc = vec_perm(v0, v1, permMask); + res_us = vec_unpackh(res_uc); + res_ui = vec_unpackh(res_us); + return vec_ctf(res_ui, 0); + +#endif + +} + + +// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec). +FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) +{ + Assert(pSIMD); + + fltx4 v0 = vec_ld(0, (float*)(pSIMD)); + vector unsigned char permMask = vec_lvsl(0, (float*)(pSIMD)); + fltx4 v1 = vec_ld(11, (float*)(pSIMD)); + + return vec_perm(v0, v1, permMask); +} + + +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + fltx4 v0 = vec_lde(0, const_cast(pFlt)); + vector unsigned char permMask = vec_lvsl(0, const_cast(pFlt)); + return vec_perm(v0, v0, permMask); +} + + +FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) +{ + return vec_ld(0, (float*)pSIMD); +} + +#ifndef SPU +// No reason to support VectorAligned on SPU. 
+ +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned& pSIMD) +{ + fltx4 out; + out = vec_ld(0, pSIMD.Base()); + + // squelch w + return (fltx4)vec_and((u32x4)out, _VEC_CLEAR_WMASK); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned* RESTRICT pSIMD) +{ + fltx4 out; + out = vec_ld(0, pSIMD->Base()); + + // squelch w + return (fltx4)vec_and((u32x4)out, _VEC_CLEAR_WMASK); +} + + +// strongly typed -- for typechecking as we transition to SIMD +FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) +{ + vec_st(a, 0, pSIMD->Base()); +} +#endif + +FORCEINLINE void StoreAlignedSIMD(float* pSIMD, const fltx4& a) +{ + vec_st(a, 0, pSIMD); +} + +FORCEINLINE void StoreUnalignedSIMD(float* pSIMD, const fltx4& a) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + vec_stvlx(a, 0, pSIMD); + vec_stvrx(a, 16, pSIMD); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + pSIMD[0] = a_union.m128_f32[0]; + pSIMD[1] = a_union.m128_f32[1]; + pSIMD[2] = a_union.m128_f32[2]; + pSIMD[3] = a_union.m128_f32[3]; +#endif + +} + +FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) +{ + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + pSIMD[0] = a_union.m128_f32[0]; + pSIMD[1] = a_union.m128_f32[1]; + pSIMD[2] = a_union.m128_f32[2]; +}; + + + +#ifndef SPU +// No reason to support unaligned Vectors on SPU + + +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d); +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ +#if USING_POINTLESSLY_SLOW_SONY_CODE + return vmathV4MakeFromElems_V(x, y, z, w).vec128; +#else + // load the float 
into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = vec_lvlx(0, &x); + fltx4 vy = vec_lvlx(0, &y); + fltx4 vz = vec_lvlx(0, &z); + fltx4 vw = vec_lvlx(0, &w); + return Compress4SIMD(vx, vy, vz, vw); +#endif +} + + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. +FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (thhough obviously pDestination+1 is not). +FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} +#endif + +// Fixed-point conversion and save as SIGNED INTS. +// pDest->x = Int (vSrc.x) +// note: some architectures have means of doing +// fixed point conversion when the fix depth is +// specified as an immediate.. but there is no way +// to guarantee an immediate as a parameter to function +// like this. 
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4* RESTRICT pDest, const fltx4& vSrc) +{ + i32x4 asInt = vec_cts(vSrc, 0); + vec_st(asInt, 0, pDest->Base()); +} + +FORCEINLINE void TransposeSIMD(fltx4& x, fltx4& y, fltx4& z, fltx4& w) +{ + fltx4 p0, p1, p2, p3; + + p0 = vec_mergeh(x, z); + p1 = vec_mergeh(y, w); + p2 = vec_mergel(x, z); + p3 = vec_mergel(y, w); + + x = vec_mergeh(p0, p1); + y = vec_mergel(p0, p1); + z = vec_mergeh(p2, p3); + w = vec_mergel(p2, p3); +} + +// Return one in the fastest way -- faster even than loading. +FORCEINLINE fltx4 LoadZeroSIMD(void) +{ + return _VEC_ZEROF; +} +FORCEINLINE i32x4 LoadZeroISIMD(void) +{ + return vec_splat_s32(0); +} + + +// Return one in the fastest way -- faster even than loading. +FORCEINLINE fltx4 LoadOneSIMD(void) +{ + return _VEC_ONEF; +} +FORCEINLINE i32x4 LoadOneISIMD(void) +{ + return vec_splat_s32(1); +} + +FORCEINLINE fltx4 SplatXSIMD(fltx4 a) +{ + return vec_splat(a, 0); +} +FORCEINLINE fltx4 SplatYSIMD(fltx4 a) +{ + return vec_splat(a, 1); +} +FORCEINLINE fltx4 SplatZSIMD(fltx4 a) +{ + return vec_splat(a, 2); +} +FORCEINLINE fltx4 SplatWSIMD(fltx4 a) +{ + return vec_splat(a, 3); +} + +FORCEINLINE bi32x4 SplatXSIMD(bi32x4 a) +{ + return vec_splat(a, 0); +} +FORCEINLINE bi32x4 SplatYSIMD(bi32x4 a) +{ + return vec_splat(a, 1); +} +FORCEINLINE bi32x4 SplatZSIMD(bi32x4 a) +{ + return vec_splat(a, 2); +} +FORCEINLINE bi32x4 SplatWSIMD(bi32x4 a) +{ + return vec_splat(a, 3); +} + +FORCEINLINE fltx4 SetXSIMD(const fltx4& a, const fltx4& x) +{ + return vec_sel(a, x, _VEC_COMPONENT_MASK_0); +} + +FORCEINLINE fltx4 SetYSIMD(const fltx4& a, const fltx4& y) +{ + return vec_sel(a, y, _VEC_COMPONENT_MASK_1); +} + +FORCEINLINE fltx4 SetZSIMD(const fltx4& a, const fltx4& z) +{ + return vec_sel(a, z, _VEC_COMPONENT_MASK_2); +} + +FORCEINLINE fltx4 SetWSIMD(const fltx4& a, const fltx4& w) +{ + return vec_sel(a, w, _VEC_COMPONENT_MASK_3); +} + +FORCEINLINE fltx4 SetComponentSIMD(const fltx4& a, int nComponent, float flValue) 
+{ +#if SN_IMPROVED_INTRINSICS + return vec_insert(flValue, a, nComponent); +#else + fltx4_union a_union; + a_union.vmxf = vec_ld(0, &a); + a_union.m128_f32[nComponent] = flValue; + return a_union.vmxf; +#endif +} + +FORCEINLINE float GetComponentSIMD(const fltx4& a, int nComponent) +{ +#if SN_IMPROVED_INTRINSICS + return vec_extract(a, nComponent); +#else + fltx4_union a_union; + a_union.vmxf = vec_ld(0, &a); + return a_union.m128_f32[nComponent]; +#endif +} + + +FORCEINLINE fltx4 RotateLeft(const fltx4& a) +{ + return vec_sld(a, a, 4); +} + +FORCEINLINE fltx4 RotateLeft2(const fltx4& a) +{ + return vec_sld(a, a, 8); +} + +FORCEINLINE fltx4 RotateRight(const fltx4& a) +{ + return vec_sld(a, a, 12); +} + +FORCEINLINE fltx4 RotateRight2(const fltx4& a) +{ + return vec_sld(a, a, 8); +} + +// rotate a vector left by an arbitrary number of +// bits known at compile time. The bit parameter +// is template because it's actually used as an +// immediate field in an instruction, eg it absolutely +// must be known at compile time. nBits>127 leads +// to doom. +// zeroes are shifted in from the right +template < uint nBits, typename T > +FORCEINLINE T ShiftLeftByBits(const T& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! + { + return (T)LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + T t = vec_sld(a, ((T)LoadZeroSIMD()), (nBits >> 3)); // rotated left by octets + return ShiftLeftByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! 
+ vector unsigned int shifter = (vector unsigned int) (vec_splat_s8(((signed char)(nBits & 0x7)))); + return (T)vec_sll((vector signed int) a, shifter); + } +} + +// as above, but shift right +template < uint nBits, typename T > +FORCEINLINE T ShiftRightByBits(const T& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! + { + return (T)LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + T t = vec_sld(((T)LoadZeroSIMD()), a, 16 - (nBits >> 3)); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen. + return ShiftRightByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! + vector unsigned int shifter = (vector unsigned int) (vec_splat_s8(((signed char)(nBits & 0x7)))); + return (T)vec_srl((vector unsigned int) a, shifter); + } +} + + +/**** an example of ShiftLeftByBits: +fltx4 ShiftByTwentyOne( fltx4 foo ) +{ + return ShiftLeftByBits<21>(foo); +} + +compiles to: + + ShiftByTwentyOne(float __vector): + 0x000059FC: 0x1060038C vspltisw v3,0 PIPE + 0x00005A00: 0x1085030C vspltisb v4,5 + 0x00005A04: 0x104218AC vsldoi v2,v2,v3,2 02 (000059FC) REG PIPE + 0x00005A08: 0x104221C4 vsl v2,v2,v4 03 (00005A04) REG + 0x00005A0C: 0x4E800020 blr +*****/ + + + +// find the lowest component of a.x, a.y, a.z, +// and replicate it to the whole return value. +// ignores a.w. +// Forcing this inline should hopefully help with scheduling. +FORCEINLINE fltx4 FindLowestSIMD3(const fltx4& a) +{ + fltx4 result; + fltx4 x = vec_splat(a, 0); + fltx4 y = vec_splat(a, 1); + fltx4 z = vec_splat(a, 2); + + if (vec_any_nan(a)) + { + x = vec_all_nan(x) ? 
_VEC_FLTMAX : x; + y = vec_all_nan(y) ? _VEC_FLTMAX : y; + z = vec_all_nan(z) ? _VEC_FLTMAX : z; + } + + result = vec_min(y, x); + result = vec_min(z, result); + + return result; + +} + +// find the highest component of a.x, a.y, a.z, +// and replicate it to the whole return value. +// ignores a.w. +// Though this is only five instructions long, +// they are all dependent, making this stall city. +// Forcing this inline should hopefully help with scheduling. +FORCEINLINE fltx4 FindHighestSIMD3(const fltx4& a) +{ + fltx4 result; + fltx4 x = vec_splat(a, 0); + fltx4 y = vec_splat(a, 1); + fltx4 z = vec_splat(a, 2); + + if (vec_any_nan(a)) + { + x = vec_all_nan(x) ? _VEC_FLTMIN : x; + y = vec_all_nan(y) ? _VEC_FLTMIN : y; + z = vec_all_nan(z) ? _VEC_FLTMIN : z; + } + + result = vec_max(y, x); + result = vec_max(z, result); + + return result; +} + + +// ------------------------------------ +// INTEGER SIMD OPERATIONS. +// ------------------------------------ + +// Load 4 aligned words into a SIMD register +FORCEINLINE i32x4 LoadAlignedIntSIMD(const int32* RESTRICT pSIMD) +{ + return vec_ld(0, const_cast(pSIMD)); +} + +// Load 4 unaligned words into a SIMD register +FORCEINLINE i32x4 LoadUnalignedIntSIMD(const int32* RESTRICT pSIMD) +{ + i32x4 v0, v1; + vector unsigned char permMask; + + Assert(pSIMD); + + v0 = vec_ld(0, const_cast(pSIMD)); + permMask = vec_lvsl(0, const_cast(pSIMD)); + v1 = vec_ld(15, const_cast(pSIMD)); + + return vec_perm(v0, v1, permMask); + +} + +// save into four words, 16-byte aligned +FORCEINLINE void StoreAlignedIntSIMD(int32* pSIMD, const i32x4& a) +{ + vec_st(a, 0, pSIMD); +} + +FORCEINLINE void StoreAlignedIntSIMD(int32* pSIMD, const fltx4& a) +{ + vec_st((i32x4)a, 0, pSIMD); +} + +FORCEINLINE void StoreAlignedIntSIMD(intx4& pSIMD, const i32x4& a) +{ + vec_st(a, 0, pSIMD.Base()); +} + +FORCEINLINE void StoreUnalignedIntSIMD(int32* pSIMD, const i32x4& a) +{ +#if SN_IMPROVED_INTRINSICS + + // NOTE : NOT TESTED + vec_stvlx(a, 0, pSIMD); + 
vec_stvrx(a, 16, pSIMD); + +#else + + fltx4_union tmp; + vec_st(a, 0, &tmp.vmxi); + + pSIMD[0] = tmp.m128_u32[0]; + pSIMD[1] = tmp.m128_u32[1]; + pSIMD[2] = tmp.m128_u32[2]; + pSIMD[3] = tmp.m128_u32[3]; + +#endif +} + +// a={ a.x, a.z, b.x, b.z } +// combine two fltx4s by throwing away every other field. +FORCEINLINE fltx4 CompressSIMD(fltx4 const& a, fltx4 const& b) +{ + const int32 ALIGN16 n4shuffleACXZ[4] ALIGN16_POST = { 0x00010203, 0x08090A0B, 0x10111213, 0x18191A1B }; + return vec_perm(a, b, (vec_uchar16)LoadAlignedIntSIMD(n4shuffleACXZ)); +} + +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +// TODO: make more efficient by doing this in a parallel way at the caller +// Compress4SIMD(FourVectors.. ) +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 ab = vec_mergeh(a, b); // a.x, b.x, a.y, b.y + fltx4 cd = vec_mergeh(c, d); // c.x, d.x... + static const int32 ALIGN16 shuffleABXY[4] ALIGN16_POST = { 0x00010203, 0x04050607, 0x10111213, 0x14151617 }; + + return vec_perm(ab, cd, (vec_uchar16)LoadAlignedIntSIMD(shuffleABXY)); +} + + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const i32x4& vSrcA) +{ + return vec_ctf(vSrcA, 0); +} + + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4& vSrcA) +{ + return vec_ctf(vSrcA, 0); +} + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. Each uint +// will be divided by 2^immed after conversion +// (eg, this is fixed point math). 
+/* as if: +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) +{ +return vec_ctf(vSrcA,uImmed); +} +*/ +#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) )) + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. Each int +// will be divided by 2^immed (eg, this is fixed point +// math). +/* as if: +FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) +{ +return vec_ctf(vSrcA,uImmed); +} +*/ +#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) )) + +// set all components of a vector to a signed immediate int number. +/* as if: +FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate) +{ +return vec_splat_s32( toImmediate ); +} +*/ +#define IntSetImmediateSIMD(x) (vec_splat_s32(x)) + + +/* +works on fltx4's as if they are four uints. +the first parameter contains the words to be shifted, +the second contains the amount to shift by AS INTS + +for i = 0 to 3 +shift = vSrcB_i*32:(i*32)+4 +vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift +*/ +FORCEINLINE u32x4 IntShiftLeftWordSIMD(u32x4 vSrcA, u32x4 vSrcB) +{ + return vec_sl(vSrcA, vSrcB); +} + + +FORCEINLINE float SubFloat(const fltx4& a, int idx) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract(a, idx)); +#else // GCC 4.1.1 + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
+ fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + return a_union.m128_f32[idx]; +#endif // GCC 4.1.1 +} + +FORCEINLINE float& SubFloat(fltx4& a, int idx) +{ + fltx4_union& a_union = (fltx4_union&)a; + return a_union.m128_f32[idx]; +} + +FORCEINLINE uint32 SubInt(const u32x4& a, int idx) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract(a, idx)); +#else // GCC 4.1.1 + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxui); + return a_union.m128_u32[idx]; +#endif // GCC 4.1.1 +} + +FORCEINLINE uint32 SubInt(const fltx4& a, int idx) +{ +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract((u32x4)a, idx)); +#else + fltx4_union a_union; + vec_st(a, 0, &a_union.vmxf); + return a_union.m128_u32[idx]; +#endif +} + +FORCEINLINE uint32& SubInt(u32x4& a, int idx) +{ + fltx4_union& a_union = (fltx4_union&)a; + return a_union.m128_u32[idx]; +} + +FORCEINLINE uint32 SubFloatConvertToInt(const fltx4& a, int idx) +{ + +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + return(vec_extract(vec_ctu(a, 0), idx)); +#else + u32x4 t = vec_ctu(a, 0); + return SubInt(t, idx); +#endif + +} + +// perform an Altivec permute op. There is no corresponding SSE op, so +// this function is missing from that fork. This is deliberate, because +// permute-based algorithms simply need to be abandoned and rebuilt +// differently way for SSE. +// (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm ) +template< typename T, typename U > +FORCEINLINE T PermuteVMX(T a, T b, U swizzleMask) +{ + return vec_perm(a, b, (vec_uchar16)swizzleMask); +} + + +// __fsel(double fComparand, double fValGE, double fLT) == fComparand >= 0 ? fValGE : fLT +// this is much faster than if ( aFloat > 0 ) { x = .. 
} +#if !defined(__SPU__) +#define fsel __fsel +#endif + +inline bool IsVector3LessThan(const fltx4& v1, const fltx4& v2) +{ + return vec_any_lt(v1, v2); +} + +inline bool IsVector3GreaterOrEqual(const fltx4& v1, const fltx4& v2) +{ + return !IsVector3LessThan(v1, v2); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD(const fltx4& a) +{ + fltx4 retVal; + SubFloat(retVal, 0) = 1.0 / sqrt(SubFloat(a, 0) != 0.0f ? SubFloat(a, 0) : FLT_EPSILON); + SubFloat(retVal, 1) = 1.0 / sqrt(SubFloat(a, 1) != 0.0f ? SubFloat(a, 1) : FLT_EPSILON); + SubFloat(retVal, 2) = 1.0 / sqrt(SubFloat(a, 2) != 0.0f ? SubFloat(a, 2) : FLT_EPSILON); + SubFloat(retVal, 3) = 1.0 / sqrt(SubFloat(a, 3) != 0.0f ? SubFloat(a, 3) : FLT_EPSILON); + return retVal; +} + +// Round towards negative infinity +FORCEINLINE fltx4 FloorSIMD(const fltx4& a) +{ + fltx4 retVal; + SubFloat(retVal, 0) = floor(SubFloat(a, 0)); + SubFloat(retVal, 1) = floor(SubFloat(a, 1)); + SubFloat(retVal, 2) = floor(SubFloat(a, 2)); + SubFloat(retVal, 3) = floor(SubFloat(a, 3)); + return retVal; +} + #elif ( defined( _X360 ) ) //--------------------------------------------------------------------- // X360 implementation //--------------------------------------------------------------------- +inline bool IsVector3LessThan(const fltx4& v1, const fltx4& v2) +{ + return !XMVector3GreaterOrEqual(v1, v2); +} + +inline BOOL IsVector3GreaterOrEqual(const fltx4& v1, const fltx4& v2) +{ + return XMVector3GreaterOrEqual(v1, v2); +} + + FORCEINLINE float& FloatSIMD(fltx4& a, int idx) { fltx4_union& a_union = (fltx4_union&)a; @@ -1142,6 +2711,22 @@ FORCEINLINE bool IsAllGreaterThanOrEq(const fltx4& a, const fltx4& b) return XMComparisonAllTrue(cr); } +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAnyGreaterThan(const fltx4& a, const fltx4& b) +{ + unsigned int cr; + XMVectorGreaterR(&cr, a, b); + return XMComparisonAnyTrue(cr); +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAnyGreaterThanOrEq(const fltx4& a, 
const fltx4& b) +{ + unsigned int cr; + XMVectorGreaterOrEqualR(&cr, a, b); + return XMComparisonAnyTrue(cr); +} + // For branching if all a.xyzw == b.xyzw FORCEINLINE bool IsAllEqual(const fltx4& a, const fltx4& b) { @@ -1183,6 +2768,13 @@ FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) | return !XMComparisonAllTrue(equalFlags); } +FORCEINLINE bool IsAnyTrue(const fltx4& a) +{ + unsigned int equalFlags = 0; + __vcmpequwR(Four_Zeros, a, &equalFlags); // compare to zero + return XMComparisonAnyFalse(equalFlags); // at least one element was not zero, eg was true +} + FORCEINLINE fltx4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? ~0:0 { return __vcmpeqfp(a, b); @@ -1220,6 +2812,18 @@ FORCEINLINE fltx4 MaskedAssign(const fltx4& ReplacementMask, const fltx4& NewVal return __vsel(OldValue, NewValue, ReplacementMask); } + +// perform an Altivec permute op. There is no corresponding SSE op, so +// this function is missing from that fork. This is deliberate, because +// permute-based algorithms simply need to be abandoned and rebuilt +// differently way for SSE. +// (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm ) +template< typename T, typename U > +FORCEINLINE T PermuteVMX(T a, T b, U swizzleMask) +{ + return __vperm(a, b, swizzleMask); +} + // AKA "Broadcast", "Splat" FORCEINLINE fltx4 ReplicateX4(float flValue) // a,a,a,a { @@ -1308,12 +2912,16 @@ FORCEINLINE fltx4 ReciprocalSIMD(const fltx4& a) // 1/a return XMVectorReciprocal(a); } -// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?) FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b { return MulSIMD(ReciprocalSIMD(b), a); } +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // Est(a/b) +{ + return MulSIMD(ReciprocalEstSIMD(b), a); +} + /// 1/x for all 4 values. 
/// 1/0 will result in a big but NOT infinite result FORCEINLINE fltx4 ReciprocalEstSaturateSIMD(const fltx4& a) @@ -1344,6 +2952,13 @@ FORCEINLINE fltx4 ExpSIMD(const fltx4& toPower) return XMVectorExp(toPower); } +// a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point", +// which is accurate to four bits of mantissa. +FORCEINLINE fltx4 Exp2EstSIMD(const fltx4& f) +{ + return XMVectorExpEst(f); +} + // Clamps the components of a vector to a specified minimum and maximum range. FORCEINLINE fltx4 ClampVectorSIMD(FLTX4 in, FLTX4 min, FLTX4 max) { @@ -1361,6 +2976,12 @@ FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) return XMLoadVector3(pSIMD); } +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + return __lvlx(pFlt, 0); +} + FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) { return *(reinterpret_cast (pSIMD)); @@ -1397,13 +3018,60 @@ FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) XMStoreVector3(pSIMD, a); } - // strongly typed -- for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) { XMStoreVector3A(pSIMD->Base(), a); } +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. +FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as + // we arrange the data properly first. + // The vrlimi ops trash the destination param which is why we require + // pass-by-copy. I'm counting on the compiler to schedule these properly. 
+ b = __vrlimi(b, b, 15, 1); // b = y1z1__x1 + c = __vrlimi(c, c, 15, 2); // c = z2__x2y2 + + a = __vrlimi(a, b, 1, 0); // a = x0y0z0x1 + b = __vrlimi(b, c, 2 | 1, 0); // b = y1z1x2y2 + c = __vrlimi(c, d, 4 | 2 | 1, 3); // c = z2x3y3z3 + + float* RESTRICT pOut = pDestination->Base(); + StoreUnalignedSIMD(pOut + 0, a); + StoreUnalignedSIMD(pOut + 4, b); + StoreUnalignedSIMD(pOut + 8, c); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (thhough obviously pDestination+1 is not). +FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector* const pDestination) +{ + // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as + // we arrange the data properly first. + // The vrlimi ops trash the destination param which is why we require + // pass-by-copy. I'm counting on the compiler to schedule these properly. + b = __vrlimi(b, b, 15, 1); // b = y1z1__x1 + c = __vrlimi(c, c, 15, 2); // c = z2__x2y2 + + a = __vrlimi(a, b, 1, 0); // a = x0y0z0x1 + b = __vrlimi(b, c, 2 | 1, 0); // b = y1z1x2y2 + c = __vrlimi(c, d, 4 | 2 | 1, 3); // c = z2x3y3z3 + + float* RESTRICT pOut = pDestination->Base(); + StoreAlignedSIMD(pOut + 0, a); + StoreAlignedSIMD(pOut + 4, b); + StoreAlignedSIMD(pOut + 8, c); +} // Fixed-point conversion and save as SIGNED INTS. 
// pDest->x = Int (vSrc.x) @@ -1504,7 +3172,78 @@ FORCEINLINE fltx4 RotateLeft2(const fltx4& a) return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 2); } +FORCEINLINE fltx4 RotateRight(const fltx4& a) +{ + fltx4 compareOne = a; + return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 3); +} +FORCEINLINE fltx4 RotateRight2(const fltx4& a) +{ + fltx4 compareOne = a; + return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 2); +} + + +// rotate a vector left by an arbitrary number of +// bits known at compile time. The bit parameter +// is template because it's actually used as an +// immediate field in an instruction, eg it absolutely +// must be known at compile time. nBits>127 leads +// to doom. +// zeroes are shifted in from the right +template < uint nBits > +FORCEINLINE fltx4 ShiftLeftByBits(const fltx4& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! + { + return LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + fltx4 t = __vsldoi(a, (LoadZeroSIMD()), (nBits >> 3)); // rotated left by octets + return ShiftLeftByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! + u32x4 shifter = u32x4(__vspltisb(((signed char)(nBits & 0x7)))); + return __vsl(a, shifter); + } +} + +// as above, but shift right +template < uint nBits > +FORCEINLINE fltx4 ShiftRightByBits(const fltx4& a) +{ + // hopefully the compiler, seeing nBits as a const immediate, elides these ifs + if (nBits >= 128) // WTF are you doing?! 
+ { + return LoadZeroSIMD(); + } + else if (nBits == 0) + { + return a; + } + else if ((nBits > 7)) // if we have to rotate by at least one byte, do the by-octet rotation first + { + fltx4 t = __vsldoi((LoadZeroSIMD()), a, 16 - (nBits >> 3)); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen. + return ShiftRightByBits< (nBits & 0x7) >(t); + } + else // we need to rotate by <= 7 bits + { + // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. + // the splat, however, does require an immediate. Go IBM! + u32x4 shifter = u32x4(__vspltisb(((signed char)(nBits & 0x7)))); + return __vsr(a, shifter); + } +} // find the lowest component of a.x, a.y, a.z, // and replicate it to the whole return value. @@ -1616,6 +3355,45 @@ FORCEINLINE void StoreUnalignedIntSIMD(int32* pSIMD, const fltx4& a) XMStoreVector4(pSIMD, a); } +// Load four consecutive uint16's, and turn them into floating point numbers. +// This function isn't especially fast and could be made faster if anyone is +// using it heavily. +FORCEINLINE fltx4 LoadAndConvertUint16SIMD(const uint16* pInts) +{ + return XMLoadUShort4(reinterpret_cast(pInts)); +} + +// a={ a.x, a.z, b.x, b.z } +// combine two fltx4s by throwing away every other field. +FORCEINLINE fltx4 CompressSIMD(fltx4 const& a, fltx4 const& b) +{ + return XMVectorPermute(a, b, XMVectorPermuteControl(0, 2, 4, 6)); +} + +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +// TODO: make more efficient by doing this in a parallel way at the caller +// Compress4SIMD(FourVectors.. 
) +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 abcd = __vrlimi(a, b, 4, 3); // a.x, b.x, a.z, a.w + abcd = __vrlimi(abcd, c, 2, 2); // ax, bx, cx, aw + abcd = __vrlimi(abcd, d, 1, 1); // ax, bx, cx, dx + + return abcd; +} + + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + // load the float into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = __lvlx(&x, 0); + fltx4 vy = __lvlx(&y, 0); + fltx4 vz = __lvlx(&z, 0); + fltx4 vw = __lvlx(&w, 0); + return Compress4SIMD(vx, vy, vz, vw); +} // Take a fltx4 containing fixed-point uints and // return them as single precision floats. No @@ -1625,7 +3403,6 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const i32x4& vSrcA) return __vcfux(vSrcA, 0); } - // Take a fltx4 containing fixed-point sints and // return them as single precision floats. No // fixed point conversion is done. 
@@ -1725,11 +3502,25 @@ FORCEINLINE void StoreAlignedSIMD(float* RESTRICT pSIMD, const fltx4& a) _mm_store_ps(pSIMD, a); } +FORCEINLINE void StoreAlignedSIMD(short* RESTRICT pSIMD, const shortx8& a) +{ + _mm_store_si128((shortx8*)pSIMD, a); +} FORCEINLINE void StoreUnalignedSIMD(float* RESTRICT pSIMD, const fltx4& a) { _mm_storeu_ps(pSIMD, a); } +FORCEINLINE void StoreUnalignedSIMD(short* RESTRICT pSIMD, const shortx8& a) +{ + _mm_storeu_si128((shortx8*)pSIMD, a); +} + +FORCEINLINE void StoreUnalignedFloat(float* pSingleFloat, const fltx4& a) +{ + _mm_store_ss(pSingleFloat, a); +} + FORCEINLINE fltx4 RotateLeft(const fltx4& a); FORCEINLINE fltx4 RotateLeft2(const fltx4& a); @@ -1741,23 +3532,61 @@ FORCEINLINE void StoreUnaligned3SIMD(float* pSIMD, const fltx4& a) _mm_store_ss(pSIMD + 2, RotateLeft2(a)); } + // strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD(VectorAligned* RESTRICT pSIMD, const fltx4& a) { StoreAlignedSIMD(pSIMD->Base(), a); } +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination[0], pDestination[1], pDestination[2], pDestination[3] +// The Vectors are assumed to be unaligned. +FORCEINLINE void StoreFourUnalignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector3D* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + +// Store the x,y,z components of the four FLTX4 parameters +// into the four consecutive Vectors: +// pDestination , pDestination + 1, pDestination + 2, pDestination + 3 +// The Vectors are assumed to start on an ALIGNED address, that is, +// pDestination is 16-byte aligned (thhough obviously pDestination+1 is not). 
+FORCEINLINE void StoreFourAlignedVector3SIMD(fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) + Vector3D* const pDestination) +{ + StoreUnaligned3SIMD(pDestination->Base(), a); + StoreUnaligned3SIMD((pDestination + 1)->Base(), b); + StoreUnaligned3SIMD((pDestination + 2)->Base(), c); + StoreUnaligned3SIMD((pDestination + 3)->Base(), d); +} + FORCEINLINE fltx4 LoadAlignedSIMD(const void* pSIMD) { return _mm_load_ps(reinterpret_cast (pSIMD)); } +FORCEINLINE shortx8 LoadAlignedShortSIMD(const void* pSIMD) +{ + return _mm_load_si128(reinterpret_cast (pSIMD)); +} + +FORCEINLINE shortx8 LoadUnalignedShortSIMD(const void* pSIMD) +{ + return _mm_loadu_si128(reinterpret_cast (pSIMD)); +} + FORCEINLINE fltx4 AndSIMD(const fltx4& a, const fltx4& b) // a & b { return _mm_and_ps(a, b); } -FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const fltx4& b) // ~a & b +FORCEINLINE fltx4 AndNotSIMD(const fltx4& a, const fltx4& b) // a & ~b { return _mm_andnot_ps(a, b); } @@ -1795,6 +3624,12 @@ FORCEINLINE fltx4 LoadUnaligned3SIMD(const void* pSIMD) return _mm_loadu_ps(reinterpret_cast(pSIMD)); } +// load a single unaligned float into the x component of a SIMD word +FORCEINLINE fltx4 LoadUnalignedFloatSIMD(const float* pFlt) +{ + return _mm_load_ss(pFlt); +} + /// replicate a single 32 bit integer value to all 4 components of an m128 FORCEINLINE fltx4 ReplicateIX4(int i) { @@ -1809,6 +3644,11 @@ FORCEINLINE fltx4 ReplicateX4(float flValue) return _mm_shuffle_ps(value, value, 0); } +FORCEINLINE fltx4 ReplicateX4(const float* flValue) +{ + __m128 value = _mm_set_ss(*flValue); + return _mm_shuffle_ps(value, value, 0); +} FORCEINLINE float SubFloat(const fltx4& a, int idx) { @@ -1893,9 +3733,27 @@ FORCEINLINE fltx4 SplatZSIMD(fltx4 const& a) FORCEINLINE fltx4 SplatWSIMD(fltx4 const& a) { - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)); + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(3, 3, 3, 3)); } +FORCEINLINE fltx4 ShuffleXXYY(const fltx4& a) +{ + return 
_mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 0, 1, 1)); +} + +FORCEINLINE fltx4 ShuffleXYXY(const fltx4& a) +{ + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 1, 0, 1)); +} + +FORCEINLINE fltx4 ShuffleZZWW(const fltx4& a) +{ + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 2, 3, 3)); +} + + + + FORCEINLINE fltx4 SetXSIMD(const fltx4& a, const fltx4& x) { fltx4 result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[0]), x, a); @@ -1942,20 +3800,19 @@ FORCEINLINE fltx4 RotateLeft2(const fltx4& a) // a b c d -> d a b c FORCEINLINE fltx4 RotateRight(const fltx4& a) { - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(3, 0, 1, 2)); } // a b c d -> c d a b FORCEINLINE fltx4 RotateRight2(const fltx4& a) { - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 3, 0, 1)); } - FORCEINLINE fltx4 AddSIMD(const fltx4& a, const fltx4& b) // a+b { return _mm_add_ps(a, b); -}; +} FORCEINLINE fltx4 SubSIMD(const fltx4& a, const fltx4& b) // a-b { @@ -1972,6 +3829,12 @@ FORCEINLINE fltx4 DivSIMD(const fltx4& a, const fltx4& b) // a/b return _mm_div_ps(a, b); }; +fltx4 ReciprocalEstSIMD(const fltx4& a); +FORCEINLINE fltx4 DivEstSIMD(const fltx4& a, const fltx4& b) // Est(a/b) +{ + return MulSIMD(ReciprocalEstSIMD(b), a); +}; + FORCEINLINE fltx4 MaddSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // a*b + c { return AddSIMD(MulSIMD(a, b), c); @@ -1985,15 +3848,17 @@ FORCEINLINE fltx4 MsubSIMD(const fltx4& a, const fltx4& b, const fltx4& c) // FORCEINLINE fltx4 Dot3SIMD(const fltx4& a, const fltx4& b) { fltx4 m = MulSIMD(a, b); - float flDot = SubFloat(m, 0) + SubFloat(m, 1) + SubFloat(m, 2); - return ReplicateX4(flDot); + return AddSIMD(AddSIMD(SplatXSIMD(m), SplatYSIMD(m)), SplatZSIMD(m)); } FORCEINLINE fltx4 Dot4SIMD(const fltx4& a, const fltx4& b) { - fltx4 m = MulSIMD(a, b); - float flDot = SubFloat(m, 0) + SubFloat(m, 1) + SubFloat(m, 2) + SubFloat(m, 3); - return 
ReplicateX4(flDot); + // 4 instructions, serial, order of addition varies so individual elements my differ in the LSB on some CPUs + fltx4 fl4Product = MulSIMD(a, b); + fltx4 fl4YXWZ = _mm_shuffle_ps(fl4Product, fl4Product, MM_SHUFFLE_REV(1, 0, 3, 2)); + fltx4 fl4UUVV = AddSIMD(fl4Product, fl4YXWZ); // U = X+Y; V = Z+W + fltx4 fl4VVUU = RotateLeft2(fl4UUVV); + return AddSIMD(fl4UUVV, fl4VVUU); } //TODO: implement as four-way Taylor series (see xbox implementation) @@ -2072,6 +3937,11 @@ FORCEINLINE bool IsAnyNegative(const fltx4& a) // (a.x < 0) || (a.y < 0) | return (0 != TestSignSIMD(a)); } +FORCEINLINE bool IsAnyTrue(const fltx4& a) +{ + return (0 != TestSignSIMD(a)); +} + FORCEINLINE fltx4 CmpEqSIMD(const fltx4& a, const fltx4& b) // (a==b) ? ~0:0 { return _mm_cmpeq_ps(a, b); @@ -2151,7 +4021,9 @@ FORCEINLINE fltx4 CeilSIMD(const fltx4& a) } +fltx4 AbsSIMD(const fltx4& x); // To make it more coherent with the whole API (the whole SIMD API is postfixed with SIMD except a couple of methods. Well...) fltx4 fabs(const fltx4& x); + // Round towards negative infinity // This is the implementation that was here before; it assumes // you are in round-to-floor mode, which I guess is usually the @@ -2167,6 +4039,11 @@ FORCEINLINE fltx4 FloorSIMD(const fltx4& val) +FORCEINLINE bool IsAnyZeros(const fltx4& a) // any floats are zero? +{ + return TestSignSIMD(CmpEqSIMD(a, Four_Zeros)) != 0; +} + inline bool IsAllZeros(const fltx4& var) { return TestSignSIMD(CmpEqSIMD(var, Four_Zeros)) == 0xF; @@ -2298,6 +4175,20 @@ FORCEINLINE fltx4 FindHighestSIMD3(const fltx4& a) } + +inline bool IsVector3LessThan(const fltx4& v1, const fltx4& v2) +{ + bi32x4 isOut = CmpLtSIMD(v1, v2); + return IsAnyNegative(isOut); +} + +inline bool IsVector4LessThan(const fltx4& v1, const fltx4& v2) +{ + bi32x4 isOut = CmpLtSIMD(v1, v2); + return IsAnyNegative(isOut); +} + + // ------------------------------------ // INTEGER SIMD OPERATIONS. 
// ------------------------------------ @@ -2345,6 +4236,61 @@ FORCEINLINE void StoreUnalignedIntSIMD(int32* RESTRICT pSIMD, const fltx4& a) _mm_storeu_ps(reinterpret_cast(pSIMD), a); } +// a={ a.x, a.z, b.x, b.z } +// combine two fltx4s by throwing away every other field. +FORCEINLINE fltx4 CompressSIMD(fltx4 const& a, fltx4 const& b) +{ + return _mm_shuffle_ps(a, b, MM_SHUFFLE_REV(0, 2, 0, 2)); +} + +// Load four consecutive uint16's, and turn them into floating point numbers. +// This function isn't especially fast and could be made faster if anyone is +// using it heavily. +FORCEINLINE fltx4 LoadAndConvertUint16SIMD(const uint16* pInts) +{ +#ifdef POSIX + fltx4 retval; + SubFloat(retval, 0) = pInts[0]; + SubFloat(retval, 1) = pInts[1]; + SubFloat(retval, 2) = pInts[2]; + SubFloat(retval, 3) = pInts[3]; + return retval; +#else + __m128i inA = _mm_loadl_epi64((__m128i const*) pInts); // Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result. 
+ inA = _mm_unpacklo_epi16(inA, _mm_setzero_si128()); // unpack unsigned 16's to signed 32's + return _mm_cvtepi32_ps(inA); +#endif +} + + +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 aacc = _mm_shuffle_ps(a, c, MM_SHUFFLE_REV(0, 0, 0, 0)); + fltx4 bbdd = _mm_shuffle_ps(b, d, MM_SHUFFLE_REV(0, 0, 0, 0)); + return MaskedAssign(LoadAlignedSIMD(g_SIMD_EveryOtherMask), bbdd, aacc); +} + +// outa={a.x, a.x, a.y, a.y}, outb = a.z, a.z, a.w, a.w } +FORCEINLINE void ExpandSIMD(fltx4 const& a, fltx4& fl4OutA, fltx4& fl4OutB) +{ + fl4OutA = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 0, 1, 1)); + fl4OutB = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 2, 3, 3)); + +} + + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + // load the float into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = _mm_load_ss(&x); + fltx4 vy = _mm_load_ss(&y); + fltx4 vz = _mm_load_ss(&z); + fltx4 vw = _mm_load_ss(&w); + return Compress4SIMD(vx, vy, vz, vw); +} // CHRISG: the conversion functions all seem to operate on m64's only... // how do we make them work here? @@ -2362,7 +4308,20 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const u32x4& vSrcA) return retval; } +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4& vSrcA) +{ + return _mm_cvtepi32_ps((const __m128i&)vSrcA); +} +FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const shortx8& vSrcA) +{ + return _mm_cvtepi32_ps(vSrcA); +} + +#if 0 // Take a fltx4 containing fixed-point sints and // return them as single precision floats. No // fixed point conversion is done. 
@@ -2376,6 +4335,8 @@ FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4& vSrcA) return retval; } +#endif + /* works on fltx4's as if they are four uints. the first parameter contains the words to be shifted, @@ -2407,13 +4368,11 @@ FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4& vSrcA, const i32x4& vSrcB) // like this. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4* RESTRICT pDest, const fltx4& vSrc) { -#if defined( COMPILER_MSVC64 ) - - (*pDest)[0] = SubFloat(vSrc, 0); - (*pDest)[1] = SubFloat(vSrc, 1); - (*pDest)[2] = SubFloat(vSrc, 2); - (*pDest)[3] = SubFloat(vSrc, 3); - +#if defined(_MSC_VER) && _MSC_VER >= 1900 && defined(COMPILER_MSVC64) + (*pDest)[0] = (int)SubFloat(vSrc, 0); + (*pDest)[1] = (int)SubFloat(vSrc, 1); + (*pDest)[2] = (int)SubFloat(vSrc, 2); + (*pDest)[3] = (int)SubFloat(vSrc, 3); #else __m64 bottom = _mm_cvttps_pi32(vSrc); __m64 top = _mm_cvttps_pi32(_mm_movehl_ps(vSrc, vSrc)); @@ -2429,8 +4388,179 @@ FORCEINLINE void ConvertStoreAsIntsSIMD(intx4* RESTRICT pDest, const fltx4& vSrc #endif +// a={a.y, a.z, a.w, b.x } b={b.y, b.z, b.w, b.x } +FORCEINLINE void RotateLeftDoubleSIMD(fltx4& a, fltx4& b) +{ + a = SetWSIMD(RotateLeft(a), SplatXSIMD(b)); + b = RotateLeft(b); +} +// // Some convenience operator overloads, which are just aliasing the functions above. +// Unneccessary on 360, as you already have them from xboxmath.h (same for PS3 PPU and SPU) +#if !defined(PLATFORM_PPC) && !defined( POSIX ) && !defined(SPU) +#if 1 // TODO: verify generation of non-bad code. +// Componentwise add +FORCEINLINE fltx4 operator+(FLTX4 a, FLTX4 b) +{ + return AddSIMD(a, b); +} + +// Componentwise subtract +FORCEINLINE fltx4 operator-(FLTX4 a, FLTX4 b) +{ + return SubSIMD(a, b); +} + +// Componentwise multiply +FORCEINLINE fltx4 operator*(FLTX4 a, FLTX4 b) +{ + return MulSIMD(a, b); +} + +// No divide. You need to think carefully about whether you want a reciprocal +// or a reciprocal estimate. 
+ +// bitwise and +FORCEINLINE fltx4 operator&(FLTX4 a, FLTX4 b) +{ + return AndSIMD(a, b); +} + +// bitwise or +FORCEINLINE fltx4 operator|(FLTX4 a, FLTX4 b) +{ + return OrSIMD(a, b); +} + +// bitwise xor +FORCEINLINE fltx4 operator^(FLTX4 a, FLTX4 b) +{ + return XorSIMD(a, b); +} + +// unary negate +FORCEINLINE fltx4 operator-(FLTX4 a) +{ + return NegSIMD(a); +} +#endif // 0 +#endif + +#if defined(_X360) || defined(_PS3) +FORCEINLINE fltx4 VectorMergeHighSIMD(fltx4 fl4SrcA, fltx4 fl4SrcB) +{ +#if defined( _X360 ) + return __vmrghw(fl4SrcA, fl4SrcB); +#else + return vec_mergeh(fl4SrcA, fl4SrcB); +#endif +} + +FORCEINLINE fltx4 VectorMergeLowSIMD(fltx4 fl4SrcA, fltx4 fl4SrcB) +{ +#if defined( _X360 ) + return __vmrglw(fl4SrcA, fl4SrcB); +#else + return vec_mergel(fl4SrcA, fl4SrcB); +#endif +} +#endif + +#ifndef SPU +// fourplanes_t, Frustrum_t are not supported on SPU +// It would make sense to support FourVectors on SPU at some point. + +struct ALIGN16 fourplanes_t +{ + fltx4 nX; + fltx4 nY; + fltx4 nZ; + fltx4 dist; + bi32x4 xSign; + bi32x4 ySign; + bi32x4 zSign; + fltx4 nXAbs; + fltx4 nYAbs; + fltx4 nZAbs; + + void ComputeSignbits(); + + // fast SIMD loads + void Set4Planes(const VPlane* pPlanes); + void Set2Planes(const VPlane* pPlanes); + void Get4Planes(VPlane* pPlanesOut) const; + void Get2Planes(VPlane* pPlanesOut) const; + // not-SIMD, much slower + void GetPlane(int index, Vector3D* pNormal, float* pDist) const; + void SetPlane(int index, const Vector3D& vecNormal, float planeDist); +}; + +class ALIGN16 Frustum_t +{ +public: + Frustum_t(); + void SetPlane(int i, const Vector3D& vecNormal, float dist); + void GetPlane(int i, Vector3D* pNormalOut, float* pDistOut) const; + void SetPlanes(const VPlane* pPlanes); + void GetPlanes(VPlane* pPlanesOut) const; + // returns false if the box is within the frustum, true if it is outside + bool CullBox(const Vector3D& mins, const Vector3D& maxs) const; + bool CullBoxCenterExtents(const Vector3D& center, const 
Vector3D& extents) const; + + bool CullBox(const fltx4& fl4Mins, const fltx4& fl4Maxs) const; + bool CullBoxCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const; + + + // Return true if frustum contains this bounding volume, false if any corner is outside + bool Contains(const Vector3D& mins, const Vector3D& maxs) const; + + // Return true if this frustum intersects the frustum, false if it is outside + bool Intersects(Frustum_t& otherFrustum) const; + + // Return true if this bounding volume intersects the frustum, false if it is outside + bool Intersects(const Vector3D& mins, const Vector3D& maxs) const; + bool IntersectsCenterExtents(const Vector3D& center, const Vector3D& extents) const; + + bool Intersects(const fltx4& fl4Mins, const fltx4& fl4Maxs) const; + bool IntersectsCenterExtents(const fltx4& fl4Center, const fltx4& fl4Extents) const; + + + void CreatePerspectiveFrustum(const Vector3D& origin, const Vector3D& forward, + const Vector3D& right, const Vector3D& up, float flZNear, float flZFar, + float flFovX, float flAspect); + + void CreatePerspectiveFrustumFLU(const Vector3D& vOrigin, const Vector3D& vForward, + const Vector3D& vLeft, const Vector3D& vUp, float flZNear, float flZFar, + float flFovX, float flAspect); + + // Version that accepts angles instead of vectors + void CreatePerspectiveFrustum(const Vector3D& origin, const QAngle& angles, float flZNear, + float flZFar, float flFovX, float flAspectRatio); + + // Generate a frustum based on orthographic parameters + void CreateOrthoFrustum(const Vector3D& origin, const Vector3D& forward, const Vector3D& right, const Vector3D& up, + float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar); + + void CreateOrthoFrustumFLU(const Vector3D& vOrigin, const Vector3D& vForward, const Vector3D& vLeft, const Vector3D& vUp, + float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar); + + // The points returned correspond to the corners 
of the frustum faces + // Points 0 to 3 correspond to the near face + // Points 4 to 7 correspond to the far face + // Returns points in a face in this order: + // 2--3 + // | | + // 0--1 + // Returns false if a corner couldn't be generated for some reason. + bool GetCorners(Vector3D* pPoints) const; + + fourplanes_t planes[2]; +}; + +#endif + +class FourQuaternions; /// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are /// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. class ALIGN16 FourVectors @@ -2438,6 +4568,76 @@ class ALIGN16 FourVectors public: fltx4 x, y, z; + FourVectors(void) + { + } + + FourVectors(FourVectors const& src) + { + x = src.x; + y = src.y; + z = src.z; + } + + explicit FORCEINLINE FourVectors(float a) + { + fltx4 aReplicated = ReplicateX4(a); + x = y = z = aReplicated; + } + + FORCEINLINE void Init(void) + { + x = Four_Zeros; + y = Four_Zeros; + z = Four_Zeros; + } + + FORCEINLINE void Init(float flX, float flY, float flZ) + { + x = ReplicateX4(flX); + y = ReplicateX4(flY); + z = ReplicateX4(flZ); + } + + FORCEINLINE FourVectors(float flX, float flY, float flZ) + { + Init(flX, flY, flZ); + } + + FORCEINLINE void Init(fltx4 const& fl4X, fltx4 const& fl4Y, fltx4 const& fl4Z) + { + x = fl4X; + y = fl4Y; + z = fl4Z; + } + + FORCEINLINE FourVectors(fltx4 const& fl4X, fltx4 const& fl4Y, fltx4 const& fl4Z) + { + Init(fl4X, fl4Y, fl4Z); + } + + + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(Vector3D const& a, Vector3D const& b, Vector3D const& c, Vector3D const& d) + { + LoadAndSwizzle(a, b, c, d); + } + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(VectorAligned const& a, VectorAligned const& b, VectorAligned const& c, VectorAligned const& d) + { + LoadAndSwizzleAligned(a, b, c, d); + } + + // construct from twelve floats; really only useful for static const constructors. 
+ // input arrays must be aligned, and in the fourvectors' native format + // (eg in xxxx,yyyy,zzzz form) + // each pointer should be to an aligned array of four floats + FORCEINLINE FourVectors(const float* xs, const float* ys, const float* zs) : + x(LoadAlignedSIMD(xs)), y(LoadAlignedSIMD(ys)), z(LoadAlignedSIMD(zs)) + {}; + FORCEINLINE void DuplicateVector(Vector3D const& v) //< set all 4 vectors to the same vector value { x = ReplicateX4(v.x); @@ -2505,6 +4705,25 @@ public: return dot; } + FORCEINLINE FourVectors operator*(float b) const //< scale + { + fltx4 scalepacked = ReplicateX4(b); + FourVectors res; + res.x = MulSIMD(x, scalepacked); + res.y = MulSIMD(y, scalepacked); + res.z = MulSIMD(z, scalepacked); + return res; + } + + FORCEINLINE FourVectors operator*(FLTX4 fl4Scale) const //< scale + { + FourVectors res; + res.x = MulSIMD(x, fl4Scale); + res.y = MulSIMD(y, fl4Scale); + res.z = MulSIMD(z, fl4Scale); + return res; + } + FORCEINLINE void VProduct(FourVectors const& b) //< component by component mul { x = MulSIMD(x, b.x); @@ -2529,12 +4748,18 @@ public: // If you have a long list of FourVectors structures that you all want // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. inline void RotateBy(const matrix3x4_t& matrix); + /***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function: + // rotate these vectors ( in place ) by the corresponding quaternions: + inline void RotateBy( const FourQuaternions &quats ); + ******/ /// You can use this to rotate a long array of FourVectors all by the same /// matrix. The first parameter is the head of the array. The second is the /// number of vectors to rotate. The third is the matrix. 
static void RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix); + static void RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut); + /// Assume the vectors are points, and transform them in place by the matrix. inline void TransformBy(const matrix3x4_t& matrix); @@ -2552,6 +4777,9 @@ public: /// This is an in-place transformation. static void TransformManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix); + static void CalcClosestPointOnLineSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vClosest, fltx4* outT = 0); + static fltx4 CalcClosestPointToLineTSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vDir); + // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector. FORCEINLINE const float& X(int idx) const { @@ -2589,17 +4817,6 @@ public: return Vector3D(X(idx), Y(idx), Z(idx)); } - FourVectors(void) - { - } - - FourVectors(FourVectors const& src) - { - x = src.x; - y = src.y; - z = src.z; - } - FORCEINLINE void operator=(FourVectors const& src) { x = src.x; @@ -2612,19 +4829,19 @@ public: { // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360 // use an unfolded implementation here -#if _X360 +#if defined( _X360 ) || defined(_PS3) fltx4 tx = LoadUnalignedSIMD(&a.x); fltx4 ty = LoadUnalignedSIMD(&b.x); fltx4 tz = LoadUnalignedSIMD(&c.x); fltx4 tw = LoadUnalignedSIMD(&d.x); - fltx4 r0 = __vmrghw(tx, tz); - fltx4 r1 = __vmrghw(ty, tw); - fltx4 r2 = __vmrglw(tx, tz); - fltx4 r3 = __vmrglw(ty, tw); + fltx4 r0 = VectorMergeHighSIMD(tx, tz); + fltx4 r1 = VectorMergeHighSIMD(ty, tw); + fltx4 r2 = VectorMergeLowSIMD(tx, tz); + fltx4 r3 = VectorMergeLowSIMD(ty, tw); - x = __vmrghw(r0, r1); - y = __vmrglw(r0, r1); - z = __vmrghw(r2, r3); + x = 
VectorMergeHighSIMD(r0, r1); + y = VectorMergeLowSIMD(r0, r1); + z = VectorMergeHighSIMD(r2, r3); #else x = LoadUnalignedSIMD(&(a.x)); y = LoadUnalignedSIMD(&(b.x)); @@ -2639,23 +4856,87 @@ public: #endif } + FORCEINLINE void LoadAndSwizzle(Vector3D const& a) + { + LoadAndSwizzle(a, a, a, a); + } + + // Broadcasts a, b, c, and d into the four vectors + // This is only performant if the floats are ALREADY IN MEMORY + // and not on registers -- eg, + // .Load( &fltArrray[0], &fltArrray[1], &fltArrray[2], &fltArrray[3] ) is okay, + // .Load( fltArrray[0] * 0.5f, fltArrray[1] * 0.5f, fltArrray[2] * 0.5f, fltArrray[3] * 0.5f ) is not. + FORCEINLINE void Load(const float& a, const float& b, const float& c, const float& d) + { +#if defined( _X360 ) || defined( _PS3 ) + fltx4 temp[4]; + temp[0] = LoadUnalignedFloatSIMD(&a); + temp[1] = LoadUnalignedFloatSIMD(&b); + temp[2] = LoadUnalignedFloatSIMD(&c); + temp[3] = LoadUnalignedFloatSIMD(&d); + y = VectorMergeHighSIMD(temp[0], temp[2]); // ac__ + z = VectorMergeHighSIMD(temp[1], temp[3]); // bd__ + + x = VectorMergeHighSIMD(y, z); // abcd + y = x; + z = x; +#else + ALIGN16 float temp[4]; + temp[0] = a; temp[1] = b; temp[2] = c; temp[3] = d; + fltx4 v = LoadAlignedSIMD(temp); + x = v; + y = v; + z = v; +#endif + } + + // transform four horizontal vectors into the internal vertical ones + FORCEINLINE void LoadAndSwizzle(FLTX4 a, FLTX4 b, FLTX4 c, FLTX4 d) + { +#if defined( _X360 ) || defined( _PS3 ) + fltx4 tx = a; + fltx4 ty = b; + fltx4 tz = c; + fltx4 tw = d; + fltx4 r0 = VectorMergeHighSIMD(tx, tz); + fltx4 r1 = VectorMergeHighSIMD(ty, tw); + fltx4 r2 = VectorMergeLowSIMD(tx, tz); + fltx4 r3 = VectorMergeLowSIMD(ty, tw); + + x = VectorMergeHighSIMD(r0, r1); + y = VectorMergeLowSIMD(r0, r1); + z = VectorMergeHighSIMD(r2, r3); +#else + x = a; + y = b; + z = c; + fltx4 w = d; + // now, matrix is: + // x y z ? + // x y z ? + // x y z ? + // x y z ? 
+ TransposeSIMD(x, y, z, w); +#endif + } + /// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op. /// all 4 vectors must be 128 bit boundary FORCEINLINE void LoadAndSwizzleAligned(const float* RESTRICT a, const float* RESTRICT b, const float* RESTRICT c, const float* RESTRICT d) { -#if _X360 +#if defined( _X360 ) || defined( _PS3 ) fltx4 tx = LoadAlignedSIMD(a); fltx4 ty = LoadAlignedSIMD(b); fltx4 tz = LoadAlignedSIMD(c); fltx4 tw = LoadAlignedSIMD(d); - fltx4 r0 = __vmrghw(tx, tz); - fltx4 r1 = __vmrghw(ty, tw); - fltx4 r2 = __vmrglw(tx, tz); - fltx4 r3 = __vmrglw(ty, tw); + fltx4 r0 = VectorMergeHighSIMD(tx, tz); + fltx4 r1 = VectorMergeHighSIMD(ty, tw); + fltx4 r2 = VectorMergeLowSIMD(tx, tz); + fltx4 r3 = VectorMergeLowSIMD(ty, tw); - x = __vmrghw(r0, r1); - y = __vmrglw(r0, r1); - z = __vmrghw(r2, r3); + x = VectorMergeHighSIMD(r0, r1); + y = VectorMergeLowSIMD(r0, r1); + z = VectorMergeHighSIMD(r2, r3); #else x = LoadAlignedSIMD(a); y = LoadAlignedSIMD(b); @@ -2675,6 +4956,81 @@ public: LoadAndSwizzleAligned(&a.x, &b.x, &c.x, &d.x); } + /// Unpack a FourVectors back into four horizontal fltx4s. + /// Since the FourVectors doesn't store a w row, you can optionally + /// specify your own; otherwise it will be 0. + /// This function ABSOLUTELY MUST be inlined or the reference parameters will + /// induce a severe load-hit-store. 
+ FORCEINLINE void TransposeOnto(fltx4& out0, fltx4& out1, fltx4& out2, fltx4& out3, FLTX4 w = Four_Zeros) const + { + // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360 + // use an unfolded implementation here +#if defined( _X360 ) || defined(_PS3) + fltx4 r0 = VectorMergeHighSIMD(x, z); + fltx4 r1 = VectorMergeHighSIMD(y, w); + fltx4 r2 = VectorMergeLowSIMD(x, z); + fltx4 r3 = VectorMergeLowSIMD(y, w); + + out0 = VectorMergeHighSIMD(r0, r1); + out1 = VectorMergeLowSIMD(r0, r1); + out2 = VectorMergeHighSIMD(r2, r3); + out3 = VectorMergeLowSIMD(r2, r3); +#else + out0 = x; + out1 = y; + out2 = z; + out3 = w; + + TransposeSIMD(out0, out1, out2, out3); +#endif + } + +#if !defined(__SPU__) + /// Store a FourVectors into four NON-CONTIGUOUS Vector*'s. + FORCEINLINE void StoreUnalignedVector3SIMD(Vector3D* RESTRICT out0, Vector3D* RESTRICT out1, Vector3D* RESTRICT out2, Vector3D* RESTRICT out3) const; +#endif + + /// Store a FourVectors into four NON-CONTIGUOUS VectorAligned s. + FORCEINLINE void StoreAlignedVectorSIMD(VectorAligned* RESTRICT out0, VectorAligned* RESTRICT out1, VectorAligned* RESTRICT out2, VectorAligned* RESTRICT out3) const; + +#if !defined(__SPU__) + /// Store a FourVectors into four CONSECUTIVE Vectors in memory, + /// where the first vector IS NOT aligned on a 16-byte boundary. + FORCEINLINE void StoreUnalignedContigVector3SIMD(Vector3D* RESTRICT pDestination) + { + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreFourUnalignedVector3SIMD(a, b, c, d, pDestination); + } +#endif + + /// Store a FourVectors into four CONSECUTIVE Vectors in memory, + /// where the first vector IS aligned on a 16-byte boundary. 
+ /// (since four Vector3s = 48 bytes, groups of four can be said + /// to be 16-byte aligned though obviously the 2nd, 3d, and 4th + /// vectors in the group individually are not) +#if !defined(__SPU__) + FORCEINLINE void StoreAlignedContigVector3SIMD(Vector3D* RESTRICT pDestination) + { + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreFourAlignedVector3SIMD(a, b, c, d, pDestination); + } + + /// Store a FourVectors into four CONSECUTIVE VectorAligneds in memory + FORCEINLINE void StoreAlignedContigVectorASIMD(VectorAligned* RESTRICT pDestination) + { + StoreAlignedVectorSIMD(pDestination, pDestination + 1, pDestination + 2, pDestination + 3); + } +#endif + + /// return the squared length of all 4 vectors, the same name as used on Vector + FORCEINLINE fltx4 LengthSqr(void) const + { + const FourVectors& a = *this; + return a * a; + } + /// return the squared length of all 4 vectors FORCEINLINE fltx4 length2(void) const { @@ -2687,6 +5043,13 @@ public: return SqrtEstSIMD(length2()); } + /// full precision square root. upper/lower case name is an artifact - the lower case one should be changed to refelct the lower accuracy. I added the mixed case one for compat with Vector + FORCEINLINE fltx4 Length(void) const + { + return SqrtSIMD(length2()); + } + + /// normalize all 4 vectors in place. 
not mega-accurate (uses reciprocal approximation instruction) FORCEINLINE void VectorNormalizeFast(void) { @@ -2701,18 +5064,6 @@ public: (*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2)) } - /// construct a FourVectors from 4 separate Vectors - FORCEINLINE FourVectors(Vector3D const& a, Vector3D const& b, Vector3D const& c, Vector3D const& d) - { - LoadAndSwizzle(a, b, c, d); - } - - /// construct a FourVectors from 4 separate Vectors - FORCEINLINE FourVectors(VectorAligned const& a, VectorAligned const& b, VectorAligned const& c, VectorAligned const& d) - { - LoadAndSwizzleAligned(a, b, c, d); - } - FORCEINLINE fltx4 DistToSqr(FourVectors const& pnt) { fltx4 fl4dX = SubSIMD(pnt.x, x); @@ -2748,9 +5099,97 @@ public: lineDelta *= fl4T; return v4OurPnt.DistToSqr(lineDelta); } + FORCEINLINE FourVectors Normalized()const + { + fltx4 fl4LengthInv = ReciprocalSqrtSIMD(LengthSqr()); + FourVectors out; + out.x = x * fl4LengthInv; + out.y = y * fl4LengthInv; + out.z = z * fl4LengthInv; + return out; + } + FORCEINLINE FourVectors NormalizedSafeX() const + { + fltx4 f4LenSqr = LengthSqr(); + fltx4 isBigEnough = CmpGeSIMD(f4LenSqr, Four_Epsilons); + fltx4 fl4LengthInv = ReciprocalSqrtSIMD(f4LenSqr); + FourVectors out; + out.x = MaskedAssign(isBigEnough, x * fl4LengthInv, Four_Ones); + out.y = AndSIMD(y * fl4LengthInv, isBigEnough); + out.z = AndSIMD(z * fl4LengthInv, isBigEnough); + return out; + } + FORCEINLINE FourVectors NormalizedSafeY() const + { + fltx4 f4LenSqr = LengthSqr(); + fltx4 isBigEnough = CmpGeSIMD(f4LenSqr, Four_Epsilons); + fltx4 fl4LengthInv = ReciprocalSqrtSIMD(f4LenSqr); + FourVectors out; + out.x = AndSIMD(x * fl4LengthInv, isBigEnough); + out.y = MaskedAssign(isBigEnough, y * fl4LengthInv, Four_Ones); + out.z = AndSIMD(z * fl4LengthInv, isBigEnough); + return out; + } + + FORCEINLINE FourVectors NormalizedSafeZ() const + { + fltx4 f4LenSqr = LengthSqr(); + fltx4 isBigEnough = CmpGeSIMD(f4LenSqr, Four_Epsilons); + fltx4 fl4LengthInv = 
ReciprocalSqrtSIMD(f4LenSqr); + FourVectors out; + out.x = AndSIMD(x * fl4LengthInv, isBigEnough); + out.y = AndSIMD(y * fl4LengthInv, isBigEnough); + out.z = MaskedAssign(isBigEnough, z * fl4LengthInv, Four_Ones); + return out; + } }; + +inline FourVectors CrossProduct(const FourVectors& a, const FourVectors& b) +{ + return FourVectors(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +} + +inline fltx4 DotProduct(const FourVectors& a, const FourVectors& b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +inline FourVectors operator * (fltx4 left, const FourVectors& right) +{ + return right * left; +} + + +// +inline FourVectors Mul(const FourVectors& a, const fltx4& b) +{ + FourVectors ret; + ret.x = MulSIMD(a.x, b); + ret.y = MulSIMD(a.y, b); + ret.z = MulSIMD(a.z, b); + return ret; +} + +inline FourVectors Mul(const FourVectors& a, const FourVectors& b) +{ + FourVectors ret; + ret.x = MulSIMD(a.x, b.x); + ret.y = MulSIMD(a.y, b.y); + ret.z = MulSIMD(a.z, b.z); + return ret; +} + +inline FourVectors Madd(const FourVectors& a, const fltx4& b, const FourVectors& c) // a*b + c +{ + FourVectors ret; + ret.x = MaddSIMD(a.x, b, c.x); + ret.y = MaddSIMD(a.y, b, c.y); + ret.z = MaddSIMD(a.z, b, c.z); + return ret; +} + /// form 4 cross products inline FourVectors operator ^(const FourVectors& a, const FourVectors& b) { @@ -2761,6 +5200,24 @@ inline FourVectors operator ^(const FourVectors& a, const FourVectors& b) return ret; } +inline FourVectors operator-(const FourVectors& a, const FourVectors& b) +{ + FourVectors ret; + ret.x = SubSIMD(a.x, b.x); + ret.y = SubSIMD(a.y, b.y); + ret.z = SubSIMD(a.z, b.z); + return ret; +} + +inline FourVectors operator+(const FourVectors& a, const FourVectors& b) +{ + FourVectors ret; + ret.x = AddSIMD(a.x, b.x); + ret.y = AddSIMD(a.y, b.y); + ret.z = AddSIMD(a.z, b.z); + return ret; +} + /// component-by-componentwise MAX operator inline FourVectors maximum(const FourVectors& a, const FourVectors& b) { @@ 
-2781,6 +5238,32 @@ inline FourVectors minimum(const FourVectors& a, const FourVectors& b) return ret; } +FORCEINLINE FourVectors RotateLeft(const FourVectors& src) +{ + FourVectors ret; + ret.x = RotateLeft(src.x); + ret.y = RotateLeft(src.y); + ret.z = RotateLeft(src.z); + return ret; +} + +FORCEINLINE FourVectors RotateRight(const FourVectors& src) +{ + FourVectors ret; + ret.x = RotateRight(src.x); + ret.y = RotateRight(src.y); + ret.z = RotateRight(src.z); + return ret; +} +FORCEINLINE FourVectors MaskedAssign(const bi32x4& ReplacementMask, const FourVectors& NewValue, const FourVectors& OldValue) +{ + FourVectors ret; + ret.x = MaskedAssign(ReplacementMask, NewValue.x, OldValue.x); + ret.y = MaskedAssign(ReplacementMask, NewValue.y, OldValue.y); + ret.z = MaskedAssign(ReplacementMask, NewValue.z, OldValue.z); + return ret; +} + /// calculate reflection vector. incident and normal dir assumed normalized FORCEINLINE FourVectors VectorReflect(const FourVectors& incident, const FourVectors& normal) { @@ -2804,7 +5287,77 @@ FORCEINLINE FourVectors VectorSlide(const FourVectors& incident, const FourVecto return ret; } +/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction) +FORCEINLINE FourVectors VectorNormalizeFast(const FourVectors& src) +{ + fltx4 mag_sq = ReciprocalSqrtEstSIMD(src * src); // *(1.0/sqrt(length^2)) + FourVectors result; + result.x = MulSIMD(src.x, mag_sq); + result.y = MulSIMD(src.y, mag_sq); + result.z = MulSIMD(src.z, mag_sq); + return result; +} +#if !defined(__SPU__) +/// Store a FourVectors into four NON-CONTIGUOUS Vector*'s. 
+FORCEINLINE void FourVectors::StoreUnalignedVector3SIMD(Vector3D* RESTRICT out0, Vector3D* RESTRICT out1, Vector3D* RESTRICT out2, Vector3D* RESTRICT out3) const +{ +#ifdef _X360 + fltx4 x0, x1, x2, x3, y0, y1, y2, y3, z0, z1, z2, z3; + x0 = SplatXSIMD(x); // all x0x0x0x0 + x1 = SplatYSIMD(x); + x2 = SplatZSIMD(x); + x3 = SplatWSIMD(x); + + y0 = SplatXSIMD(y); + y1 = SplatYSIMD(y); + y2 = SplatZSIMD(y); + y3 = SplatWSIMD(y); + + z0 = SplatXSIMD(z); + z1 = SplatYSIMD(z); + z2 = SplatZSIMD(z); + z3 = SplatWSIMD(z); + + __stvewx(x0, out0->Base(), 0); // store X word + __stvewx(y0, out0->Base(), 4); // store Y word + __stvewx(z0, out0->Base(), 8); // store Z word + + __stvewx(x1, out1->Base(), 0); // store X word + __stvewx(y1, out1->Base(), 4); // store Y word + __stvewx(z1, out1->Base(), 8); // store Z word + + __stvewx(x2, out2->Base(), 0); // store X word + __stvewx(y2, out2->Base(), 4); // store Y word + __stvewx(z2, out2->Base(), 8); // store Z word + + __stvewx(x3, out3->Base(), 0); // store X word + __stvewx(y3, out3->Base(), 4); // store Y word + __stvewx(z3, out3->Base(), 8); // store Z word +#else + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreUnaligned3SIMD(out0->Base(), a); + StoreUnaligned3SIMD(out1->Base(), b); + StoreUnaligned3SIMD(out2->Base(), c); + StoreUnaligned3SIMD(out3->Base(), d); +#endif +} + +/// Store a FourVectors into four NON-CONTIGUOUS VectorAligned s. +FORCEINLINE void FourVectors::StoreAlignedVectorSIMD(VectorAligned* RESTRICT out0, VectorAligned* RESTRICT out1, VectorAligned* RESTRICT out2, VectorAligned* RESTRICT out3) const +{ + fltx4 a, b, c, d; + TransposeOnto(a, b, c, d); + StoreAligned3SIMD(out0, a); + StoreAligned3SIMD(out1, b); + StoreAligned3SIMD(out2, c); + StoreAligned3SIMD(out3, d); + +} +#endif + +#if !defined(__SPU__) // Assume the given matrix is a rotation, and rotate these vectors by it. 
// If you have a long list of FourVectors structures that you all want // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. @@ -2818,26 +5371,24 @@ void FourVectors::RotateBy(const matrix3x4_t& matrix) matSplat10, matSplat11, matSplat12, matSplat20, matSplat21, matSplat22; - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the tranpose row of - // the matrix, but we don't really care about that. - fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - } + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); // Trust in the compiler to schedule these operations correctly: fltx4 outX, outY, outZ; @@ -2850,6 +5401,7 @@ void FourVectors::RotateBy(const matrix3x4_t& matrix) z = outZ; } + // Assume the given matrix is a rotation, and rotate these vectors by it. 
// If you have a long list of FourVectors structures that you all want // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. @@ -2863,26 +5415,24 @@ void FourVectors::TransformBy(const matrix3x4_t& matrix) matSplat10, matSplat11, matSplat12, matSplat20, matSplat21, matSplat22; - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the tranpose row of - // the matrix, but we don't really care about that. - fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]); - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - } + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); // Trust in the compiler to schedule these operations correctly: fltx4 outX, outY, outZ; @@ -2895,12 +5445,8 @@ void FourVectors::TransformBy(const matrix3x4_t& matrix) y = AddSIMD(outY, ReplicateX4(matrix[1][3])); z = AddSIMD(outZ, ReplicateX4(matrix[2][3])); } +#endif - - -/// quick, low quality perlin-style noise() function suitable for real time use. -/// return value is -1..1. 
Only reliable around +/- 1 million or so. -fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z); fltx4 NoiseSIMD(FourVectors const& v); // vector valued noise direction @@ -2909,6 +5455,13 @@ FourVectors DNoiseSIMD(FourVectors const& v); // vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html FourVectors CurlNoiseSIMD(FourVectors const& v); +//#endif // !defined SPU + + +/// quick, low quality perlin-style noise() function suitable for real time use. +/// return value is -1..1. Only reliable around +/- 1 million or so. +fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z); + /// calculate the absolute value of a packed single inline fltx4 fabs(const fltx4& x) @@ -2916,13 +5469,18 @@ inline fltx4 fabs(const fltx4& x) return AndSIMD(x, LoadAlignedSIMD(g_SIMD_clear_signmask)); } +// Convenience version +inline fltx4 AbsSIMD(const fltx4& x) +{ + return fabs(x); +} + /// negate all four components of a SIMD packed single inline fltx4 fnegate(const fltx4& x) { return XorSIMD(x, LoadAlignedSIMD(g_SIMD_signmask)); } - fltx4 Pow_FixedPoint_Exponent_SIMD(const fltx4& x, int exponent); // PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some @@ -2936,8 +5494,40 @@ inline fltx4 PowSIMD(const fltx4& x, float exponent) return Pow_FixedPoint_Exponent_SIMD(x, (int)(4.0 * exponent)); } +/// (x<1)?x^(1/2.2):1. 
Use a 4th order polynomial to approximate x^(1/2.2) over 0..1 +inline fltx4 LinearToGammaSIMD(fltx4 x) +{ + // y = -3.7295x4 + 8.9635x3 - 7.7397x2 + 3.443x + 0.048 + x = MaxSIMD(MinSIMD(Four_Ones, x), Four_Zeros); + return AddSIMD(Four_LinearToGammaCoefficients_E, + MulSIMD(x, AddSIMD(Four_LinearToGammaCoefficients_D, + MulSIMD(x, AddSIMD(Four_LinearToGammaCoefficients_C, + MulSIMD(x, AddSIMD(Four_LinearToGammaCoefficients_B, + MulSIMD(x, Four_LinearToGammaCoefficients_A)))))))); +} +inline fltx4 GammaToLinearSIMD(fltx4 x) +{ + x = MaxSIMD(x, Four_Zeros); + x = AddSIMD(Four_GammaToLinearCoefficients_D, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_C, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_B, + MulSIMD(x, Four_GammaToLinearCoefficients_A)))))); + return MinSIMD(x, Four_Ones); +} + +/// ( x > 1 ) ? x : x^2.2 +inline fltx4 GammaToLinearExtendedSIMD(fltx4 x) +{ + x = MaxSIMD(x, Four_Zeros); + fltx4 fl4Ret = AddSIMD(Four_GammaToLinearCoefficients_D, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_C, + MulSIMD(x, AddSIMD(Four_GammaToLinearCoefficients_B, + MulSIMD(x, Four_GammaToLinearCoefficients_A)))))); + return MaskedAssign(CmpGeSIMD(x, Four_Ones), x, fl4Ret); +} + // random number generation - generate 4 random numbers quickly. 
void SeedRandSIMD(uint32 seed); // seed the random # generator @@ -2953,6 +5543,18 @@ FORCEINLINE fltx4 RandSignedSIMD(void) // -1..1 } +FORCEINLINE fltx4 LerpSIMD(const fltx4& percent, const fltx4& a, const fltx4& b) +{ + return AddSIMD(a, MulSIMD(SubSIMD(b, a), percent)); +} + +FORCEINLINE fltx4 RemapValClampedSIMD(const fltx4& val, const fltx4& a, const fltx4& b, const fltx4& c, const fltx4& d) // Remap val from clamped range between a and b to new range between c and d +{ + fltx4 range = MaskedAssign(CmpEqSIMD(a, b), Four_Ones, SubSIMD(b, a)); //make sure range > 0 + fltx4 cVal = MaxSIMD(Four_Zeros, MinSIMD(Four_Ones, DivSIMD(SubSIMD(val, a), range))); //saturate + return LerpSIMD(cVal, c, d); +} + // SIMD versions of mathlib simplespline functions // hermite basis function for smooth interpolation // Similar to Gain() above, but very cheap to call @@ -3002,6 +5604,11 @@ FORCEINLINE fltx4 FracSIMD(const fltx4& val) return XorSIMD(SubSIMD(fl4Abs, ival), XorSIMD(val, fl4Abs)); // restore sign bits } +#ifndef SPU +// Disable on SPU for the moment as it generates a warning +// warning: dereferencing type-punned pointer will break strict-aliasing rules +// This is related to LoadAlignedSIMD( (float *) g_SIMD_lsbmask ) +// LoadAlignedSIMD() under the hood is dereferencing the variable. 
FORCEINLINE fltx4 Mod2SIMD(const fltx4& val) { fltx4 fl4Abs = fabs(val); @@ -3009,6 +5616,7 @@ FORCEINLINE fltx4 Mod2SIMD(const fltx4& val) ival = MaskedAssign(CmpGtSIMD(ival, fl4Abs), SubSIMD(ival, Four_Twos), ival); return XorSIMD(SubSIMD(fl4Abs, ival), XorSIMD(val, fl4Abs)); // restore sign bits } +#endif FORCEINLINE fltx4 Mod2SIMDPositiveInput(const fltx4& val) { @@ -3040,7 +5648,7 @@ FORCEINLINE fltx4 SinEst01SIMD(const fltx4& val) { fltx4 fl4Abs = fabs(val); fltx4 fl4Reduced2 = Mod2SIMDPositiveInput(fl4Abs); - fltx4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); + bi32x4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); fltx4 fl4val = SubSIMD(fl4Reduced2, AndSIMD(Four_Ones, fl4OddMask)); fltx4 fl4Sin = _SinEst01SIMD(fl4val); fl4Sin = XorSIMD(fl4Sin, AndSIMD(LoadAlignedSIMD(g_SIMD_signmask), XorSIMD(val, fl4OddMask))); @@ -3052,7 +5660,7 @@ FORCEINLINE fltx4 Sin01SIMD(const fltx4& val) { fltx4 fl4Abs = fabs(val); fltx4 fl4Reduced2 = Mod2SIMDPositiveInput(fl4Abs); - fltx4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); + bi32x4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones); fltx4 fl4val = SubSIMD(fl4Reduced2, AndSIMD(Four_Ones, fl4OddMask)); fltx4 fl4Sin = _Sin01SIMD(fl4val); fl4Sin = XorSIMD(fl4Sin, AndSIMD(LoadAlignedSIMD(g_SIMD_signmask), XorSIMD(val, fl4OddMask))); @@ -3060,6 +5668,17 @@ FORCEINLINE fltx4 Sin01SIMD(const fltx4& val) } +FORCEINLINE fltx4 NatExpSIMD(const fltx4& val) // why is ExpSimd( x ) defined to be 2^x? +{ + // need to write this. 
just stub with normal float implementation for now + fltx4 fl4Result; + SubFloat(fl4Result, 0) = exp(SubFloat(val, 0)); + SubFloat(fl4Result, 1) = exp(SubFloat(val, 1)); + SubFloat(fl4Result, 2) = exp(SubFloat(val, 2)); + SubFloat(fl4Result, 3) = exp(SubFloat(val, 3)); + return fl4Result; +} + // Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1) FORCEINLINE fltx4 PreCalcBiasParameter(const fltx4& bias_parameter) @@ -3081,6 +5700,10 @@ FORCEINLINE fltx4 BiasSIMD(const fltx4& val, const fltx4& precalc_param) // Box/plane test // NOTE: The w component of emins + emaxs must be 1 for this to work //----------------------------------------------------------------------------- + +#ifndef SPU +// We don't need this on SPU right now + FORCEINLINE int BoxOnPlaneSideSIMD(const fltx4& emins, const fltx4& emaxs, const cplane_t* p, float tolerance = 0.f) { fltx4 corners[2]; @@ -3089,13 +5712,13 @@ FORCEINLINE int BoxOnPlaneSideSIMD(const fltx4& emins, const fltx4& emaxs, const normal = SetWSIMD(normal, dist); fltx4 t4 = ReplicateX4(tolerance); fltx4 negt4 = ReplicateX4(-tolerance); - fltx4 cmp = CmpGeSIMD(normal, Four_Zeros); + bi32x4 cmp = CmpGeSIMD(normal, Four_Zeros); corners[0] = MaskedAssign(cmp, emaxs, emins); corners[1] = MaskedAssign(cmp, emins, emaxs); fltx4 dot1 = Dot4SIMD(normal, corners[0]); fltx4 dot2 = Dot4SIMD(normal, corners[1]); cmp = CmpGeSIMD(dot1, t4); - fltx4 cmp2 = CmpGtSIMD(negt4, dot2); + bi32x4 cmp2 = CmpGtSIMD(negt4, dot2); fltx4 result = MaskedAssign(cmp, Four_Ones, Four_Zeros); fltx4 result2 = MaskedAssign(cmp2, Four_Twos, Four_Zeros); result = AddSIMD(result, result2); @@ -3104,4 +5727,246 @@ FORCEINLINE int BoxOnPlaneSideSIMD(const fltx4& emins, const fltx4& emaxs, const return sides[0]; } + +// k-dop bounding volume. 26-dop bounds with 13 plane-pairs plus 3 other "arbitrary bounds". 
The arbitrary values could be used to hold type info, etc, +// which can compare against "for free" +class KDop32_t +{ +public: + fltx4 m_Mins[4]; + fltx4 m_Maxes[4]; + + FORCEINLINE bool Intersects(KDop32_t const& other) const; + + FORCEINLINE void operator|=(KDop32_t const& other); + + FORCEINLINE bool IsEmpty(void) const; + + FORCEINLINE void Init(void) + { + for (int i = 0; i < ARRAYSIZE(m_Mins); i++) + { + m_Mins[i] = Four_FLT_MAX; + m_Maxes[i] = Four_Negative_FLT_MAX; + } + } + + // given a set of points, expand the kdop to contain them + void AddPointSet(Vector3D const* pPoints, int nPnts); + + void CreateFromPointSet(Vector3D const* pPoints, int nPnts); +}; + +FORCEINLINE void KDop32_t::operator|=(KDop32_t const& other) +{ + m_Mins[0] = MinSIMD(m_Mins[0], other.m_Mins[0]); + m_Mins[1] = MinSIMD(m_Mins[1], other.m_Mins[1]); + m_Mins[2] = MinSIMD(m_Mins[2], other.m_Mins[2]); + m_Mins[3] = MinSIMD(m_Mins[3], other.m_Mins[3]); + + m_Maxes[0] = MaxSIMD(m_Maxes[0], other.m_Maxes[0]); + m_Maxes[1] = MaxSIMD(m_Maxes[1], other.m_Maxes[1]); + m_Maxes[2] = MaxSIMD(m_Maxes[2], other.m_Maxes[2]); + m_Maxes[3] = MaxSIMD(m_Maxes[3], other.m_Maxes[3]); + + +} + +FORCEINLINE bool KDop32_t::Intersects(KDop32_t const& other) const +{ + bi32x4 c00 = CmpLeSIMD(m_Mins[0], other.m_Maxes[0]); + bi32x4 c01 = CmpLeSIMD(m_Mins[1], other.m_Maxes[1]); + bi32x4 c02 = CmpLeSIMD(m_Mins[2], other.m_Maxes[2]); + bi32x4 c03 = CmpLeSIMD(m_Mins[3], other.m_Maxes[3]); + + bi32x4 c10 = CmpGeSIMD(m_Maxes[0], other.m_Mins[0]); + bi32x4 c11 = CmpGeSIMD(m_Maxes[1], other.m_Mins[1]); + bi32x4 c12 = CmpGeSIMD(m_Maxes[2], other.m_Mins[2]); + bi32x4 c13 = CmpGeSIMD(m_Maxes[3], other.m_Mins[3]); + + bi32x4 a0 = AndSIMD(AndSIMD(c00, c01), AndSIMD(c02, c03)); + bi32x4 a1 = AndSIMD(AndSIMD(c10, c11), AndSIMD(c12, c13)); + + return !(IsAnyZeros(AndSIMD(a1, a0))); +} + + +FORCEINLINE bool KDop32_t::IsEmpty(void) const +{ + bi32x4 c00 = CmpLtSIMD(m_Maxes[0], m_Mins[0]); + bi32x4 c01 = CmpLtSIMD(m_Maxes[1], 
m_Mins[1]); + bi32x4 c02 = CmpLtSIMD(m_Maxes[2], m_Mins[2]); + bi32x4 c03 = CmpLtSIMD(m_Maxes[3], m_Mins[3]); + + return IsAnyTrue(OrSIMD(OrSIMD(c00, c01), OrSIMD(c02, c03))); +} + + +extern const fltx4 g_KDop32XDirs[4]; +extern const fltx4 g_KDop32YDirs[4]; +extern const fltx4 g_KDop32ZDirs[4]; +#endif + +#if 0 + +// FIXME!!! If we need a version of this that runs on 360, this is a work-in-progress version that hasn't been debugged. + +#define _VEC_SWIZZLE_QUAT48_UNPACK (__vector unsigned char) { 16, 17, 0, 1, 16, 17, 2, 3, 16, 17, 4, 5, 16, 17, 6, 7 } +#define _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT (__vector unsigned int ) { 0, 0, 1, 0 } + +// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4 +FORCEINLINE fltx4 UnpackQuaternion48SIMD(const Quaternion48* RESTRICT pVec) +{ + // A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 . + // z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 . + // w is inferred from 1 - the dot product of the other tree components. the top bit of what would otherwise be the 16-bit z is + // w's sign bit. +// fltx4 q16s = XMLoadVector3((const void *)pVec); + fltx4 q16s = LoadUnaligned3SIMD((const float*)pVec); + + // fltx4 shift = *( fltx4 * )&g_SIMD_Quat48_Unpack_Shift; // load the aligned shift mask that we use to shuffle z. + // fltx4 permute = *( fltx4 * )&g_SIMD_Quat48_Unpack_Permute0; // load the permute word that shuffles x,y,z into their own words + bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS. + + // q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f + q16s = vec_perm(q16s, Four_Threes, _VEC_SWIZZLE_QUAT48_UNPACK); // permute so that x, y, and z are now each in their own words. 
The top half is the floating point rep of 3.0f + + // q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) +// q16s = vec_sl( *( u32x4 * )( void * )( &q16s ), _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT ); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) + u32x4 tmp = IntShiftLeftWordSIMD(*(u32x4*)&q16s, _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT); + q16s = *(fltx4*)&tmp; + + // each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1 + const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s }; + const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); + + /* + fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats. + + // scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0); + ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes ); + */ + // fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd ); + fltx4 ret = vec_madd(q16s, vUpkMul, vUpkAdd); + + // now, work out what w must be. + fltx4 dotxyz = Dot3SIMD(ret, ret); // all components are dot product of ret w/ self. + dotxyz = ClampVectorSIMD(dotxyz, Four_Zeros, Four_Ones); + + fltx4 ww = SubSIMD(Four_Ones, dotxyz); // all components are 1 - dotxyz + ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz) + if (wneg) + { + ret = SetWSIMD(ret, NegSIMD(ww)); + // ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret + } + else + { + ret = SetWSIMD(ret, ww); + // ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret + } + return ret; +} + +#endif + +// These are not optimized right now for some platforms. We should be able to shuffle the values in some platforms. 
+// As the methods are hard-coded we can actually avoid loading memory to do the transfer. +// We should be able to create all versions. +FORCEINLINE fltx4 SetWFromXSIMD(const fltx4& a, const fltx4& x) +{ + fltx4 value = SplatXSIMD(x); + return SetWSIMD(a, value); +} + +FORCEINLINE fltx4 SetWFromYSIMD(const fltx4& a, const fltx4& y) +{ + fltx4 value = SplatYSIMD(y); + return SetWSIMD(a, value); +} + +FORCEINLINE fltx4 SetWFromZSIMD(const fltx4& a, const fltx4& z) +{ + fltx4 value = SplatZSIMD(z); + return SetWSIMD(a, value); +} + +FORCEINLINE fltx4 CrossProductSIMD(const fltx4& A, const fltx4& B) +{ +#if defined( _X360 ) + return XMVector3Cross(A, B); +#elif defined( _WIN32 ) + fltx4 A1 = _mm_shuffle_ps(A, A, MM_SHUFFLE_REV(1, 2, 0, 3)); + fltx4 B1 = _mm_shuffle_ps(B, B, MM_SHUFFLE_REV(2, 0, 1, 3)); + fltx4 Result1 = MulSIMD(A1, B1); + fltx4 A2 = _mm_shuffle_ps(A, A, MM_SHUFFLE_REV(2, 0, 1, 3)); + fltx4 B2 = _mm_shuffle_ps(B, B, MM_SHUFFLE_REV(1, 2, 0, 3)); + fltx4 Result2 = MulSIMD(A2, B2); + return SubSIMD(Result1, Result2); + +#elif defined(_PS3) + /* + fltx4 perm1 = (vector unsigned char){0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x0c,0x0d,0x0e,0x0f}; + fltx4 perm2 = (vector unsigned char){0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0x0f}; + + fltx4 A1 = __vpermwi( A, A, perm1 ); + fltx4 A2 = __vpermwi( B, B, perm2 ); + fltx4 Result1 = MulSIMD( A1, B1 ); + fltx4 A2 = __vpermwi( A, A, perm2 ); + fltx4 B2 = __vpermwi( B, B, perm1 ); + return MsubSIMD( A2, B2, Result1 ); + */ + return _vmathVfCross(A, B); +#else + fltx4 CrossVal; + SubFloat(CrossVal, 0) = SubFloat(A, 1) * SubFloat(B, 2) - SubFloat(A, 2) * SubFloat(B, 1); + SubFloat(CrossVal, 1) = SubFloat(A, 2) * SubFloat(B, 0) - SubFloat(A, 0) * SubFloat(B, 2); + SubFloat(CrossVal, 2) = SubFloat(A, 0) * SubFloat(B, 1) - SubFloat(A, 1) * SubFloat(B, 0); + SubFloat(CrossVal, 3) = 0; + return CrossVal; +#endif +} + +inline const fltx4 Length3SIMD(const fltx4 vec) +{ + 
fltx4 scLengthSqr = Dot3SIMD(vec, vec); + bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons); + fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr); + return AndSIMD(isSignificant, MulSIMD(scLengthInv, scLengthSqr)); +} + +inline const fltx4 Normalized3SIMD(const fltx4 vec) +{ + fltx4 scLengthSqr = Dot3SIMD(vec, vec); + bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons); + fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr); + return AndSIMD(isSignificant, MulSIMD(vec, scLengthInv)); +} + + +// Some convenience operator overloads, which are just aliasing the functions above. +// Unneccessary on 360, as you already have them from xboxmath.h +// Componentwise add +#ifndef COMPILER_GCC + +FORCEINLINE fltx4 operator+=(fltx4& a, FLTX4 b) +{ + a = AddSIMD(a, b); + return a; +} + +FORCEINLINE fltx4 operator-=(fltx4& a, FLTX4 b) +{ + a = SubSIMD(a, b); + return a; +} + + +FORCEINLINE fltx4 operator*=(fltx4& a, FLTX4 b) +{ + a = MulSIMD(a, b); + return a; +} + +#endif #endif // _ssemath_h diff --git a/r5dev/mathlib/ssenoise.cpp b/r5dev/mathlib/ssenoise.cpp new file mode 100644 index 00000000..a581391f --- /dev/null +++ b/r5dev/mathlib/ssenoise.cpp @@ -0,0 +1,232 @@ +//========= Copyright � 1996-2006, Valve Corporation, All rights reserved. ============// +// +// Purpose: Fast low quality noise suitable for real time use +// +//=====================================================================================// + +#include "core/stdafx.h" +#include "tier0/dbg.h" +#include "mathlib/mathlib.h" +#include "mathlib/vector.h" +#include "mathlib/ssemath.h" +#include "mathlib/noisedata.h" + +// memdbgon must be the last include file in a .cpp file!!! 
+//#include "tier0/memdbgon.h" + + +#define MAGIC_NUMBER (1<<15) // gives 8 bits of fraction + +static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER }; + + +static ALIGN16 int32 idx_mask[4] = { 0xffff, 0xffff, 0xffff, 0xffff }; + +#define MASK255 (*((fltx4 *)(& idx_mask ))) + +// returns 0..1 +static inline float GetLatticePointValue(int idx_x, int idx_y, int idx_z) +{ + int ret_idx = perm_a[idx_x & 0xff]; + ret_idx = perm_b[(idx_y + ret_idx) & 0xff]; + ret_idx = perm_c[(idx_z + ret_idx) & 0xff]; + return impulse_xcoords[ret_idx]; + +} + +fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z) +{ + // use magic to convert to integer index + fltx4 x_idx = AndSIMD(MASK255, AddSIMD(x, Four_MagicNumbers)); + fltx4 y_idx = AndSIMD(MASK255, AddSIMD(y, Four_MagicNumbers)); + fltx4 z_idx = AndSIMD(MASK255, AddSIMD(z, Four_MagicNumbers)); + + fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros; + fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros; + + // FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes) + // Converting the indexed noise values back to vectors will cause more (128 bytes) + // The noise table could store vectors if we chunked it into 2x2x2 blocks. 
+ fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros; +#define DOPASS(i) \ + { unsigned int xi = SubInt( x_idx, i ); \ + unsigned int yi = SubInt( y_idx, i ); \ + unsigned int zi = SubInt( z_idx, i ); \ + SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0); \ + SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0); \ + SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0); \ + xi>>=8; \ + yi>>=8; \ + zi>>=8; \ + \ + SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi ); \ + SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 ); \ + SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi ); \ + SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 ); \ + SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi ); \ + SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 ); \ + SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi ); \ + SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 ); \ + } + + DOPASS(0); + DOPASS(1); + DOPASS(2); + DOPASS(3); + + // now, we have 8 lattice values for each of four points as m128s, and interpolant values for + // each axis in m128 form in [xyz]frac. 
Perfom the trilinear interpolation as SIMD ops + + // first, do x interpolation + fltx4 l2d00 = AddSIMD(lattice000, MulSIMD(xfrac, SubSIMD(lattice100, lattice000))); + fltx4 l2d01 = AddSIMD(lattice001, MulSIMD(xfrac, SubSIMD(lattice101, lattice001))); + fltx4 l2d10 = AddSIMD(lattice010, MulSIMD(xfrac, SubSIMD(lattice110, lattice010))); + fltx4 l2d11 = AddSIMD(lattice011, MulSIMD(xfrac, SubSIMD(lattice111, lattice011))); + + // now, do y interpolation + fltx4 l1d0 = AddSIMD(l2d00, MulSIMD(yfrac, SubSIMD(l2d10, l2d00))); + fltx4 l1d1 = AddSIMD(l2d01, MulSIMD(yfrac, SubSIMD(l2d11, l2d01))); + + // final z interpolation + fltx4 rslt = AddSIMD(l1d0, MulSIMD(zfrac, SubSIMD(l1d1, l1d0))); + + // map to 0..1 + return MulSIMD(Four_Twos, SubSIMD(rslt, Four_PointFives)); + + +} + +static inline void GetVectorLatticePointValue(int idx, fltx4& x, fltx4& y, fltx4& z, + int idx_x, int idx_y, int idx_z) +{ + int ret_idx = perm_a[idx_x & 0xff]; + ret_idx = perm_b[(idx_y + ret_idx) & 0xff]; + ret_idx = perm_c[(idx_z + ret_idx) & 0xff]; + float const* pData = s_randomGradients + ret_idx * 3; + SubFloat(x, idx) = pData[0]; + SubFloat(y, idx) = pData[1]; + SubFloat(z, idx) = pData[2]; + +} + +FourVectors DNoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z) +{ + // use magic to convert to integer index + fltx4 x_idx = AndSIMD(MASK255, AddSIMD(x, Four_MagicNumbers)); + fltx4 y_idx = AndSIMD(MASK255, AddSIMD(y, Four_MagicNumbers)); + fltx4 z_idx = AndSIMD(MASK255, AddSIMD(z, Four_MagicNumbers)); + + fltx4 xlattice000 = Four_Zeros, xlattice001 = Four_Zeros, xlattice010 = Four_Zeros, xlattice011 = Four_Zeros; + fltx4 xlattice100 = Four_Zeros, xlattice101 = Four_Zeros, xlattice110 = Four_Zeros, xlattice111 = Four_Zeros; + fltx4 ylattice000 = Four_Zeros, ylattice001 = Four_Zeros, ylattice010 = Four_Zeros, ylattice011 = Four_Zeros; + fltx4 ylattice100 = Four_Zeros, ylattice101 = Four_Zeros, ylattice110 = Four_Zeros, ylattice111 = Four_Zeros; + fltx4 zlattice000 = Four_Zeros, zlattice001 
= Four_Zeros, zlattice010 = Four_Zeros, zlattice011 = Four_Zeros; + fltx4 zlattice100 = Four_Zeros, zlattice101 = Four_Zeros, zlattice110 = Four_Zeros, zlattice111 = Four_Zeros; + + // FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes) + // Converting the indexed noise values back to vectors will cause more (128 bytes) + // The noise table could store vectors if we chunked it into 2x2x2 blocks. + fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros; +#define DODPASS(i) \ + { unsigned int xi = SubInt( x_idx, i ); \ + unsigned int yi = SubInt( y_idx, i ); \ + unsigned int zi = SubInt( z_idx, i ); \ + SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0); \ + SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0); \ + SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0); \ + xi>>=8; \ + yi>>=8; \ + zi>>=8; \ + \ + GetVectorLatticePointValue( i, xlattice000, ylattice000, zlattice000, xi,yi,zi ); \ + GetVectorLatticePointValue( i, xlattice001, ylattice001, zlattice001, xi,yi,zi+1 ); \ + GetVectorLatticePointValue( i, xlattice010, ylattice010, zlattice010, xi,yi+1,zi ); \ + GetVectorLatticePointValue( i, xlattice011, ylattice011, zlattice011, xi,yi+1,zi+1 ); \ + GetVectorLatticePointValue( i, xlattice100, ylattice100, zlattice100, xi+1,yi,zi ); \ + GetVectorLatticePointValue( i, xlattice101, ylattice101, zlattice101, xi+1,yi,zi+1 ); \ + GetVectorLatticePointValue( i, xlattice110, ylattice110, zlattice110, xi+1,yi+1,zi ); \ + GetVectorLatticePointValue( i, xlattice111, ylattice111, zlattice111, xi+1,yi+1,zi+1 ); \ + } + + DODPASS(0); + DODPASS(1); + DODPASS(2); + DODPASS(3); + + // now, we have 8 lattice values for each of four points as m128s, and interpolant values for + // each axis in m128 form in [xyz]frac. 
Perfom the trilinear interpolation as SIMD ops + + // first, do x interpolation + fltx4 xl2d00 = AddSIMD(xlattice000, MulSIMD(xfrac, SubSIMD(xlattice100, xlattice000))); + fltx4 xl2d01 = AddSIMD(xlattice001, MulSIMD(xfrac, SubSIMD(xlattice101, xlattice001))); + fltx4 xl2d10 = AddSIMD(xlattice010, MulSIMD(xfrac, SubSIMD(xlattice110, xlattice010))); + fltx4 xl2d11 = AddSIMD(xlattice011, MulSIMD(xfrac, SubSIMD(xlattice111, xlattice011))); + + // now, do y interpolation + fltx4 xl1d0 = AddSIMD(xl2d00, MulSIMD(yfrac, SubSIMD(xl2d10, xl2d00))); + fltx4 xl1d1 = AddSIMD(xl2d01, MulSIMD(yfrac, SubSIMD(xl2d11, xl2d01))); + + // final z interpolation + FourVectors rslt; + rslt.x = AddSIMD(xl1d0, MulSIMD(zfrac, SubSIMD(xl1d1, xl1d0))); + + fltx4 yl2d00 = AddSIMD(ylattice000, MulSIMD(xfrac, SubSIMD(ylattice100, ylattice000))); + fltx4 yl2d01 = AddSIMD(ylattice001, MulSIMD(xfrac, SubSIMD(ylattice101, ylattice001))); + fltx4 yl2d10 = AddSIMD(ylattice010, MulSIMD(xfrac, SubSIMD(ylattice110, ylattice010))); + fltx4 yl2d11 = AddSIMD(ylattice011, MulSIMD(xfrac, SubSIMD(ylattice111, ylattice011))); + + // now, do y interpolation + fltx4 yl1d0 = AddSIMD(yl2d00, MulSIMD(yfrac, SubSIMD(yl2d10, yl2d00))); + fltx4 yl1d1 = AddSIMD(yl2d01, MulSIMD(yfrac, SubSIMD(yl2d11, yl2d01))); + + // final z interpolation + rslt.y = AddSIMD(yl1d0, MulSIMD(zfrac, SubSIMD(yl1d1, yl1d0))); + + fltx4 zl2d00 = AddSIMD(zlattice000, MulSIMD(xfrac, SubSIMD(zlattice100, zlattice000))); + fltx4 zl2d01 = AddSIMD(zlattice001, MulSIMD(xfrac, SubSIMD(zlattice101, zlattice001))); + fltx4 zl2d10 = AddSIMD(zlattice010, MulSIMD(xfrac, SubSIMD(zlattice110, zlattice010))); + fltx4 zl2d11 = AddSIMD(zlattice011, MulSIMD(xfrac, SubSIMD(zlattice111, zlattice011))); + + // now, do y interpolation + fltx4 zl1d0 = AddSIMD(zl2d00, MulSIMD(yfrac, SubSIMD(zl2d10, zl2d00))); + fltx4 zl1d1 = AddSIMD(zl2d01, MulSIMD(yfrac, SubSIMD(zl2d11, zl2d01))); + + // final z interpolation + rslt.z = AddSIMD(zl1d0, MulSIMD(zfrac, SubSIMD(zl1d1, 
zl1d0))); + + return rslt; + + +} + +fltx4 NoiseSIMD(FourVectors const& pos) +{ + return NoiseSIMD(pos.x, pos.y, pos.z); +} + +FourVectors DNoiseSIMD(FourVectors const& pos) +{ + return DNoiseSIMD(pos.x, pos.y, pos.z); +} + +FourVectors CurlNoiseSIMD(FourVectors const& pos) +{ + FourVectors fl4Comp1 = DNoiseSIMD(pos); + FourVectors fl4Pos = pos; + fl4Pos.x = AddSIMD(fl4Pos.x, ReplicateX4(43.256)); + fl4Pos.y = AddSIMD(fl4Pos.y, ReplicateX4(-67.89)); + fl4Pos.z = AddSIMD(fl4Pos.z, ReplicateX4(1338.2)); + FourVectors fl4Comp2 = DNoiseSIMD(fl4Pos); + fl4Pos.x = AddSIMD(fl4Pos.x, ReplicateX4(-129.856)); + fl4Pos.y = AddSIMD(fl4Pos.y, ReplicateX4(-967.23)); + fl4Pos.z = AddSIMD(fl4Pos.z, ReplicateX4(2338.98)); + FourVectors fl4Comp3 = DNoiseSIMD(fl4Pos); + + // now we have the 3 derivatives of a vector valued field. return the curl of the field. + FourVectors fl4Ret; + fl4Ret.x = SubSIMD(fl4Comp3.y, fl4Comp2.z); + fl4Ret.y = SubSIMD(fl4Comp1.z, fl4Comp3.x); + fl4Ret.z = SubSIMD(fl4Comp2.x, fl4Comp1.y); + return fl4Ret; + +} diff --git a/r5dev/mathlib/ssenoise.h b/r5dev/mathlib/ssenoise.h deleted file mode 100644 index e40ce799..00000000 --- a/r5dev/mathlib/ssenoise.h +++ /dev/null @@ -1,107 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: Fast low quality noise suitable for real time use -// -//=====================================================================================// - -#include "core/stdafx.h" -#include "tier0/dbg.h" -#include "tier0/basetypes.h" -#include "mathlib/mathlib.h" -#include "mathlib/vector.h" -#include "mathlib/ssemath.h" - -// memdbgon must be the last include file in a .cpp file!!! 
-//#include "tier0/memdbgon.h" -#include "noisedata.h" - - -#define MAGIC_NUMBER (1<<15) // gives 8 bits of fraction - -static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER }; - - -static ALIGN16 int32 idx_mask[4] = { 0xffff, 0xffff, 0xffff, 0xffff }; - -#define MASK255 (*((fltx4 *)(& idx_mask ))) - -// returns 0..1 -static inline float GetLatticePointValue(int idx_x, int idx_y, int idx_z) -{ - NOTE_UNUSED(perm_d); - NOTE_UNUSED(impulse_ycoords); - NOTE_UNUSED(impulse_zcoords); - - int ret_idx = perm_a[idx_x & 0xff]; - ret_idx = perm_b[(idx_y + ret_idx) & 0xff]; - ret_idx = perm_c[(idx_z + ret_idx) & 0xff]; - return impulse_xcoords[ret_idx]; - -} - -fltx4 NoiseSIMD(const fltx4& x, const fltx4& y, const fltx4& z) -{ - // use magic to convert to integer index - fltx4 x_idx = AndSIMD(MASK255, AddSIMD(x, Four_MagicNumbers)); - fltx4 y_idx = AndSIMD(MASK255, AddSIMD(y, Four_MagicNumbers)); - fltx4 z_idx = AndSIMD(MASK255, AddSIMD(z, Four_MagicNumbers)); - - fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros; - fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros; - - // FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes) - // Converting the indexed noise values back to vectors will cause more (128 bytes) - // The noise table could store vectors if we chunked it into 2x2x2 blocks. 
- fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros; -#define DOPASS(i) \ - { unsigned int xi = SubInt( x_idx, i ); \ - unsigned int yi = SubInt( y_idx, i ); \ - unsigned int zi = SubInt( z_idx, i ); \ - SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0); \ - SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0); \ - SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0); \ - xi>>=8; \ - yi>>=8; \ - zi>>=8; \ - \ - SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi ); \ - SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 ); \ - SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi ); \ - SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 ); \ - SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi ); \ - SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 ); \ - SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi ); \ - SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 ); \ - } - - DOPASS(0); - DOPASS(1); - DOPASS(2); - DOPASS(3); - - // now, we have 8 lattice values for each of four points as m128s, and interpolant values for - // each axis in m128 form in [xyz]frac. 
Perfom the trilinear interpolation as SIMD ops - - // first, do x interpolation - fltx4 l2d00 = AddSIMD(lattice000, MulSIMD(xfrac, SubSIMD(lattice100, lattice000))); - fltx4 l2d01 = AddSIMD(lattice001, MulSIMD(xfrac, SubSIMD(lattice101, lattice001))); - fltx4 l2d10 = AddSIMD(lattice010, MulSIMD(xfrac, SubSIMD(lattice110, lattice010))); - fltx4 l2d11 = AddSIMD(lattice011, MulSIMD(xfrac, SubSIMD(lattice111, lattice011))); - - // now, do y interpolation - fltx4 l1d0 = AddSIMD(l2d00, MulSIMD(yfrac, SubSIMD(l2d10, l2d00))); - fltx4 l1d1 = AddSIMD(l2d01, MulSIMD(yfrac, SubSIMD(l2d11, l2d01))); - - // final z interpolation - fltx4 rslt = AddSIMD(l1d0, MulSIMD(zfrac, SubSIMD(l1d1, l1d0))); - - // map to 0..1 - return MulSIMD(Four_Twos, SubSIMD(rslt, Four_PointFives)); - - -} - -fltx4 NoiseSIMD(FourVectors const& pos) -{ - return NoiseSIMD(pos.x, pos.y, pos.z); -} diff --git a/r5dev/mathlib/ssequaternion.h b/r5dev/mathlib/ssequaternion.h index 90167b83..a1b9b37e 100644 --- a/r5dev/mathlib/ssequaternion.h +++ b/r5dev/mathlib/ssequaternion.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//===== Copyright � 1996-2005, Valve Corporation, All rights reserved. ======// // // Purpose: - defines SIMD "structure of arrays" classes and functions. // @@ -37,8 +37,10 @@ // the traditional x87 FPU operations altogether and make everything use // the SSE2 registers, which lessens this problem a little. -// permitted only on 360, as we've done careful tuning on its Altivec math: -#ifdef _X360 +// permitted only on 360, as we've done careful tuning on its Altivec math. +// FourQuaternions, however, are always allowed, because vertical ops are +// fine on SSE. +#ifdef PLATFORM_PPC #define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC! 
#endif @@ -48,7 +50,6 @@ // Load/store quaternions //--------------------------------------------------------------------- #ifndef _X360 -#if ALLOW_SIMD_QUATERNION_MATH // Using STDC or SSE FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned& pSIMD) { @@ -58,7 +59,7 @@ FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned& pSIMD) FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned* RESTRICT pSIMD) { - fltx4 retval = LoadAlignedSIMD(pSIMD); + fltx4 retval = LoadAlignedSIMD(pSIMD->Base()); return retval; } @@ -66,7 +67,6 @@ FORCEINLINE void StoreAlignedSIMD(QuaternionAligned* RESTRICT pSIMD, const fltx4 { StoreAlignedSIMD(pSIMD->Base(), a); } -#endif #else // for the transitional class -- load a QuaternionAligned @@ -87,6 +87,9 @@ FORCEINLINE void StoreAlignedSIMD(QuaternionAligned* RESTRICT pSIMD, const fltx4 XMStoreVector4A(pSIMD->Base(), a); } +// From a RadianEuler packed onto a fltx4, to a quaternion +fltx4 AngleQuaternionSIMD(FLTX4 vAngles); + #endif @@ -101,7 +104,7 @@ FORCEINLINE fltx4 QuaternionAlignSIMD(const fltx4& p, const fltx4& q) fltx4 b = AddSIMD(p, q); a = Dot4SIMD(a, a); b = Dot4SIMD(b, b); - fltx4 cmp = CmpGtSIMD(a, b); + fltx4 cmp = (fltx4)CmpGtSIMD(a, b); fltx4 result = MaskedAssign(cmp, NegSIMD(q), q); return result; } @@ -133,7 +136,7 @@ FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4& q) { fltx4 radius, result, mask; radius = Dot4SIMD(q, q); - mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 + mask = (fltx4)CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 result = ReciprocalSqrtSIMD(radius); result = MulSIMD(result, q); return MaskedAssign(mask, q, result); // if radius was 0, just return q @@ -222,40 +225,7 @@ FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4& p, const fltx4& q) //--------------------------------------------------------------------- // Quaternion scale //--------------------------------------------------------------------- -#ifndef _X360 - -// SSE and STDC -FORCEINLINE fltx4 
QuaternionScaleSIMD(const fltx4& p, float t) -{ - float r; - fltx4 q; - - // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to - // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. - float sinom = sqrt(SubFloat(p, 0) * SubFloat(p, 0) + SubFloat(p, 1) * SubFloat(p, 1) + SubFloat(p, 2) * SubFloat(p, 2)); - sinom = min(sinom, 1.f); - - float sinsom = sin(asin(sinom) * t); - - t = sinsom / (sinom + FLT_EPSILON); - SubFloat(q, 0) = t * SubFloat(p, 0); - SubFloat(q, 1) = t * SubFloat(p, 1); - SubFloat(q, 2) = t * SubFloat(p, 2); - - // rescale rotation - r = 1.0f - sinsom * sinsom; - - // Assert( r >= 0 ); - if (r < 0.0f) - r = 0.0f; - r = sqrt(r); - - // keep sign of rotation - SubFloat(q, 3) = fsel(SubFloat(p, 3), r, -r); - return q; -} - -#else +#ifdef _X360 // X360 FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) @@ -286,6 +256,126 @@ FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) return result; } +// X360 +// assumes t4 contains a float replicated to each slot +FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, const fltx4& t4) +{ + fltx4 sinom = Dot3SIMD(p, p); + sinom = SqrtSIMD(sinom); + sinom = MinSIMD(sinom, Four_Ones); + fltx4 sinsom = ArcSinSIMD(sinom); + sinsom = MulSIMD(sinsom, t4); + sinsom = SinSIMD(sinsom); + sinom = AddSIMD(sinom, Four_Epsilons); + sinom = ReciprocalSIMD(sinom); + fltx4 result = MulSIMD(p, MulSIMD(sinsom, sinom)); + + // rescale rotation + sinsom = MulSIMD(sinsom, sinsom); + fltx4 r = SubSIMD(Four_Ones, sinsom); + r = MaxSIMD(r, Four_Zeros); + r = SqrtSIMD(r); + + // keep sign of rotation + fltx4 cmp = CmpGeSIMD(p, Four_Zeros); + r = MaskedAssign(cmp, r, NegSIMD(r)); + + result = __vrlimi(result, r, 1, 0); + return result; +} + +#elif defined(_PS3) + +// X360 +FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) +{ + fltx4 sinom = Dot3SIMD(p, p); + sinom = SqrtSIMD(sinom); + sinom = MinSIMD(sinom, Four_Ones); + fltx4 sinsom = 
ArcSinSIMD(sinom);
+	fltx4 t4 = ReplicateX4(t);
+	sinsom = MulSIMD(sinsom, t4);
+	sinsom = SinSIMD(sinsom);
+	sinom = AddSIMD(sinom, Four_Epsilons);
+	sinom = ReciprocalSIMD(sinom);
+	t4 = MulSIMD(sinsom, sinom);
+	fltx4 result = MulSIMD(p, t4);
+
+	// rescale rotation
+	sinsom = MulSIMD(sinsom, sinsom);
+	fltx4 r = SubSIMD(Four_Ones, sinsom);
+	r = MaxSIMD(r, Four_Zeros);
+	r = SqrtSIMD(r);
+
+	// keep sign of rotation
+	r = MaskedAssign(CmpGeSIMD(p, Four_Zeros), r, NegSIMD(r));
+	// set just the w component of result
+	result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[3]), r, result);
+
+	return result;
+}
+
+// PS3
+// assumes t4 contains a float replicated to each slot
+FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, const fltx4& t4)
+{
+	fltx4 sinom = Dot3SIMD(p, p);
+	sinom = SqrtSIMD(sinom);
+	sinom = MinSIMD(sinom, Four_Ones);
+	fltx4 sinsom = ArcSinSIMD(sinom);
+	sinsom = MulSIMD(sinsom, t4);
+	sinsom = SinSIMD(sinsom);
+	sinom = AddSIMD(sinom, Four_Epsilons);
+	sinom = ReciprocalSIMD(sinom);
+	fltx4 result = MulSIMD(p, MulSIMD(sinsom, sinom));
+
+	// rescale rotation
+	sinsom = MulSIMD(sinsom, sinsom);
+	fltx4 r = SubSIMD(Four_Ones, sinsom);
+	r = MaxSIMD(r, Four_Zeros);
+	r = SqrtSIMD(r);
+
+	// keep sign of rotation
+	r = MaskedAssign(CmpGeSIMD(p, Four_Zeros), r, NegSIMD(r));
+	// set just the w component of result
+	result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[3]), r, result);
+
+	return result;
+}
+
+#else
+
+// SSE and STDC
+FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t)
+{
+	float r;
+	fltx4 q;
+
+	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to
+	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. 
+ float sinom = sqrt(SubFloat(p, 0) * SubFloat(p, 0) + SubFloat(p, 1) * SubFloat(p, 1) + SubFloat(p, 2) * SubFloat(p, 2)); + sinom = fmin(sinom, 1.f); + + float sinsom = sin(asin(sinom) * t); + + t = sinsom / (sinom + FLT_EPSILON); + SubFloat(q, 0) = t * SubFloat(p, 0); + SubFloat(q, 1) = t * SubFloat(p, 1); + SubFloat(q, 2) = t * SubFloat(p, 2); + + // rescale rotation + r = 1.0f - sinsom * sinsom; + + // Assert( r >= 0 ); + if (r < 0.0f) + r = 0.0f; + r = sqrt(r); + + // keep sign of rotation + SubFloat(q, 3) = fsel(SubFloat(p, 3), r, -r); + return q; +} + #endif @@ -363,5 +453,812 @@ FORCEINLINE fltx4 QuaternionSlerpSIMD(const fltx4& p, const fltx4& q, float t) #endif // ALLOW_SIMD_QUATERNION_MATH + +/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are +/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. +class ALIGN16 FourQuaternions +{ +public: + fltx4 x, y, z, w; + + FourQuaternions(void) + { + } + + FourQuaternions(const fltx4& _x, + const fltx4& _y, + const fltx4& _z, + const fltx4& _w) + : x(_x), y(_y), z(_z), w(_w) + {} + +#if !defined(__SPU__) + // four rotations around the same axis. angles should be in radians. 
+ FourQuaternions(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) + { + FromAxisAndAngles(axis, angle0, angle1, angle2, angle3); + } +#endif + + FourQuaternions(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + FORCEINLINE void operator=(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + /// this = this * q; + FORCEINLINE FourQuaternions Mul(FourQuaternions const& q) const; + + /// negate the vector part + FORCEINLINE FourQuaternions Conjugate() const; + + /// for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// scale is four floats -- one for each quat + FORCEINLINE FourQuaternions ScaleAngle(const fltx4& scale) const; + + /// ret = this * ( s * q ) + /// In other words, for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// s is four floats in a fltx4 -- one for each quaternion + FORCEINLINE FourQuaternions MulAc(const fltx4& s, const FourQuaternions& q) const; + + /// ret = ( s * this ) * q + FORCEINLINE FourQuaternions ScaleMul(const fltx4& s, const FourQuaternions& q) const; + + /// Slerp four quaternions at once, FROM me TO the specified out. 
+ FORCEINLINE FourQuaternions Slerp(const FourQuaternions& to, const fltx4& t); + + FORCEINLINE FourQuaternions SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t); + +#if !defined(__SPU__) + /// given an axis and four angles, populate this quaternion with the equivalent rotations + /// (ie, make these four quaternions represent four different rotations around the same axis) + /// angles should be in RADIANS + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3); + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, const fltx4& angles); + // one convenience imp if you're doing this in degrees + FORCEINLINE FourQuaternions& FromAxisAndAnglesInDegrees(const fltx4& axis, const fltx4& angles) + { + return FromAxisAndAngles(axis, MulSIMD(angles, Four_DegToRad)); + } +#endif + + // rotate (in place) a FourVectors by this quaternion. there's a corresponding RotateBy in FourVectors. + FORCEINLINE void RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT; + + + /// LoadAndSwizzleAligned - load 4 QuaternionAligneds into a FourQuaternions, performing transpose op. 
+ /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const float* RESTRICT a, const float* RESTRICT b, const float* RESTRICT c, const float* RESTRICT d) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(a); + fltx4 ty = LoadAlignedSIMD(b); + fltx4 tz = LoadAlignedSIMD(c); + fltx4 tw = LoadAlignedSIMD(d); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(a); + y = LoadAlignedSIMD(b); + z = LoadAlignedSIMD(c); + w = LoadAlignedSIMD(d); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* RESTRICT a, + const QuaternionAligned* RESTRICT b, + const QuaternionAligned* RESTRICT c, + const QuaternionAligned* RESTRICT d) + { + LoadAndSwizzleAligned(a->Base(), b->Base(), c->Base(), d->Base()); + } + + + /// LoadAndSwizzleAligned - load 4 consecutive QuaternionAligneds into a FourQuaternions, + /// performing transpose op. 
+ /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(qs++); + fltx4 ty = LoadAlignedSIMD(qs++); + fltx4 tz = LoadAlignedSIMD(qs++); + fltx4 tw = LoadAlignedSIMD(qs); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(qs++); + y = LoadAlignedSIMD(qs++); + z = LoadAlignedSIMD(qs++); + w = LoadAlignedSIMD(qs++); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + // Store the FourQuaternions out to four nonconsecutive ordinary quaternions in memory. + FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* a, QuaternionAligned* b, QuaternionAligned* c, QuaternionAligned* d) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(a, rx); + StoreAlignedSIMD(b, ry); + StoreAlignedSIMD(c, rz); + StoreAlignedSIMD(d, rw); +#else + fltx4 dupes[4] = { x, y, z, w }; + TransposeSIMD(dupes[0], dupes[1], dupes[2], dupes[3]); + StoreAlignedSIMD(a, dupes[0]); + StoreAlignedSIMD(b, dupes[1]); + StoreAlignedSIMD(c, dupes[2]); + StoreAlignedSIMD(d, dupes[3]); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. 
+ FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(qs, rx); + StoreAlignedSIMD(++qs, ry); + StoreAlignedSIMD(++qs, rz); + StoreAlignedSIMD(++qs, rw); +#else + SwizzleAndStoreAligned(qs, qs + 1, qs + 2, qs + 3); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. + // The mask specifies which of the quaternions are actually written out -- each + // word in the fltx4 should be all binary ones or zeros. Ones means the corresponding + // quat will be written. + FORCEINLINE void SwizzleAndStoreAlignedMasked(QuaternionAligned* RESTRICT qs, const bi32x4& controlMask) + { + fltx4 originals[4]; + originals[0] = LoadAlignedSIMD(qs); + originals[1] = LoadAlignedSIMD(qs + 1); + originals[2] = LoadAlignedSIMD(qs + 2); + originals[3] = LoadAlignedSIMD(qs + 3); + + bi32x4 masks[4] = { SplatXSIMD(controlMask), + SplatYSIMD(controlMask), + SplatZSIMD(controlMask), + SplatWSIMD(controlMask) }; + +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); +#else + fltx4 rx = x; + fltx4 ry = y; + fltx4 rz = z; + fltx4 rw = w; + TransposeSIMD(rx, ry, rz, rw); +#endif + + StoreAlignedSIMD(qs + 0, MaskedAssign(masks[0], rx, originals[0])); + StoreAlignedSIMD(qs + 1, MaskedAssign(masks[1], ry, originals[1])); + StoreAlignedSIMD(qs + 2, MaskedAssign(masks[2], rz, originals[2])); + StoreAlignedSIMD(qs + 3, MaskedAssign(masks[3], rw, originals[3])); + } +}; + + + +FORCEINLINE FourQuaternions FourQuaternions::Conjugate() const +{ + return 
FourQuaternions(NegSIMD(x), NegSIMD(y), NegSIMD(z), w); +} + + + + +FORCEINLINE const fltx4 Dot(const FourQuaternions& a, const FourQuaternions& b) +{ + return + MaddSIMD(a.x, b.x, + MaddSIMD(a.y, b.y, + MaddSIMD(a.z, b.z, MulSIMD(a.w, b.w)) + ) + ); +} + + +FORCEINLINE const FourQuaternions Madd(const FourQuaternions& a, const fltx4& scale, const FourQuaternions& c) +{ + FourQuaternions ret; + ret.x = MaddSIMD(a.x, scale, c.x); + ret.y = MaddSIMD(a.y, scale, c.y); + ret.z = MaddSIMD(a.z, scale, c.z); + ret.w = MaddSIMD(a.w, scale, c.w); + return ret; +} + +FORCEINLINE const FourQuaternions Mul(const FourQuaternions& a, const fltx4& scale) +{ + FourQuaternions ret; + ret.x = MulSIMD(a.x, scale); + ret.y = MulSIMD(a.y, scale); + ret.z = MulSIMD(a.z, scale); + ret.w = MulSIMD(a.w, scale); + return ret; +} + +FORCEINLINE const FourQuaternions Add(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = AddSIMD(a.x, b.x); + ret.y = AddSIMD(a.y, b.y); + ret.z = AddSIMD(a.z, b.z); + ret.w = AddSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Sub(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = SubSIMD(a.x, b.x); + ret.y = SubSIMD(a.y, b.y); + ret.z = SubSIMD(a.z, b.z); + ret.w = SubSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Neg(const FourQuaternions& q) +{ + FourQuaternions ret; + ret.x = NegSIMD(q.x); + ret.y = NegSIMD(q.y); + ret.z = NegSIMD(q.z); + ret.w = NegSIMD(q.w); + return ret; +} + +FORCEINLINE const FourQuaternions MaskedAssign(const bi32x4& mask, const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = MaskedAssign(mask, a.x, b.x); + ret.y = MaskedAssign(mask, a.y, b.y); + ret.z = MaskedAssign(mask, a.z, b.z); + ret.w = MaskedAssign(mask, a.w, b.w); + return ret; +} + +#ifdef DIFFERENT_NATIVE_VECTOR_TYPES +FORCEINLINE const FourQuaternions MaskedAssign(const fltx4& mask, const FourQuaternions& a, const 
FourQuaternions& b) +{ + return MaskedAssign((bi32x4)mask, a, b); +} +#endif + + +FORCEINLINE FourQuaternions QuaternionAlign(const FourQuaternions& p, const FourQuaternions& q) +{ + // decide if one of the quaternions is backwards + bi32x4 cmp = CmpLtSIMD(Dot(p, q), Four_Zeros); + return MaskedAssign(cmp, Neg(q), q); +} + + +FORCEINLINE const FourQuaternions QuaternionNormalize(const FourQuaternions& q) +{ + fltx4 radius = Dot(q, q); + bi32x4 mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 + fltx4 invRadius = ReciprocalSqrtSIMD(radius); + + FourQuaternions ret = MaskedAssign(mask, q, Mul(q, invRadius)); + return ret; +} + + +#if !defined(__SPU__) +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) +{ + return FromAxisAndAngles(axis, LoadGatherSIMD(angle0, angle1, angle2, angle3)); +} + +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const fltx4& angles) +{ + // compute the half theta + fltx4 theta = MulSIMD(angles, Four_PointFives); + // compute the sine and cosine of each angle simultaneously + fltx4 vsines; fltx4 vcoses; + SinCosSIMD(vsines, vcoses, theta); + // now the sines and coses vectors contain the results for four angles. 
+ // for each of the angles, splat them out and then swizzle together so + // as to get a < cos, sin, sin, sin > coefficient vector + + x = MulSIMD(vsines, SplatXSIMD(axis)); // sin(t0) * x, sin(t1) * x, etc + y = MulSIMD(vsines, SplatYSIMD(axis)); + z = MulSIMD(vsines, SplatZSIMD(axis)); + w = vcoses; + + + return *this; +} +#endif + + +/// this = this * q; +FORCEINLINE FourQuaternions FourQuaternions::Mul(FourQuaternions const& q) const +{ + // W = w1w2 - x1x2 - y1y2 - z1z2 + FourQuaternions ret; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // as we do the multiplication, also do a dot product, so we know whether + // one of the quats is backwards and if we therefore have to negate at the end + fltx4 dotProduct = MulSIMD(w, q.w); + + ret.w = MulSIMD(w, q.w); // W = w1w2 + ret.x = MulSIMD(w, q.x); // X = w1x2 + ret.y = MulSIMD(w, q.y); // Y = w1y2 + ret.z = MulSIMD(w, q.z); // Z = w1z2 + + dotProduct = MaddSIMD(x, q.x, dotProduct); + ret.w = MsubSIMD(x, q.x, ret.w); // W = w1w2 - x1x2 + ret.x = MaddSIMD(x, q.w, ret.x); // X = w1x2 + x1w2 + ret.y = MsubSIMD(x, q.z, ret.y); // Y = w1y2 - x1z2 + ret.z = MaddSIMD(x, q.y, ret.z); // Z = w1z2 + x1y2 + + dotProduct = MaddSIMD(y, q.y, dotProduct); + ret.w = MsubSIMD(y, q.y, ret.w); // W = w1w2 - x1x2 - y1y2 + ret.x = MaddSIMD(y, q.z, ret.x); // X = w1x2 + x1w2 + y1z2 + ret.y = MaddSIMD(y, q.w, ret.y); // Y = w1y2 - x1z2 + y1w2 + ret.z = MsubSIMD(y, q.x, ret.z); // Z = w1z2 + x1y2 - y1x2 + + dotProduct = MaddSIMD(z, q.z, dotProduct); + ret.w = MsubSIMD(z, q.z, ret.w); // W = w1w2 - x1x2 - y1y2 - z1z2 + ret.x = MsubSIMD(z, q.y, ret.x); // X = w1x2 + x1w2 + y1z2 - z1y2 + ret.y = MaddSIMD(z, q.x, ret.y); // Y = w1y2 - x1z2 + y1w2 + z1x2 + ret.z = MaddSIMD(z, q.w, ret.z); // Z = w1z2 + x1y2 - y1x2 + z1w2 + + fltx4 Zero = Four_Zeros; + bi32x4 control = CmpLtSIMD(dotProduct, Four_Zeros); + signMask = MaskedAssign(control, signMask, Zero); // negate quats where q1.q2 < 0 + ret.w = XorSIMD(signMask, ret.w); + 
ret.x = XorSIMD(signMask, ret.x); + ret.y = XorSIMD(signMask, ret.y); + ret.z = XorSIMD(signMask, ret.z); + + return ret; +} + + +FORCEINLINE void FourQuaternions::RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT +{ + fltx4 tmpX, tmpY, tmpZ, tmpW; + fltx4 outX, outY, outZ; + + tmpX = SubSIMD(MaddSIMD(w, vecs->x, MulSIMD(y, vecs->z)), + MulSIMD(z, vecs->y)); + + tmpY = SubSIMD(MaddSIMD(w, vecs->y, MulSIMD(z, vecs->x)), + MulSIMD(x, vecs->z)); + + tmpZ = SubSIMD(MaddSIMD(w, vecs->z, MulSIMD(x, vecs->y)), + MulSIMD(y, vecs->x)); + + tmpW = AddSIMD(MaddSIMD(x, vecs->x, MulSIMD(y, vecs->y)), + MulSIMD(z, vecs->z)); + + + outX = AddSIMD(SubSIMD(MaddSIMD(tmpW, x, MulSIMD(tmpX, w)), + MulSIMD(tmpY, z)), + MulSIMD(tmpZ, y)); + + outY = AddSIMD(SubSIMD(MaddSIMD(tmpW, y, MulSIMD(tmpY, w)), + MulSIMD(tmpZ, x)), + MulSIMD(tmpX, z)); + + outZ = AddSIMD(SubSIMD(MaddSIMD(tmpW, z, MulSIMD(tmpZ, w)), + MulSIMD(tmpX, y)), + MulSIMD(tmpY, x)); + + // although apparently redundant, assigning the results to intermediate local variables + // seems to improve code scheduling slightly in SN. + vecs->x = outX; + vecs->y = outY; + vecs->z = outZ; +} + + +/* + +void QuaternionScale( const Quaternion &p, float t, Quaternion &q ) +{ + Assert( s_bMathlibInitialized ); + + + float r; + + // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to + // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. 
+ float sinom = sqrt( DotProduct( &p.x, &p.x ) ); + sinom = min( sinom, 1.f ); + + float sinsom = sin( asin( sinom ) * t ); + + t = sinsom / (sinom + FLT_EPSILON); + VectorScale( &p.x, t, &q.x ); + + // rescale rotation + r = 1.0f - sinsom * sinsom; + + // Assert( r >= 0 ); + if (r < 0.0f) + r = 0.0f; + r = sqrt( r ); + + // keep sign of rotation + if (p.w < 0) + q.w = -r; + else + q.w = r; + + Assert( q.IsValid() ); + + return; +} + +*/ + +FORCEINLINE FourQuaternions FourQuaternions::ScaleAngle(const fltx4& scale) const +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + const fltx4 Zero = Four_Zeros; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // work out if there are any tiny scales or angles, which are unstable + bi32x4 tinyAngles = CmpGtSIMD(w, OneMinusEpsilon); + bi32x4 negativeRotations = CmpLtSIMD(w, Zero); // if any w's are <0, we will need to negate later down + + // figure out the theta + fltx4 angles = ArcCosSIMD(w); + + // test also if w > -1 + fltx4 negativeWs = XorSIMD(signMask, w); + tinyAngles = OrSIMD(CmpGtSIMD(negativeWs, OneMinusEpsilon), tinyAngles); + + // meanwhile start working on computing the dot product of the + // vector component, and trust in the scheduler to interleave them + fltx4 vLenSq = MulSIMD(x, x); + vLenSq = MaddSIMD(y, y, vLenSq); + vLenSq = MaddSIMD(z, z, vLenSq); + + // scale the angles + angles = MulSIMD(angles, scale); + + // clear out the sign mask where w>=0 + signMask = MaskedAssign(negativeRotations, signMask, Zero); + + // work out the new w component and vector length + fltx4 vLenRecip = ReciprocalSqrtSIMD(vLenSq); // interleave with Cos to hide latencies + fltx4 sine; + SinCosSIMD(sine, ret.w, angles); + ret.x = MulSIMD(x, vLenRecip); // renormalize so the vector length + w = 1 + ret.y = MulSIMD(y, vLenRecip); // renormalize so the vector length + w = 1 + ret.z = MulSIMD(z, vLenRecip); // renormalize so the 
vector length + w = 1 + ret.x = MulSIMD(ret.x, sine); + ret.y = MulSIMD(ret.y, sine); + ret.z = MulSIMD(ret.z, sine); + + // negate where necessary + ret.x = XorSIMD(ret.x, signMask); + ret.y = XorSIMD(ret.y, signMask); + ret.z = XorSIMD(ret.z, signMask); + ret.w = XorSIMD(ret.w, signMask); + + // finally, toss results from where cos(theta) is close to 1 -- these are non rotations. + ret.x = MaskedAssign(tinyAngles, x, ret.x); + ret.y = MaskedAssign(tinyAngles, y, ret.y); + ret.z = MaskedAssign(tinyAngles, z, ret.z); + ret.w = MaskedAssign(tinyAngles, w, ret.w); + + return ret; +} + +//----------------------------------------------------------------------------- +// Purpose: return = this * ( s * q ) +// In other words, for a quaternion representing a rotation of angle theta, return +// one of angle s*theta +// s is four floats in a fltx4 -- one for each quaternion +//----------------------------------------------------------------------------- + +FORCEINLINE FourQuaternions FourQuaternions::MulAc(const fltx4& s, const FourQuaternions& q) const +{ + /* + void QuaternionMA( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt ) + { + Quaternion p1, q1; + + QuaternionScale( q, s, q1 ); + QuaternionMult( p, q1, p1 ); + QuaternionNormalize( p1 ); + qt[0] = p1[0]; + qt[1] = p1[1]; + qt[2] = p1[2]; + qt[3] = p1[3]; + } + */ + + return Mul(q.ScaleAngle(s)); +} + + +FORCEINLINE FourQuaternions FourQuaternions::ScaleMul(const fltx4& s, const FourQuaternions& q) const +{ + return ScaleAngle(s).Mul(q); +} + + +FORCEINLINE FourQuaternions FourQuaternions::Slerp(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. 
it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. + fltx4 cosineOmega; +#if 0 // Maybe I don't need to do alignment seperately, using the xb360 technique... + FourQuaternions to; + { + fltx4 diffs[4], sums[4], originalToNeg[4]; + fltx4 dotIfAligned, dotIfNotAligned; + + // compute negations of the TO quaternion. + originalToNeg[0] = NegSIMD(originalto.x); + originalToNeg[1] = NegSIMD(originalto.y); + originalToNeg[2] = NegSIMD(originalto.z); + originalToNeg[3] = NegSIMD(originalto.w); + + dotIfAligned = MulSIMD(x, originalto.x); + dotIfNotAligned = MulSIMD(x, originalToNeg[0]); + + diffs[0] = SubSIMD(x, originalto.x); + diffs[1] = SubSIMD(y, originalto.y); + diffs[2] = SubSIMD(z, originalto.z); + diffs[3] = SubSIMD(w, originalto.w); + + sums[0] = AddSIMD(x, originalto.x); + sums[1] = AddSIMD(y, originalto.y); + sums[2] = AddSIMD(z, originalto.z); + sums[3] = AddSIMD(w, originalto.w); + + dotIfAligned = MaddSIMD(y, originalto.y, dotIfAligned); + dotIfNotAligned = MaddSIMD(y, originalToNeg[1], dotIfNotAligned); + + fltx4 diffsDot, sumsDot; + + diffsDot = MulSIMD(diffs[0], diffs[0]); // x^2 + sumsDot = MulSIMD(sums[0], sums[0]); // x^2 + // do some work on the dot products while letting the multiplies cook + dotIfAligned = MaddSIMD(z, originalto.z, dotIfAligned); + dotIfNotAligned = MaddSIMD(z, originalToNeg[2], dotIfNotAligned); + + diffsDot = MaddSIMD(diffs[1], diffs[1], diffsDot); // x^2 + y^2 + sumsDot = MaddSIMD(sums[1], sums[1], sumsDot); + diffsDot = MaddSIMD(diffs[2], diffs[2], diffsDot); // x^2 + y^2 + z^2 + sumsDot = MaddSIMD(sums[2], sums[2], sumsDot); + diffsDot = MaddSIMD(diffs[3], diffs[3], diffsDot); // x^2 + y^2 + z^2 + w^2 + sumsDot = MaddSIMD(sums[3], sums[3], sumsDot); + // do some work on the dot products while letting the multiplies cook 
+ dotIfAligned = MaddSIMD(w, originalto.w, dotIfAligned); + dotIfNotAligned = MaddSIMD(w, originalToNeg[3], dotIfNotAligned); + + // are the differences greater than the sums? + // if so, we need to negate that quaternion + fltx4 mask = CmpGtSIMD(diffsDot, sumsDot); // 1 for diffs>0 and 0 elsewhere + to.x = MaskedAssign(mask, originalToNeg[0], originalto.x); + to.y = MaskedAssign(mask, originalToNeg[1], originalto.y); + to.z = MaskedAssign(mask, originalToNeg[2], originalto.z); + to.w = MaskedAssign(mask, originalToNeg[3], originalto.w); + + cosineOmega = MaskedAssign(mask, dotIfNotAligned, dotIfAligned); + } + + // right, now to is aligned to be the short way round, and we computed + // the dot product while we were figuring all that out. +#else + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); +#endif + + fltx4 Zero = Four_Zeros; + bi32x4 cosOmegaLessThanZero = CmpLtSIMD(cosineOmega, Zero); + // fltx4 shouldNegate = MaskedAssign(cosOmegaLessThanZero, Four_NegativeOnes , Four_Ones ); + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); // contains a one in the sign bit -- xor against a number to negate it + fltx4 sinOmega = Four_Ones; + + // negate cosineOmega where necessary + cosineOmega = MaskedAssign(cosOmegaLessThanZero, XorSIMD(cosineOmega, signMask), cosineOmega); + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... 
+ sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + signMask = MaskedAssign(cosOmegaLessThanZero, signMask, Zero); + + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // depending on whether the dot product was less than zero, negate beta, or not + beta = XorSIMD(beta, signMask); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + + + +FORCEINLINE FourQuaternions FourQuaternions::SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. 
+ fltx4 cosineOmega; + + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); + + fltx4 sinOmega = Four_Ones; + + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... + sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + +/***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function: +inline void FourVectors::RotateBy( const FourQuaternions &quats ) +{ + quats.RotateFourVectors( this ); +} +*/ + + #endif // SSEQUATMATH_H + diff --git a/r5dev/mathlib/transform.cpp 
b/r5dev/mathlib/transform.cpp new file mode 100644 index 00000000..8be2e83c --- /dev/null +++ b/r5dev/mathlib/transform.cpp @@ -0,0 +1,179 @@ +//==== Copyright (c) 1996-2011, Valve Corporation, All rights reserved. =====// +// +// Purpose: +// +// $NoKeywords: $ +// +//===========================================================================// + +#include "core/stdafx.h" +#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB) + +#include "mathlib/transform.h" +#include "mathlib/mathlib.h" + +// memdbgon must be the last include file in a .cpp file!!! +//#include "tier0/memdbgon.h" + +const CTransform g_TransformIdentity(Vector3D(0.0f, 0.0f, 0.0f), Quaternion(0.0f, 0.0f, 0.0f, 1.0f)); + +void SetIdentityTransform(CTransform& out) +{ + out.m_vPosition = vec3_origin; + out.m_orientation = quat_identity; +} + +void ConcatTransforms(const CTransform& in1, const CTransform& in2, CTransform& out) +{ + // Store in temp to avoid problems if out == in1 or out == in2 + CTransform result; + QuaternionMult(in1.m_orientation, in2.m_orientation, result.m_orientation); + QuaternionMultiply(in1.m_orientation, in2.m_vPosition, result.m_vPosition); + result.m_vPosition += in1.m_vPosition; + out = result; +} + +void VectorIRotate(const Vector3D& v, const CTransform& t, Vector3D& out) +{ + // FIXME: Make work directly with the transform + matrix3x4_t m; + TransformMatrix(t, m); + VectorIRotate(v, m, out); +} + +void VectorITransform(const Vector3D& v, const CTransform& t, Vector3D& out) +{ + // FIXME: Make work directly with the transform + matrix3x4_t m; + TransformMatrix(t, m); + VectorITransform(v, m, out); +} + +void TransformSlerp(const CTransform& p, const CTransform& q, float t, CTransform& qt) +{ + QuaternionSlerp(p.m_orientation, q.m_orientation, t, qt.m_orientation); + VectorLerp(p.m_vPosition, q.m_vPosition, t, qt.m_vPosition); +} + +void TransformLerp(const CTransform& p, const CTransform& q, float t, CTransform& qt) +{ + QuaternionBlend(p.m_orientation, q.m_orientation, t, 
qt.m_orientation); + VectorLerp(p.m_vPosition, q.m_vPosition, t, qt.m_vPosition); +} + +void TransformMatrix(const CTransform& in, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, out); +} + +void TransformMatrix(const CTransformUnaligned& in, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, out); +} + +void TransformMatrix(const CTransform& in, const Vector3D& vScaleIn, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, vScaleIn, out); +} + +void MatrixTransform(const matrix3x4_t& in, CTransformUnaligned& out) +{ + MatrixQuaternion(in, out.m_orientation); + MatrixGetColumn(in, ORIGIN, out.m_vPosition); +} + +void MatrixTransform(const matrix3x4_t& in, CTransform& out) +{ + MatrixQuaternion(in, out.m_orientation); + MatrixGetColumn(in, ORIGIN, out.m_vPosition); +} + +void MatrixTransform(const matrix3x4_t& in, CTransform& out, Vector3D& vScaleOut) +{ + matrix3x4_t norm; + vScaleOut = MatrixNormalize(in, norm); + MatrixTransform(norm, out); +} + +void AngleTransform(const QAngle& angles, const Vector3D& origin, CTransform& out) +{ + AngleQuaternion(angles, out.m_orientation); + out.m_vPosition = origin; +} + +void TransformInvert(const CTransform& in, CTransform& out) +{ + QuaternionInvert(in.m_orientation, out.m_orientation); + QuaternionMultiply(out.m_orientation, in.m_vPosition, out.m_vPosition); + out.m_vPosition *= -1.0f; +} + +void AxisAngleTransform(const Vector3D& vecAxis, float flAngleDegrees, CTransform& out) +{ + AxisAngleQuaternion(vecAxis, flAngleDegrees, out.m_orientation); + out.m_vPosition = vec3_origin; +} + +void TransformVectorsFLU(const CTransform& in, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) +{ + QuaternionVectorsFLU(in.m_orientation, pForward, pLeft, pUp); +} + +void TransformVectorsForward(const CTransform& in, Vector3D* pForward) +{ + QuaternionVectorsForward(in.m_orientation, pForward); +} + +bool TransformsAreEqual(const CTransform& src1, const CTransform& 
src2, float flPosTolerance, float flRotTolerance) +{ + if (!VectorsAreEqual(src1.m_vPosition, src2.m_vPosition, flPosTolerance)) + return false; + return QuaternionsAreEqual(src1.m_orientation, src2.m_orientation, flRotTolerance); +} + +// FIXME: optimize this with simd goodness +void TransformToWorldSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms) +{ +#ifdef _DEBUG + for (int i = 0; i < nRootTransformCount; ++i) + { + Assert(pParentIndices[i] < 0); + } +#endif + + for (int i = nRootTransformCount; i < nTransformCount; ++i) + { + int nParentBone = pParentIndices[i]; + Assert(nParentBone >= 0 && nParentBone < i); + ConcatTransforms(pTransforms[nParentBone], pTransforms[i], pTransforms[i]); + } +} + +// FIXME: optimize this with simd goodness +void TransformToParentSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms) +{ +#ifdef _DEBUG + for (int i = 0; i < nRootTransformCount; ++i) + { + Assert(pParentIndices[i] < 0); + } +#endif + + bool* pComputedParentTransform = (bool*)stackalloc(nTransformCount * sizeof(bool)); + memset(pComputedParentTransform, 0, nTransformCount * sizeof(bool)); + CTransform* pWorldToParentTransforms = (CTransform*)stackalloc(nTransformCount * sizeof(CTransform)); + + for (int b = nTransformCount; --b >= nRootTransformCount; ) + { + int nParentBone = pParentIndices[b]; + if (!pComputedParentTransform[nParentBone]) + { + TransformInvert(pTransforms[nParentBone], pWorldToParentTransforms[nParentBone]); + pComputedParentTransform[nParentBone] = true; + } + ConcatTransforms(pWorldToParentTransforms[nParentBone], pTransforms[b], pTransforms[b]); + } +} + +#endif // !_STATIC_LINKED || _SHARED_LIB + diff --git a/r5dev/mathlib/transform.h b/r5dev/mathlib/transform.h new file mode 100644 index 00000000..d2cb9b03 --- /dev/null +++ b/r5dev/mathlib/transform.h @@ -0,0 +1,401 @@ +//====== Copyright 1996-2005, Valve Corporation, All rights reserved. 
=======// +// +// Purpose: +// +// $NoKeywords: $ +// +//===========================================================================// + +#ifndef TRANSFORM_H +#define TRANSFORM_H + +#ifdef COMPILER_MSVC +#pragma once +#endif + +//#include "tier0/memalloc.h" +#include "mathlib/vector.h" +#include "mathlib/mathlib.h" + +//----------------------------------------------------------------------------- +// Matrix 3x4_t +//----------------------------------------------------------------------------- +class CTransformUnaligned; + + +//----------------------------------------------------------------------------- +// Represents a position + orientation using quaternions +//----------------------------------------------------------------------------- +class ALIGN16 CTransform +{ +public: + CTransform() {} + CTransform(const Vector3D& v, const Quaternion& q) : m_vPosition(v), m_orientation(q) {} + CTransform(const Vector3D& v, const QAngle& a) : m_vPosition(v) + { + AngleQuaternion(a, m_orientation); + } + + VectorAligned m_vPosition; + QuaternionAligned m_orientation; + + bool IsValid() const + { + return m_vPosition.IsValid() && m_orientation.IsValid(); + } + + bool operator==(const CTransform& v) const; ///< exact equality check + bool operator!=(const CTransform& v) const; + + // for API compatibility with matrix3x4_t + inline void InitFromQAngles(const QAngle& angles, const Vector3D& vPosition = vec3_origin); + inline void InitFromMatrix(const matrix3x4_t& transform); + inline void InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition = vec3_origin); + + inline Quaternion ToQuaternion() const; + inline QAngle ToQAngle() const; + inline matrix3x4_t ToMatrix() const; + + inline void SetToIdentity(); + + inline void SetOrigin(Vector3D const& vPos) { m_vPosition = vPos; } + inline void SetAngles(QAngle const& vAngles); + inline Vector3D GetOrigin(void) const { return m_vPosition; } + + inline void GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, 
Vector3D* pUp) const; + inline Vector3D GetForward() const; + inline Vector3D TransformVector(const Vector3D& v0) const; + inline Vector3D RotateVector(const Vector3D& v0) const; + inline Vector3D TransformVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateVectorByInverse(const Vector3D& v0) const; + inline Vector3D RotateExtents(const Vector3D& vBoxExtents) const; // these are extents and must remain positive/symmetric after rotation + inline void TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + inline void RotateAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const; + //inline void TransformPlane( const cplane_t &inPlane, cplane_t &outPlane ) const; + //inline void InverseTransformPlane( const cplane_t &inPlane, cplane_t &outPlane ) const; + + /// Computes an inverse. 
Uses the 'TR' naming to be consistent with the same method in matrix3x4_t (which only works with orthonormal matrices) + inline void InverseTR(CTransform& out) const; + +public: + CTransform& operator=(const CTransformUnaligned& i); +} ALIGN16_POST; + + +extern const CTransform g_TransformIdentity; + + +//----------------------------------------------------------------------------- +// Represents an unaligned position + orientation using quaternions, +// used only for copying data around +//----------------------------------------------------------------------------- +class CTransformUnaligned +{ +public: + CTransformUnaligned() {} + CTransformUnaligned(const Vector3D& v, const Quaternion& q) : m_vPosition(v), m_orientation(q) {} + CTransformUnaligned(const CTransform& transform) : m_vPosition(transform.m_vPosition), m_orientation(transform.m_orientation) {} + CTransform AsTransform() const { return CTransform(m_vPosition, m_orientation); } + + Vector3D m_vPosition; + Quaternion m_orientation; + + bool IsValid() const + { + return m_vPosition.IsValid() && m_orientation.IsValid(); + } + +public: + CTransformUnaligned& operator=(const CTransform& i); +}; + + +//----------------------------------------------------------------------------- +// Inline methods +//----------------------------------------------------------------------------- +inline CTransform& CTransform::operator=(const CTransformUnaligned& i) +{ + m_vPosition = i.m_vPosition; + m_orientation = i.m_orientation; + return *this; +} + +inline CTransformUnaligned& CTransformUnaligned::operator=(const CTransform& i) +{ + m_vPosition = i.m_vPosition; + m_orientation = i.m_orientation; + return *this; +} + + +//----------------------------------------------------------------------------- +// Other methods +//----------------------------------------------------------------------------- +void ConcatTransforms(const CTransform& in1, const CTransform& in2, CTransform& out); +void TransformSlerp(const CTransform& p, 
const CTransform& q, float t, CTransform& qt); +void TransformLerp(const CTransform& p, const CTransform& q, float t, CTransform& qt); +void TransformMatrix(const CTransform& in, matrix3x4_t& out); +void TransformMatrix(const CTransform& in, const Vector3D& vScaleIn, matrix3x4_t& out); + +inline void TransformMatrix(const CTransform& in, float flScale, matrix3x4_t& out) +{ + QuaternionMatrix(in.m_orientation, in.m_vPosition, Vector3D(flScale, flScale, flScale), out); +} + +inline float TransformNormalize(CTransform& in) +{ + return QuaternionNormalize(in.m_orientation); +} + +void TransformMatrix(const CTransformUnaligned& in, matrix3x4_t& out); +void MatrixTransform(const matrix3x4_t& in, CTransform& out); +void MatrixTransform(const matrix3x4_t& in, CTransformUnaligned& out); +void MatrixTransform(const matrix3x4_t& in, CTransform& out, Vector3D& vScaleOut); + +inline void MatrixTransform(const matrix3x4_t& in, CTransform& out, float& flScale) +{ + Vector3D vScale; + MatrixTransform(in, out, vScale); + flScale = vScale.LargestComponentValue(); +} + +void AngleTransform(const QAngle& angles, const Vector3D& origin, CTransform& out); +void SetIdentityTransform(CTransform& out); +void TransformVectorsFLU(const CTransform& in, Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp); +void TransformVectorsForward(const CTransform& in, Vector3D* pForward); + +inline const CTransform GetIdentityTransform() +{ + CTransform out; + SetIdentityTransform(out); + return out; +} + +inline const CTransform MatrixTransform(const matrix3x4_t& in) +{ + CTransform out; + MatrixTransform(in, out); + return out; +} + +inline const matrix3x4_t TransformMatrix(const CTransform& in) +{ + matrix3x4_t out; + TransformMatrix(in, out); + return out; +} +inline const matrix3x4_t TransformMatrix(const CTransformUnaligned& in) +{ + matrix3x4_t out; + TransformMatrix(in, out); + return out; +} + +inline const CTransform ConcatTransforms(const CTransform& in1, const CTransform& in2) +{ + CTransform 
result; + ConcatTransforms(in1, in2, result); + return result; +} + + +void TransformInvert(const CTransform& in, CTransform& out); +void AxisAngleTransform(const Vector3D& vecAxis, float flAngleDegrees, CTransform& out); +void VectorIRotate(const Vector3D& v, const CTransform& t, Vector3D& out); +void VectorITransform(const Vector3D& v, const CTransform& t, Vector3D& out); + +inline Vector3D TransformPoint(const CTransformUnaligned& tm, const Vector3D& p) +{ + return Vector3D( + tm.m_vPosition.x + (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z, + tm.m_vPosition.y + (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z, + tm.m_vPosition.z + (2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z + ); +} + +// TODO: implement in SIMD? 
+inline Vector3D TransformPoint(const CTransform& tm, const Vector3D& p) +{ + return Vector3D( + tm.m_vPosition.x + (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z, + tm.m_vPosition.y + (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z, + tm.m_vPosition.z + (2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z + ); +} + + +template < class T > +inline void TransformPoint(const T& tm, const Vector3D& p, Vector3D& out) +{ + out.x = tm.m_vPosition.x + (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z; + out.y = tm.m_vPosition.y + (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z; + out.z = tm.m_vPosition.z + (2.0f * 
tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z; +} + +template < class T > +inline void RotatePoint(const T& tm, const Vector3D& p, Vector3D& out) +{ + out.x = (1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.x + (2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.z; + out.y = (2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z) * p.x + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z) * p.y + (2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.z; + out.z = (2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y) * p.x + (2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x) * p.y + (1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y) * p.z; +} + + +inline const CTransform TransformInvert(const CTransform& in) +{ + CTransform out; + TransformInvert(in, out); + return out; +} + +// Transform equality test +bool TransformsAreEqual(const CTransform& src1, const CTransform& src2, float flPosTolerance = 1e-2, float flRotTolerance = 1e-1f); + +// Computes world-space transforms given local-space transforms + parent info +// The start of the pTransforms array (nRootTransformCount # of transforms) must be filled with +// the root transforms which have no parent. 
The end of the pTransforms array (nTransformCount # of transforms) +// must be filled with local-space transforms which are relative to other transforms, including possibly the +// root transforms. Therefore, (nRootTransformCount + nTransformCount) # of transforms must be passed into pTransforms. +// Only nTransformCount parent indices should be passed in. +// Parent indices are relative to the entire array, so a parent index of 0 indicates the first element +// of the array, which is always a root transform. -1 parent index is *illegal* +// Parent indices must always be sorted so that the index transforms earlier in the array. +// The transforms are modified in-place. +void TransformToWorldSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms); +void TransformToParentSpace(int nRootTransformCount, int nTransformCount, const int* pParentIndices, CTransform* pTransforms); + + +inline void CTransform::InitFromQAngles(const QAngle& angles, const Vector3D& vPosition) +{ + AngleQuaternion(angles, m_orientation); + m_vPosition = vPosition; +} + +inline void CTransform::InitFromMatrix(const matrix3x4_t& transform) +{ + m_orientation = MatrixQuaternion(transform); + m_vPosition = transform.GetOrigin(); +} + +inline void CTransform::InitFromQuaternion(const Quaternion& orientation, const Vector3D& vPosition) +{ + m_orientation = orientation; + m_vPosition = vPosition; +} + +inline void CTransform::SetAngles(QAngle const& vAngles) +{ + AngleQuaternion(vAngles, m_orientation); +} + +inline Quaternion CTransform::ToQuaternion() const +{ + return m_orientation; +} +inline QAngle CTransform::ToQAngle() const +{ + QAngle angles; + QuaternionAngles(m_orientation, angles); + return angles; +} + +inline matrix3x4_t CTransform::ToMatrix() const +{ + return TransformMatrix(*this); +} + +inline void CTransform::SetToIdentity() +{ + m_vPosition = vec3_origin; + m_orientation = quat_identity; +} + +inline void 
CTransform::GetBasisVectorsFLU(Vector3D* pForward, Vector3D* pLeft, Vector3D* pUp) const +{ + TransformVectorsFLU(*this, pForward, pLeft, pUp); +} + +inline Vector3D CTransform::GetForward() const +{ + Vector3D vForward; + TransformVectorsForward(*this, &vForward); + return vForward; +} + +inline Vector3D CTransform::TransformVector(const Vector3D& v0) const +{ + return TransformPoint(*this, v0); +} + +inline Vector3D CTransform::RotateVector(const Vector3D& v0) const +{ + Vector3D vOut; + RotatePoint(*this, v0, vOut); + return vOut; +} + +inline Vector3D CTransform::TransformVectorByInverse(const Vector3D& v0) const +{ + Vector3D vOut; + VectorITransform(v0, *this, vOut); + return vOut; +} + +inline Vector3D CTransform::RotateVectorByInverse(const Vector3D& v0) const +{ + Vector3D vOut; + VectorIRotate(v0, *this, vOut); + return vOut; +} + +inline bool CTransform::operator==(const CTransform& t) const +{ + return t.m_vPosition == m_vPosition && t.m_orientation == m_orientation; +} + +inline bool CTransform::operator!=(const CTransform& t) const +{ + return t.m_vPosition != m_vPosition || t.m_orientation != m_orientation; +} + +// PERFORMANCE: No native versions of these but implement them on matrix for convenient access +inline void CTransform::TransformAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().TransformAABB(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void CTransform::TransformAABBByInverse(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().TransformAABBByInverse(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void CTransform::RotateAABB(const Vector3D& vecMinsIn, const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().RotateAABB(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} +inline void CTransform::RotateAABBByInverse(const Vector3D& vecMinsIn, 
const Vector3D& vecMaxsIn, Vector3D& vecMinsOut, Vector3D& vecMaxsOut) const +{ + ToMatrix().RotateAABBByInverse(vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut); +} + +inline void CTransform::InverseTR(CTransform& out) const +{ + matrix3x4_t xForm = ToMatrix(); + out = xForm.InverseTR().ToCTransform(); +} + + +// transform conversion operators on matrix3x4_t +inline void matrix3x4_t::InitFromCTransform(const CTransform& transform) +{ + TransformMatrix(transform, *this); +} +inline CTransform matrix3x4_t::ToCTransform() const +{ + return MatrixTransform(*this); +} + + +#endif // TRANSFORM \ No newline at end of file diff --git a/r5dev/mathlib/vector.h b/r5dev/mathlib/vector.h index 833c1c8d..8fca4b5b 100644 --- a/r5dev/mathlib/vector.h +++ b/r5dev/mathlib/vector.h @@ -8,7 +8,6 @@ #ifndef VECTOR_H #define VECTOR_H -#define NO_MALLOC_OVERRIDE #ifdef _WIN32 #pragma once @@ -23,7 +22,7 @@ #if defined( _PS3 ) //#include #include -#include "platform.h" +#include "tier0/platform.h" #include "mathlib/math_pfns.h" #endif @@ -36,16 +35,19 @@ #define ALIGN16_POST #endif +#define NO_MALLOC_OVERRIDE #if !defined(NO_MALLOC_OVERRIDE) #include "tier0/memalloc.h" #endif // !NO_MALLOC_OVERRIDE #include "tier0/dbg.h" #include "tier0/platform.h" +#if !defined( __SPU__ ) #include "tier0/threadtools.h" +#endif #include "mathlib/vector2d.h" #include "mathlib/math_pfns.h" -#include "mathlib/bits.h" #include "vstdlib/random.h" + // Uncomment this to add extra Asserts to check for NANs, uninitialized vecs, etc. //#define VECTOR_PARANOIA 1 @@ -92,6 +94,7 @@ public: // Got any nasty NAN's? bool IsValid() const; + bool IsReasonable(float range = 1000000) const; ///< Check for reasonably-sized values (if used as a game world position) void Invalidate(); // array access... 
@@ -157,13 +160,15 @@ public:
 	inline bool IsZeroFast() const RESTRICT
 	{
 		static_assert(sizeof(vec_t) == sizeof(int));
-		return (*(const int*)(&x) == 0 &&
-			*(const int*)(&y) == 0 &&
-			*(const int*)(&z) == 0);
+		return (*reinterpret_cast<const int*>(&x) == 0 &&
+			*reinterpret_cast<const int*>(&y) == 0 &&
+			*reinterpret_cast<const int*>(&z) == 0);
 	}
 
-	vec_t	NormalizeInPlace();
-	Vector3D	Normalized() const;
+	vec_t	NormalizeInPlace();                                ///< Normalize all components
+	vec_t	NormalizeInPlaceSafe(const Vector3D& vFallback);   ///< Normalize all components
+	Vector3D	Normalized() const;                            ///< Return normalized vector
+	Vector3D	NormalizedSafe(const Vector3D& vFallback)const; ///< Return normalized vector, falling back to vFallback if the length of this is 0
 	bool	IsLengthGreaterThan(float val) const;
 	bool	IsLengthLessThan(float val) const;
 
@@ -203,6 +208,9 @@ public:
 	// returns 0, 1, 2 corresponding to the component with the largest absolute value
 	inline int LargestComponent() const;
+	inline vec_t LargestComponentValue() const;
+	inline int SmallestComponent() const;
+	inline vec_t SmallestComponentValue() const;
 
 	// 2d
 	vec_t	Length2D(void) const;
@@ -243,7 +251,8 @@ private:
 #endif
 };
 
-
+// Zero the object -- necessary for CNetworkVar and possibly other cases.
+inline void EnsureValidValue(Vector3D& x) { x.Zero(); } #define USE_M64S defined( PLATFORM_WINDOWS_PC ) @@ -608,8 +617,14 @@ Vector3D RandomVector(vec_t minVal, vec_t maxVal); #endif float RandomVectorInUnitSphere(Vector3D* pVector); +Vector3D RandomVectorInUnitSphere(); +Vector3D RandomVectorInUnitSphere(IUniformRandomStream* pRnd); + float RandomVectorInUnitCircle(Vector2D* pVector); +Vector3D RandomVectorOnUnitSphere(); +Vector3D RandomVectorOnUnitSphere(IUniformRandomStream* pRnd); + //----------------------------------------------------------------------------- // @@ -666,6 +681,7 @@ inline void Vector3D::Init(vec_t ix, vec_t iy, vec_t iz) CHECK_VALID(*this); } +#if !defined(__SPU__) inline void Vector3D::Random(vec_t minVal, vec_t maxVal) { x = RandomFloat(minVal, maxVal); @@ -673,6 +689,7 @@ inline void Vector3D::Random(vec_t minVal, vec_t maxVal) z = RandomFloat(minVal, maxVal); CHECK_VALID(*this); } +#endif // This should really be a single opcode on the PowerPC (move r0 onto the vec reg) inline void Vector3D::Zero() @@ -749,6 +766,14 @@ inline bool Vector3D::IsValid() const return IsFinite(x) && IsFinite(y) && IsFinite(z); } +//----------------------------------------------------------------------------- +// IsReasonable? 
+//----------------------------------------------------------------------------- +inline bool Vector3D::IsReasonable(float range) const +{ + return (Length() < range); +} + //----------------------------------------------------------------------------- // Invalidate //----------------------------------------------------------------------------- @@ -1290,9 +1315,10 @@ inline Vector3D VectorLerp(const Vector3D& src1, const Vector3D& src2, vec_t t) //----------------------------------------------------------------------------- // Temporary storage for vector results so const Vector& results can be returned //----------------------------------------------------------------------------- -/*inline Vector& AllocTempVector() +#if !defined(__SPU__) +inline Vector3D& AllocTempVector() { - static Vector s_vecTemp[128]; + static Vector3D s_vecTemp[128]; static CInterlockedInt s_nIndex; int nIndex; @@ -1307,9 +1333,9 @@ inline Vector3D VectorLerp(const Vector3D& src1, const Vector3D& src2, vec_t t) } ThreadPause(); } - return s_vecTemp[nIndex & 0xffff]; -}*/ - + return s_vecTemp[nIndex]; +} +#endif //----------------------------------------------------------------------------- @@ -1345,6 +1371,40 @@ inline int Vector3D::LargestComponent() const return Z_INDEX; } +inline int Vector3D::SmallestComponent() const +{ + float flAbsx = fabs(x); + float flAbsy = fabs(y); + float flAbsz = fabs(z); + if (flAbsx < flAbsy) + { + if (flAbsx < flAbsz) + return X_INDEX; + return Z_INDEX; + } + if (flAbsy < flAbsz) + return Y_INDEX; + return Z_INDEX; +} + + +inline float Vector3D::LargestComponentValue() const +{ + float flAbsX = fabs(x); + float flAbsY = fabs(y); + float flAbsZ = fabs(z); + return MAX(MAX(flAbsX, flAbsY), flAbsZ); +} + +inline float Vector3D::SmallestComponentValue() const +{ + float flAbsX = fabs(x); + float flAbsY = fabs(y); + float flAbsZ = fabs(z); + return MIN(MIN(flAbsX, flAbsY), flAbsZ); +} + + inline void CrossProduct(const Vector3D& a, const Vector3D& b, Vector3D& 
result) { CHECK_VALID(a); @@ -1390,9 +1450,9 @@ inline vec_t Vector3D::Length(void) const // Normalization //----------------------------------------------------------------------------- - +/* // FIXME: Can't use until we're un-macroed in mathlib.h -inline vec_t VectorNormalize( Vector3D& v ) +inline vec_t VectorNormalize( Vector& v ) { Assert( v.IsValid() ); vec_t l = v.Length(); @@ -1408,7 +1468,7 @@ inline vec_t VectorNormalize( Vector3D& v ) } return l; } - +*/ // check a point against a box @@ -1432,6 +1492,35 @@ inline vec_t Vector3D::DistTo(const Vector3D& vOther) const } +//----------------------------------------------------------------------------- +// Float equality with tolerance +//----------------------------------------------------------------------------- +inline bool FloatsAreEqual(float f1, float f2, float flTolerance) +{ + // Sergiy: the implementation in Source2 is very inefficient, trying to start with a clean slate here, hopefully will reintegrate back to Source2 + const float flAbsToleranceThreshold = 0.000003814697265625; // 2 ^ -FLOAT_EQUALITY_NOISE_CUTOFF, + return fabsf(f1 - f2) <= flTolerance * (fabsf(f1) + fabsf(f2)) + flAbsToleranceThreshold; +} + + +//----------------------------------------------------------------------------- +// Vector equality with percentage tolerance +// are all components within flPercentageTolerance (expressed as a percentage of the larger component, per component)? 
+// and all components have the same sign +//----------------------------------------------------------------------------- +inline bool VectorsAreWithinPercentageTolerance(const Vector3D& src1, const Vector3D& src2, float flPercentageTolerance) +{ + if (!FloatsAreEqual(src1.x, src2.x, flPercentageTolerance)) + return false; + + if (!FloatsAreEqual(src1.y, src2.y, flPercentageTolerance)) + return false; + + return (FloatsAreEqual(src1.z, src2.z, flPercentageTolerance)); +} + + + //----------------------------------------------------------------------------- // Vector equality with tolerance //----------------------------------------------------------------------------- @@ -1475,6 +1564,11 @@ inline void VectorAbs(const Vector3D& src, Vector3D& dst) dst.z = FloatMakePositive(src.z); } +inline Vector3D VectorAbs(const Vector3D& src) +{ + return Vector3D(fabsf(src.x), fabsf(src.y), fabsf(src.z)); +} + //----------------------------------------------------------------------------- // @@ -1620,6 +1714,7 @@ inline float ComputeVolume(const Vector3D& vecMins, const Vector3D& vecMaxs) return DotProduct(vecDelta, vecDelta); } +#if !defined(__SPU__) // Get a random vector. 
inline Vector3D RandomVector(float minVal, float maxVal) { @@ -1627,6 +1722,7 @@ inline Vector3D RandomVector(float minVal, float maxVal) random.Random(minVal, maxVal); return random; } +#endif #endif //slow @@ -1668,6 +1764,13 @@ inline bool operator!=(const Vector3D& v, float const* f) // you won't get an "u void VectorPerpendicularToVector(Vector3D const& in, Vector3D* pvecOut); +inline const Vector3D VectorPerpendicularToVector(const Vector3D& in) +{ + Vector3D out; + VectorPerpendicularToVector(in, &out); + return out; +} + //----------------------------------------------------------------------------- // AngularImpulse //----------------------------------------------------------------------------- @@ -1676,12 +1779,14 @@ typedef Vector3D AngularImpulse; #ifndef VECTOR_NO_SLOW_OPERATIONS +#if !defined(__SPU__) inline AngularImpulse RandomAngularImpulse(float minVal, float maxVal) { AngularImpulse angImp; angImp.Random(minVal, maxVal); return angImp; } +#endif #endif @@ -1691,6 +1796,8 @@ inline AngularImpulse RandomAngularImpulse(float minVal, float maxVal) //----------------------------------------------------------------------------- class RadianEuler; +class DegreeEuler; +class QAngle; class Quaternion // same data-layout as engine's vec4_t, { // which is a vec_t[4] @@ -1705,9 +1812,11 @@ public: #endif } inline Quaternion(vec_t ix, vec_t iy, vec_t iz, vec_t iw) : x(ix), y(iy), z(iz), w(iw) { } - inline Quaternion(RadianEuler const& angle); // evil auto type promotion!!! 
+ inline explicit Quaternion(RadianEuler const& angle); + inline explicit Quaternion(DegreeEuler const& angle); inline void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f, vec_t iw = 0.0f) { x = ix; y = iy; z = iz; w = iw; } + inline void Init(const Vector3D& vImaginaryPart, float flRealPart) { x = vImaginaryPart.x; y = vImaginaryPart.y; z = vImaginaryPart.z; w = flRealPart; } bool IsValid() const; void Invalidate(); @@ -1717,19 +1826,47 @@ public: inline Quaternion Conjugate() const { return Quaternion(-x, -y, -z, w); } + // + const Vector3D GetForward()const; + const Vector3D GetLeft()const; + const Vector3D GetUp()const; + vec_t* Base() { return (vec_t*)this; } const vec_t* Base() const { return (vec_t*)this; } // convenience for debugging inline void Print() const; + // Imaginary part + Vector3D& ImaginaryPart() { return *(Vector3D*)this; } + const Vector3D& ImaginaryPart() const { return *(Vector3D*)this; } + float& RealPart() { return w; } + float RealPart() const { return w; } + inline QAngle ToQAngle() const; + inline struct matrix3x4_t ToMatrix() const; + // array access... vec_t operator[](int i) const; vec_t& operator[](int i); + inline Quaternion operator+(void) const { return *this; } + inline Quaternion operator-(void) const { return Quaternion(-x, -y, -z, -w); } + vec_t x, y, z, w; }; +// Random Quaternion that is UNIFORMLY distributed over the S^3 +// should be good for random generation of orientation for unit tests and for game +// NOTE: Nothing trivial like Quaternion(RandomAngle(0,180)) will do the trick , +// one needs to take special care to generate a uniformly distributed quaternion. 
+const Quaternion RandomQuaternion();
+const Quaternion RandomQuaternion( IUniformRandomStream *pRnd );
+inline const Quaternion Conjugate(const Quaternion& q)
+{
+	return Quaternion(-q.x, -q.y, -q.z, q.w);
+}
+
+
 //-----------------------------------------------------------------------------
 // Array access
 
@@ -1767,10 +1904,45 @@ inline bool Quaternion::operator!=(const Quaternion& src) const
 void Quaternion::Print() const
 {
 #ifndef _CERT
+#if !defined(__SPU__)
 	DevMsg(eDLL_T::ENGINE, "q{ %.3fi + %.3fj + %.3fk + %.3f }", x, y, z, w);
 #endif
+#endif
 }
+
+
+
+//-----------------------------------------------------------------------------
+// Binary operators
+//-----------------------------------------------------------------------------
+inline Quaternion operator+(const Quaternion& q1, const Quaternion& q2)
+{
+	return Quaternion(q1.x + q2.x, q1.y + q2.y, q1.z + q2.z, q1.w + q2.w);
+}
+
+inline Quaternion operator-(const Quaternion& q1, const Quaternion& q2)
+{
+	return Quaternion(q1.x - q2.x, q1.y - q2.y, q1.z - q2.z, q1.w - q2.w);
+}
+
+inline Quaternion operator*(float s, const Quaternion& q)
+{
+	return Quaternion(s * q.x, s * q.y, s * q.z, s * q.w);
+}
+
+inline Quaternion operator*(const Quaternion& q, float s)
+{
+	return Quaternion(q.x * s, q.y * s, q.z * s, q.w * s);
+}
+
+inline Quaternion operator/(const Quaternion& q, float s)
+{
+	Assert(s != 0.0f);
+	return Quaternion(q.x / s, q.y / s, q.z / s, q.w / s);
+}
+
+
 //-----------------------------------------------------------------------------
 // Quaternion equality with tolerance
 //-----------------------------------------------------------------------------
@@ -1898,17 +2070,35 @@ public:
 #endif
 } ALIGN16_POST;
+
+//-----------------------------------------------------------------------------
+// Src data hasn't changed, but work data is of a form more friendly for SPU
+//-----------------------------------------------------------------------------
+#if defined( _PS3 )
+//typedef Vector BoneVector;
+typedef VectorAligned
BoneVector;
+typedef QuaternionAligned BoneQuaternion;
+typedef QuaternionAligned BoneQuaternionAligned;
+#else
+typedef Vector3D BoneVector;
+typedef Quaternion BoneQuaternion;
+typedef QuaternionAligned BoneQuaternionAligned;
+#endif
+
 //-----------------------------------------------------------------------------
 // Radian Euler angle aligned to axis (NOT ROLL/PITCH/YAW)
 //-----------------------------------------------------------------------------
 class QAngle;
+#define VEC_DEG2RAD( a ) ( (a) * (3.14159265358979323846f / 180.0f) )
+#define VEC_RAD2DEG( a ) ( (a) * (180.0f / 3.14159265358979323846f) )
 class RadianEuler
 {
 public:
 	inline RadianEuler(void) { }
 	inline RadianEuler(vec_t X, vec_t Y, vec_t Z) { x = X; y = Y; z = Z; }
-	inline RadianEuler(Quaternion const& q);	// evil auto type promotion!!!
-	inline RadianEuler(QAngle const& angles);	// evil auto type promotion!!!
+	inline explicit RadianEuler(Quaternion const& q);
+	inline explicit RadianEuler(QAngle const& angles);
+	inline explicit RadianEuler(DegreeEuler const& angles);
 
 	// Initialization
 	inline void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f) { x = ix; y = iy; z = iz; }
@@ -1941,6 +2131,18 @@ inline bool Quaternion::IsValid() const
 	return IsFinite(x) && IsFinite(y) && IsFinite(z) && IsFinite(w);
 }
 
+
+FORCEINLINE float QuaternionLength(const Quaternion& q)
+{
+	return sqrtf(q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w);
+}
+
+FORCEINLINE bool QuaternionIsNormalized(const Quaternion& q, float flTolerance = 1e-6f)
+{
+	float flLen = QuaternionLength(q);
+	return (fabsf(flLen - 1.0f) < flTolerance);
+}
+
 inline void Quaternion::Invalidate()
 {
 	//#ifdef _DEBUG
@@ -2003,6 +2205,116 @@ inline vec_t RadianEuler::operator[](int i) const
 }
 
 
+//-----------------------------------------------------------------------------
+// Degree Euler angle aligned to axis (NOT ROLL/PITCH/YAW)
+//-----------------------------------------------------------------------------
+class DegreeEuler
+{
+public:
+	///\name
Initialization + //@{ + inline DegreeEuler(void) ///< Create with un-initialized components. If VECTOR_PARANOIA is set, will init with NANS. + { + // Initialize to NAN to catch errors +#ifdef VECTOR_PARANOIA + x = y = z = VEC_T_NAN; +#endif + } + inline DegreeEuler(vec_t X, vec_t Y, vec_t Z) { x = X; y = Y; z = Z; } + inline explicit DegreeEuler(Quaternion const& q); + inline explicit DegreeEuler(QAngle const& angles); + inline explicit DegreeEuler(RadianEuler const& angles); + + // Initialization + inline void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f) { x = ix; y = iy; z = iz; } + + inline QAngle ToQAngle() const; + + // conversion to qangle + bool IsValid() const; + void Invalidate(); + + inline vec_t* Base() { return &x; } + inline const vec_t* Base() const { return &x; } + + // array access... + vec_t operator[](int i) const; + vec_t& operator[](int i); + + vec_t x, y, z; +}; + + +//----------------------------------------------------------------------------- +// DegreeEuler equality with tolerance +//----------------------------------------------------------------------------- +inline bool DegreeEulersAreEqual(const DegreeEuler& src1, const DegreeEuler& src2, float tolerance = 0.0f) +{ + if (FloatMakePositive(src1.x - src2.x) > tolerance) + return false; + if (FloatMakePositive(src1.y - src2.y) > tolerance) + return false; + return (FloatMakePositive(src1.z - src2.z) <= tolerance); +} + +/* +extern void AngleQuaternion( DegreeEuler const &angles, Quaternion &qt ); +extern void QuaternionAngles( Quaternion const &q, DegreeEuler &angles ); +extern void QuaternionVectorsFLU( Quaternion const &q, Vector *pForward, Vector *pLeft, Vector *pUp ); +*/ + +inline Quaternion::Quaternion(DegreeEuler const& angles) +{ + RadianEuler radians(angles); + AngleQuaternion(radians, *this); +} + +inline DegreeEuler::DegreeEuler(RadianEuler const& angles) +{ + Init(VEC_RAD2DEG(angles.x), VEC_RAD2DEG(angles.y), VEC_RAD2DEG(angles.z)); +} + +inline 
RadianEuler::RadianEuler(DegreeEuler const& angles) +{ + Init(VEC_DEG2RAD(angles.x), VEC_DEG2RAD(angles.y), VEC_DEG2RAD(angles.z)); +} + +inline DegreeEuler::DegreeEuler(Quaternion const& q) +{ + RadianEuler radians(q); + Init(VEC_RAD2DEG(radians.x), VEC_RAD2DEG(radians.y), VEC_RAD2DEG(radians.z)); +} + +inline bool DegreeEuler::IsValid() const +{ + return IsFinite(x) && IsFinite(y) && IsFinite(z); +} + +inline void DegreeEuler::Invalidate() +{ + //#ifdef VECTOR_PARANOIA + x = y = z = VEC_T_NAN; + //#endif +} + + +//----------------------------------------------------------------------------- +// Array access +//----------------------------------------------------------------------------- +inline vec_t& DegreeEuler::operator[](int i) +{ + Assert((i >= 0) && (i < 3)); + return ((vec_t*)this)[i]; +} + +inline vec_t DegreeEuler::operator[](int i) const +{ + Assert((i >= 0) && (i < 3)); + return ((vec_t*)this)[i]; +} + + + //----------------------------------------------------------------------------- // Degree Euler QAngle pitch, yaw, roll //----------------------------------------------------------------------------- @@ -2061,6 +2373,12 @@ public: // No assignment operators either... QAngle& operator=(const QAngle& src); + void Normalize(); + void NormalizePositive(); + + inline struct matrix3x4_t ToMatrix() const; + inline Quaternion ToQuaternion() const; + #ifndef VECTOR_NO_SLOW_OPERATIONS // copy constructors @@ -2080,6 +2398,9 @@ private: #endif }; +// Zero the object -- necessary for CNetworkVar and possibly other cases. 
+inline void EnsureValidValue(QAngle& x) { x.Init(); } + //----------------------------------------------------------------------------- // Allows us to specifically pass the vector by value when we need to //----------------------------------------------------------------------------- @@ -2141,6 +2462,26 @@ inline void QAngle::Init(vec_t ix, vec_t iy, vec_t iz) CHECK_VALID(*this); } + +extern float AngleNormalize(float angle); +extern float AngleNormalizePositive(float angle); + +inline void QAngle::Normalize() +{ + x = AngleNormalize(x); + y = AngleNormalize(y); + z = AngleNormalize(z); +} + +inline void QAngle::NormalizePositive() +{ + x = AngleNormalizePositive(x); + y = AngleNormalizePositive(y); + z = AngleNormalizePositive(z); +} + + +#if !defined(__SPU__) inline void QAngle::Random(vec_t minVal, vec_t maxVal) { x = RandomFloat(minVal, maxVal); @@ -2148,9 +2489,11 @@ inline void QAngle::Random(vec_t minVal, vec_t maxVal) z = RandomFloat(minVal, maxVal); CHECK_VALID(*this); } +#endif #ifndef VECTOR_NO_SLOW_OPERATIONS +#if !defined(__SPU__) inline QAngle RandomAngle(float minVal, float maxVal) { Vector3D random; @@ -2158,6 +2501,7 @@ inline QAngle RandomAngle(float minVal, float maxVal) QAngle ret(random.x, random.y, random.z); return ret; } +#endif #endif @@ -2169,17 +2513,22 @@ inline RadianEuler::RadianEuler(QAngle const& angles) angles.y * 3.14159265358979323846f / 180.f); } - - +inline DegreeEuler::DegreeEuler(QAngle const& angles) +{ + Init(angles.z, angles.x, angles.y); +} inline QAngle RadianEuler::ToQAngle(void) const { - return QAngle( - y * 180.f / 3.14159265358979323846f, - z * 180.f / 3.14159265358979323846f, - x * 180.f / 3.14159265358979323846f); + return QAngle(VEC_RAD2DEG(y), VEC_RAD2DEG(z), VEC_RAD2DEG(x)); } +inline QAngle DegreeEuler::ToQAngle() const +{ + return QAngle(y, z, x); +} + + //----------------------------------------------------------------------------- // assignment 
//----------------------------------------------------------------------------- @@ -2415,6 +2764,15 @@ inline void AngularImpulseToQAngle(const AngularImpulse& impulse, QAngle& angles angles.z = impulse.x; } +inline QAngle Quaternion::ToQAngle() const +{ + extern void QuaternionAngles(const Quaternion & q, QAngle & angles); + + QAngle anglesOut; + QuaternionAngles(*this, anglesOut); + return anglesOut; +} + #if !defined( _X360 ) && !defined( _PS3 ) FORCEINLINE vec_t InvRSquared(const float* v) @@ -2430,7 +2788,11 @@ FORCEINLINE vec_t InvRSquared(const Vector3D& v) #else // call directly +#if defined(__SPU__) +FORCEINLINE float _VMX_InvRSquared(Vector& v) +#else FORCEINLINE float _VMX_InvRSquared(const Vector& v) +#endif { #if !defined (_PS3) XMVECTOR xmV = XMVector3ReciprocalLength(XMLoadVector3(v.Base())); @@ -2616,6 +2978,16 @@ inline vec_t Vector3D::NormalizeInPlace() return VectorNormalize(*this); } +inline vec_t Vector3D::NormalizeInPlaceSafe(const Vector3D& vFallback) +{ + float flLength = VectorNormalize(*this); + if (flLength == 0.0f) + { + *this = vFallback; + } + return flLength; +} + inline Vector3D Vector3D::Normalized() const { Vector3D norm = *this; @@ -2623,6 +2995,15 @@ inline Vector3D Vector3D::Normalized() const return norm; } + +inline Vector3D Vector3D::NormalizedSafe(const Vector3D& vFallback)const +{ + Vector3D vNorm = *this; + float flLength = VectorNormalize(vNorm); + return (flLength != 0.0f) ? 
vNorm : vFallback; +} + + inline bool Vector3D::IsLengthGreaterThan(float val) const { return LengthSqr() > val * val; @@ -2633,5 +3014,68 @@ inline bool Vector3D::IsLengthLessThan(float val) const return LengthSqr() < val * val; } + +inline const Vector3D ScaleVector(const Vector3D& a, const Vector3D& b) +{ + return Vector3D(a.x * b.x, a.y * b.y, a.z * b.z); +} + + + +inline const Quaternion Exp(const Vector3D& v) +{ + float theta = v.Length(); + if (theta < 0.001f) + { + // limit case, cos(theta) ~= 1 - theta^2/2 + theta^4/24 + // sin(theta)/theta ~= 1 - theta^2/6 + theta^4/120 + float theta2_2 = theta * theta * 0.5f, theta4_24 = theta2_2 * theta2_2 * (1.0f / 6.0f); + float k = 1.0f - theta2_2 * (1.0f / 3.0f) + theta4_24 * 0.05f; + return Quaternion(k * v.x, k * v.y, k * v.z, 1 - theta2_2 + theta4_24); + } + else + { + float k = sinf(theta) / theta; + return Quaternion(k * v.x, k * v.y, k * v.z, cosf(theta)); + } +} + + +inline const Vector3D QuaternionLog(const Quaternion& q) +{ + Vector3D axis = q.ImaginaryPart(); + float sinTheta = axis.Length(), factor; + if (sinTheta > 0.001f) + { + // there's some substantial rotation; if w < 0, it's an over-180-degree rotation (in real space) + float theta = asinf(MIN(sinTheta, 1.0f)); + factor = (q.w < 0.0f ? 
M_PI_F - theta : theta) / sinTheta; + } + else + { + // ArcSin[x]/x = 1 + x^2/6 + x^4 * 3/40 + o( x^5 ) + float sinTheta2 = sinTheta * sinTheta; + float sinTheta4 = sinTheta2 * sinTheta2; + factor = (1 + sinTheta2 * (1.0f / 6.0f) + sinTheta4 * (3.0f / 40.0f)); + if (q.w < 0) + { + factor = -factor; // because the axis of rotation is not defined, we'll just consider this rotation to be close enough to identity + } + } + return axis * factor; +} + + + +inline float Snap(float a, float flSnap) +{ + return floorf(a / flSnap + 0.5f) * flSnap; +} + +inline const Vector3D Snap(const Vector3D& a, float flSnap) +{ + return Vector3D(Snap(a.x, flSnap), Snap(a.y, flSnap), Snap(a.z, flSnap)); +} + #endif diff --git a/r5dev/mathlib/vector2d.h b/r5dev/mathlib/vector2d.h index a66f2077..35d2791b 100644 --- a/r5dev/mathlib/vector2d.h +++ b/r5dev/mathlib/vector2d.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -19,13 +19,27 @@ // For vec_t, put this somewhere else? #include "tier0/basetypes.h" -// For rand(). We really need a library! 
-#include +// For RandomFloat() +#include "vstdlib/random.h" #include "tier0/dbg.h" #include "mathlib/bits.h" #include "mathlib/math_pfns.h" +#ifndef M_PI +#define M_PI 3.14159265358979323846 // matches value in gcc v2 math.h +#endif + +#ifndef M_PI_F +#define M_PI_F ((float)(M_PI)) +#endif + +#ifndef DEG2RAD +#define DEG2RAD( x ) ( (float)(x) * (float)(M_PI_F / 180.f) ) +#endif + +extern void inline SinCos(float radians, float* RESTRICT sine, float* RESTRICT cosine); + //========================================================= // 2D Vector2D //========================================================= @@ -37,9 +51,9 @@ public: vec_t x, y; // Construction/destruction - Vector2D(void); + Vector2D(); Vector2D(vec_t X, vec_t Y); - Vector2D(const float* pFloat); + explicit Vector2D(const float* pFloat); // Initialization void Init(vec_t ix = 0.0f, vec_t iy = 0.0f); @@ -196,7 +210,7 @@ void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& // constructors //----------------------------------------------------------------------------- -inline Vector2D::Vector2D(void) +inline Vector2D::Vector2D() { #ifdef _DEBUG // Initialize to NAN to catch errors @@ -238,11 +252,13 @@ inline void Vector2D::Init(vec_t ix, vec_t iy) Assert(IsValid()); } +#if !defined(__SPU__) inline void Vector2D::Random(float minVal, float maxVal) { - x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + x = RandomFloat(minVal, maxVal); + y = RandomFloat(minVal, maxVal); } +#endif inline void Vector2DClear(Vector2D& a) { @@ -439,6 +455,15 @@ inline void Vector2DDivide(const Vector2D& a, const Vector2D& b, Vector2D& c) c.y = a.y / b.y; } +inline void Vector2DRotate(const Vector2D& vIn, float flDegrees, Vector2D& vOut) +{ + float c, s; + SinCos(DEG2RAD(flDegrees), &s, &c); + + vOut.x = vIn.x * c - vIn.y * s; + vOut.y = vIn.x * s + vIn.y * c; +} + inline void Vector2DMA(const Vector2D& start, float 
s, const Vector2D& dir, Vector2D& result) { Assert(start.IsValid() && IsFinite(s) && dir.IsValid()); diff --git a/r5dev/mathlib/vector4d.h b/r5dev/mathlib/vector4d.h index 21585121..cf4e1387 100644 --- a/r5dev/mathlib/vector4d.h +++ b/r5dev/mathlib/vector4d.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -14,19 +14,19 @@ #endif #include -#include // For rand(). We really need a library! #include -#if !defined( _X360 ) -#include // For SSE +#if !defined( PLATFORM_PPC ) && !defined( _PS3 ) +#include // for sse #endif #include "tier0/basetypes.h" // For vec_t, put this somewhere else? #include "tier0/dbg.h" #include "mathlib/bits.h" #include "mathlib/math_pfns.h" - +#include "mathlib/vector.h" +#include "vstdlib/random.h" // forward declarations -class Vector3D; class Vector2D; +class Vector3D; //========================================================= // 4D Vector4D @@ -39,12 +39,13 @@ public: vec_t x, y, z, w; // Construction/destruction - Vector4D(void); + Vector4D(); Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W); - Vector4D(const float* pFloat); + explicit Vector4D(const float* pFloat); // Initialization void Init(vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f, vec_t iw = 0.0f); + void Init(const Vector3D& src, vec_t iw = 0.0f); // Got any nasty NAN's? 
bool IsValid() const; @@ -79,6 +80,13 @@ public: Vector4D& operator/=(const Vector4D& v); Vector4D& operator/=(float s); + Vector4D operator-(void) const; + Vector4D operator*(float fl) const; + Vector4D operator/(float fl) const; + Vector4D operator*(const Vector4D& v) const; + Vector4D operator+(const Vector4D& v) const; + Vector4D operator-(const Vector4D& v) const; + // negate the Vector4D components void Negate(); @@ -202,7 +210,7 @@ void Vector4DLerp(Vector4D const& src1, Vector4D const& src2, vec_t t, Vector4D& // constructors //----------------------------------------------------------------------------- -inline Vector4D::Vector4D(void) +inline Vector4D::Vector4D() { #ifdef _DEBUG // Initialize to NAN to catch errors @@ -237,20 +245,27 @@ inline Vector4D::Vector4D(const Vector4D& vOther) //----------------------------------------------------------------------------- // initialization //----------------------------------------------------------------------------- - inline void Vector4D::Init(vec_t ix, vec_t iy, vec_t iz, vec_t iw) { x = ix; y = iy; z = iz; w = iw; Assert(IsValid()); } +inline void Vector4D::Init(const Vector3D& src, vec_t iw) +{ + x = src.x; y = src.y; z = src.z; w = iw; + Assert(IsValid()); +} + +#if !defined(__SPU__) inline void Vector4D::Random(vec_t minVal, vec_t maxVal) { - x = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - y = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - z = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); - w = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + x = RandomFloat(minVal, maxVal); + y = RandomFloat(minVal, maxVal); + z = RandomFloat(minVal, maxVal); + w = RandomFloat(minVal, maxVal); } +#endif inline void Vector4DClear(Vector4D& a) { @@ -412,6 +427,52 @@ inline Vector4D& Vector4D::operator*=(Vector4D const& v) return *this; } +inline Vector4D Vector4D::operator-(void) const +{ + return Vector4D(-x, -y, -z, -w); +} + +inline Vector4D 
Vector4D::operator+(const Vector4D& v) const +{ + Vector4D res; + Vector4DAdd(*this, v, res); + return res; +} + +inline Vector4D Vector4D::operator-(const Vector4D& v) const +{ + Vector4D res; + Vector4DSubtract(*this, v, res); + return res; +} + + +inline Vector4D Vector4D::operator*(float fl) const +{ + Vector4D res; + Vector4DMultiply(*this, fl, res); + return res; +} + +inline Vector4D Vector4D::operator*(const Vector4D& v) const +{ + Vector4D res; + Vector4DMultiply(*this, v, res); + return res; +} + +inline Vector4D Vector4D::operator/(float fl) const +{ + Vector4D res; + Vector4DDivide(*this, fl, res); + return res; +} + +inline Vector4D operator*(float fl, const Vector4D& v) +{ + return v * fl; +} + inline Vector4D& Vector4D::operator/=(float fl) { Assert(fl != 0.0f); @@ -615,8 +676,10 @@ inline void Vector4DAligned::Set(vec_t X, vec_t Y, vec_t Z, vec_t W) inline void Vector4DAligned::InitZero(void) { -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) this->AsM128() = _mm_set1_ps(0.0f); +#elif defined(_PS3) + this->AsM128() = VMX_ZERO; #else this->AsM128() = __vspltisw(0); #endif @@ -626,11 +689,13 @@ inline void Vector4DAligned::InitZero(void) inline void Vector4DMultiplyAligned(Vector4DAligned const& a, Vector4DAligned const& b, Vector4DAligned& c) { Assert(a.IsValid() && b.IsValid()); -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) c.x = a.x * b.x; c.y = a.y * b.y; c.z = a.z * b.z; c.w = a.w * b.w; +#elif defined(_PS3) + c.AsM128() = __vec_mul(a.AsM128(), b.AsM128()); #else c.AsM128() = __vmulfp(a.AsM128(), b.AsM128()); #endif @@ -640,7 +705,7 @@ inline void Vector4DWeightMAD(vec_t w, Vector4DAligned const& vInA, Vector4DAlig { Assert(vInA.IsValid() && vInB.IsValid() && IsFinite(w)); -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) vOutA.x += vInA.x * w; vOutA.y += vInA.y * w; vOutA.z += vInA.z * w; @@ -650,6 +715,16 @@ inline void Vector4DWeightMAD(vec_t w, Vector4DAligned const& vInA, Vector4DAlig vOutB.y += vInB.y * w; vOutB.z += vInB.z * 
w; vOutB.w += vInB.w * w; +#elif defined(_PS3) +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + // GCC 4.1.1 + __m128 temp = vec_splats(w); +#else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + __m128 temp = __m128(w); +#endif //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + + vOutA.AsM128() = vec_madd(vInA.AsM128(), temp, vOutA.AsM128()); + vOutB.AsM128() = vec_madd(vInB.AsM128(), temp, vOutB.AsM128()); #else __vector4 temp; @@ -665,13 +740,23 @@ inline void Vector4DWeightMADSSE(vec_t w, Vector4DAligned const& vInA, Vector4DA { Assert(vInA.IsValid() && vInB.IsValid() && IsFinite(w)); -#if !defined( _X360 ) +#if !defined( PLATFORM_PPC ) // Replicate scalar float out to 4 components __m128 packed = _mm_set1_ps(w); // 4D SSE Vector MAD vOutA.AsM128() = _mm_add_ps(vOutA.AsM128(), _mm_mul_ps(vInA.AsM128(), packed)); vOutB.AsM128() = _mm_add_ps(vOutB.AsM128(), _mm_mul_ps(vInB.AsM128(), packed)); +#elif defined(_PS3) +#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) + // GCC 4.1.1 + __m128 temp = vec_splats(w); +#else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + __m128 temp = __m128(w); +#endif //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1 + + vOutA.AsM128() = vec_madd(vInA.AsM128(), temp, vOutA.AsM128()); + vOutB.AsM128() = vec_madd(vInB.AsM128(), temp, vOutB.AsM128()); #else __vector4 temp; diff --git a/r5dev/mathlib/vmatrix.cpp b/r5dev/mathlib/vmatrix.cpp index 7e7183f6..fdf2bccb 100644 --- a/r5dev/mathlib/vmatrix.cpp +++ b/r5dev/mathlib/vmatrix.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. 
============// // // Purpose: // @@ -6,18 +6,19 @@ // //=============================================================================// #include "core/stdafx.h" +#include "tier0/dbg.h" #if !defined(_STATIC_LINKED) || defined(_SHARED_LIB) -#include "tier0/dbg.h" -#include "tier0/basetypes.h" + #include "mathlib/vmatrix.h" #include "mathlib/mathlib.h" #include "mathlib/vector4d.h" +#include "mathlib/ssemath.h" // memdbgon must be the last include file in a .cpp file!!! //#include "tier0/memdbgon.h" -//#pragma warning (disable : 4700) // local variable 'x' used without having been initialized +#pragma warning (disable : 4700) // local variable 'x' used without having been initialized // ------------------------------------------------------------------------------------------- // // Helper functions. @@ -120,7 +121,7 @@ VMatrix SetupMatrixProjection(const Vector3D& vOrigin, const VPlane& thePlane) VMatrix SetupMatrixAxisRot(const Vector3D& vAxis, vec_t fDegrees) { - vec_t s, c, t; + vec_t s, c, t; // sin, cos, 1-cos vec_t tx, ty, tz; vec_t sx, sy, sz; vec_t fRadians; @@ -142,6 +143,43 @@ VMatrix SetupMatrixAxisRot(const Vector3D& vAxis, vec_t fDegrees) 0.0f, 0.0f, 0.0f, 1.0f); } + +// Basically takes a cross product and then does the same thing as SetupMatrixAxisRot +// above, but takes advantage of the fact that the sin angle is precomputed. +VMatrix SetupMatrixAxisToAxisRot(const Vector3D& vFromAxis, const Vector3D& vToAxis) +{ + Assert(vFromAxis.LengthSqr() == 1); // these axes + Assert(vToAxis.LengthSqr() == 1); // must be normal. 
+ + vec_t s, c, t; // sin(theta), cos(theta), 1-cos + vec_t tx, ty, tz; + vec_t sx, sy, sz; + + Vector3D vAxis = vFromAxis.Cross(vToAxis); + + s = vAxis.Length(); + c = vFromAxis.Dot(vToAxis); + t = 1.0f - c; + + if (s > 0) + { + vAxis *= 1.0 / s; + + tx = t * vAxis.x; ty = t * vAxis.y; tz = t * vAxis.z; + sx = s * vAxis.x; sy = s * vAxis.y; sz = s * vAxis.z; + + return VMatrix( + tx * vAxis.x + c, tx * vAxis.y - sz, tx * vAxis.z + sy, 0.0f, + tx * vAxis.y + sz, ty * vAxis.y + c, ty * vAxis.z - sx, 0.0f, + tx * vAxis.z - sy, ty * vAxis.z + sx, tz * vAxis.z + c, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f); + } + else + { + return SetupMatrixIdentity(); + } +} + VMatrix SetupMatrixAngles(const QAngle& vAngles) { VMatrix mRet; @@ -158,8 +196,19 @@ VMatrix SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles) #endif // VECTOR_NO_SLOW_OPERATIONS - +#if 1 bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, Vector3D& vOut) +{ + Vector3D v2Cross3 = CrossProduct(vp2.m_Normal, vp3.m_Normal); + float flDenom = DotProduct(vp1.m_Normal, v2Cross3); + if (fabs(flDenom) < FLT_EPSILON) + return false; + Vector3D vRet = vp1.m_Dist * v2Cross3 + vp2.m_Dist * CrossProduct(vp3.m_Normal, vp1.m_Normal) + vp3.m_Dist * CrossProduct(vp1.m_Normal, vp2.m_Normal); + vOut = vRet * (1.0 / flDenom); + return true; +} +#else // old slow innaccurate code +bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, Vector& vOut) { VMatrix mMat, mInverse; @@ -169,7 +218,6 @@ bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, vp3.m_Normal.x, vp3.m_Normal.y, vp3.m_Normal.z, -vp3.m_Dist, 0.0f, 0.0f, 0.0f, 1.0f ); - if (mMat.InverseGeneral(mInverse)) { //vOut = mInverse * Vector(0.0f, 0.0f, 0.0f); @@ -181,7 +229,7 @@ bool PlaneIntersection(const VPlane& vp1, const VPlane& vp2, const VPlane& vp3, return false; } } - +#endif // ------------------------------------------------------------------------------------------- // @@ 
-303,7 +351,7 @@ bool MatrixInverseGeneral(const VMatrix& src, VMatrix& dst) for (iRow = 0; iRow < 4; iRow++) { // Find the row with the largest element in this column. - fLargest = 0.00001f; + fLargest = 1e-6f; iLargest = -1; for (iTest = iRow; iTest < 4; iTest++) { @@ -506,7 +554,7 @@ bool VMatrix::IsRotationMatrix() const FloatMakePositive(v2.Dot(v3)) < 0.01f; } -static void SetupMatrixAnglesInternal(vec_t m[4][4], const QAngle& vAngles) +void VMatrix::SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles) { float sr, sp, sy, cr, cp, cy; @@ -527,11 +575,6 @@ static void SetupMatrixAnglesInternal(vec_t m[4][4], const QAngle& vAngles) m[0][3] = 0.f; m[1][3] = 0.f; m[2][3] = 0.f; -} - -void VMatrix::SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles) -{ - SetupMatrixAnglesInternal(m, vAngles); // Add translation m[0][3] = origin.x; @@ -544,21 +587,6 @@ void VMatrix::SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles } -void VMatrix::SetupMatrixAngles(const QAngle& vAngles) -{ - SetupMatrixAnglesInternal(m, vAngles); - - // Zero everything else - m[0][3] = 0.0f; - m[1][3] = 0.0f; - m[2][3] = 0.0f; - m[3][0] = 0.0f; - m[3][1] = 0.0f; - m[3][2] = 0.0f; - m[3][3] = 1.0f; -} - - //----------------------------------------------------------------------------- // Sets matrix to identity //----------------------------------------------------------------------------- @@ -745,7 +773,7 @@ void Vector4DMultiplyPosition(const VMatrix& src1, Vector3D const& src2, Vector4 { // Make sure it works if src2 == dst Vector3D tmp; - Vector3D const& v = (&src2 == &dst.AsVector3D()) ? static_cast(tmp) : src2; + Vector3D const& v = (&src2 == &dst.AsVector3D()) ? static_cast(tmp) : src2; if (&src2 == &dst.AsVector3D()) { @@ -768,7 +796,7 @@ void Vector3DMultiply(const VMatrix& src1, const Vector3D& src2, Vector3D& dst) { // Make sure it works if src2 == dst Vector3D tmp; - const Vector3D& v = (&src2 == &dst) ? 
static_cast(tmp) : src2; + const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; if (&src2 == &dst) { @@ -789,7 +817,7 @@ void Vector3DMultiplyPositionProjective(const VMatrix& src1, const Vector3D& src { // Make sure it works if src2 == dst Vector3D tmp; - const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; + const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; if (&src2 == &dst) { VectorCopy(src2, tmp); @@ -816,7 +844,7 @@ void Vector3DMultiplyProjective(const VMatrix& src1, const Vector3D& src2, Vecto { // Make sure it works if src2 == dst Vector3D tmp; - const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; + const Vector3D& v = (&src2 == &dst) ? static_cast(tmp) : src2; if (&src2 == &dst) { VectorCopy(src2, tmp); @@ -869,7 +897,7 @@ void Vector3DMultiplyTranspose(const VMatrix& src1, const Vector3D& src2, Vector bool srcEqualsDst = (&src2 == &dst); Vector3D tmp; - const Vector3D& v = srcEqualsDst ? static_cast(tmp) : src2; + const Vector3D& v = srcEqualsDst ? 
static_cast(tmp) : src2; if (srcEqualsDst) { @@ -954,7 +982,7 @@ void MatrixBuildTranslation(VMatrix& dst, const Vector3D& translation) //----------------------------------------------------------------------------- void MatrixBuildRotationAboutAxis(VMatrix& dst, const Vector3D& vAxisOfRot, float angleDegrees) { - MatrixBuildRotationAboutAxis(vAxisOfRot, angleDegrees, const_cast (dst.As3x4())); + MatrixBuildRotationAboutAxis(vAxisOfRot, angleDegrees, dst.As3x4()); dst[3][0] = 0; dst[3][1] = 0; dst[3][2] = 0; @@ -1006,6 +1034,13 @@ void MatrixBuildRotation(VMatrix& dst, const Vector3D& initialDirection, const V } MatrixBuildRotationAboutAxis(dst, axis, angle); + +#ifdef _DEBUG + Vector3D test; + Vector3DMultiply(dst, initialDirection, test); + test -= finalDirection; + Assert(test.LengthSqr() < 1e-3); +#endif } //----------------------------------------------------------------------------- @@ -1163,8 +1198,7 @@ void CalculateSphereFromProjectionMatrix(const VMatrix& worldToVolume, Vector3D* } -static inline void FrustumPlanesFromMatrixHelper(const VMatrix& shadowToWorld, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3, - Vector3D& normal, float& dist) +static inline void FrustumPlanesFromMatrixHelper(const VMatrix& shadowToWorld, const Vector3D& p1, const Vector3D& p2, const Vector3D& p3, VPlane& plane) { Vector3D world1, world2, world3; Vector3DMultiplyPositionProjective(shadowToWorld, p1, world1); @@ -1175,41 +1209,37 @@ static inline void FrustumPlanesFromMatrixHelper(const VMatrix& shadowToWorld, c VectorSubtract(world2, world1, v1); VectorSubtract(world3, world1, v2); - CrossProduct(v1, v2, normal); - VectorNormalize(normal); - dist = DotProduct(normal, world1); + CrossProduct(v1, v2, plane.m_Normal); + VectorNormalize(plane.m_Normal); + plane.m_Dist = DotProduct(plane.m_Normal, world1); } void FrustumPlanesFromMatrix(const VMatrix& clipToWorld, Frustum_t& frustum) { - Vector3D normal; - float dist; + VPlane planes[6]; 
FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(0.0f, 0.0f, 0.0f), Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 0.0f), normal, dist); - frustum.SetPlane(FRUSTUM_NEARZ, PLANE_ANYZ, normal, dist); + Vector3D(0.0f, 0.0f, 0.0f), Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 0.0f), planes[FRUSTUM_NEARZ]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(0.0f, 0.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_FARZ, PLANE_ANYZ, normal, dist); + Vector3D(0.0f, 0.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), planes[FRUSTUM_FARZ]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(1.0f, 0.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(1.0f, 1.0f, 0.0f), normal, dist); - frustum.SetPlane(FRUSTUM_RIGHT, PLANE_ANYZ, normal, dist); + Vector3D(1.0f, 0.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(1.0f, 1.0f, 0.0f), planes[FRUSTUM_RIGHT]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(0.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(0.0f, 0.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_LEFT, PLANE_ANYZ, normal, dist); + Vector3D(0.0f, 0.0f, 0.0f), Vector3D(0.0f, 1.0f, 1.0f), Vector3D(0.0f, 0.0f, 1.0f), planes[FRUSTUM_LEFT]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(1.0f, 1.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_TOP, PLANE_ANYZ, normal, dist); + Vector3D(1.0f, 1.0f, 0.0f), Vector3D(1.0f, 1.0f, 1.0f), Vector3D(0.0f, 1.0f, 1.0f), planes[FRUSTUM_TOP]); FrustumPlanesFromMatrixHelper(clipToWorld, - Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 0.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), normal, dist); - frustum.SetPlane(FRUSTUM_BOTTOM, PLANE_ANYZ, normal, dist); + Vector3D(1.0f, 0.0f, 0.0f), Vector3D(0.0f, 0.0f, 1.0f), Vector3D(1.0f, 0.0f, 1.0f), planes[FRUSTUM_BOTTOM]); + + frustum.SetPlanes(planes); } +// BEWARE: top/bottom are FLIPPED relative to D3DXMatrixOrthoOffCenterRH(). 
void MatrixBuildOrtho(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar) { // FIXME: This is being used incorrectly! Should read: @@ -1243,29 +1273,19 @@ void MatrixBuildOrtho(VMatrix& dst, double left, double top, double right, doubl 0.0f, 0.0f, 0.0f, 1.0f); } -void MatrixBuildPerspectiveZRange(VMatrix& dst, double flZNear, double flZFar) -{ - dst.m[2][0] = 0.0f; - dst.m[2][1] = 0.0f; - dst.m[2][2] = flZFar / (flZNear - flZFar); - dst.m[2][3] = flZNear * flZFar / (flZNear - flZFar); -} - void MatrixBuildPerspectiveX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar) { - float flWidthScale = 1.0f / tanf(flFovX * M_PI / 360.0f); - float flHeightScale = flAspect * flWidthScale; - dst.Init(flWidthScale, 0.0f, 0.0f, 0.0f, - 0.0f, flHeightScale, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, + float flWidth = 2.0f * flZNear * tanf(flFovX * M_PI / 360.0f); + float flHeight = flWidth / flAspect; + dst.Init(2.0f * flZNear / flWidth, 0.0f, 0.0f, 0.0f, + 0.0f, 2.0f * flZNear / flHeight, 0.0f, 0.0f, + 0.0f, 0.0f, flZFar / (flZNear - flZFar), flZNear * flZFar / (flZNear - flZFar), 0.0f, 0.0f, -1.0f, 0.0f); - - MatrixBuildPerspectiveZRange(dst, flZNear, flZFar); } void MatrixBuildPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right) { - float flWidth = tanf(flFovX * M_PI / 360.0f); + float flWidth = 2.0f * flZNear * tanf(flFovX * M_PI / 360.0f); float flHeight = flWidth / flAspect; // bottom, top, left, right are 0..1 so convert to -/2../2 @@ -1274,12 +1294,58 @@ void MatrixBuildPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAspe float flBottom = -(flHeight / 2.0f) * (1.0f - bottom) + bottom * (flHeight / 2.0f); float flTop = -(flHeight / 2.0f) * (1.0f - top) + top * (flHeight / 2.0f); - dst.Init(1.0f / (flRight - flLeft), 0.0f, (flLeft + flRight) / (flRight - flLeft), 0.0f, - 0.0f, 1.0f / (flTop - 
flBottom), (flTop + flBottom) / (flTop - flBottom), 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, + dst.Init((2.0f * flZNear) / (flRight - flLeft), 0.0f, (flLeft + flRight) / (flRight - flLeft), 0.0f, + 0.0f, 2.0f * flZNear / (flTop - flBottom), (flTop + flBottom) / (flTop - flBottom), 0.0f, + 0.0f, 0.0f, flZFar / (flZNear - flZFar), flZNear * flZFar / (flZNear - flZFar), 0.0f, 0.0f, -1.0f, 0.0f); - - MatrixBuildPerspectiveZRange(dst, flZNear, flZFar); } -#endif // !_STATIC_LINKED || _SHARED_LIB +void ExtractClipPlanesFromNonTransposedMatrix(const VMatrix& viewProjMatrix, VPlane* pPlanesOut, bool bD3DClippingRange) +{ + // Left + Vector4D vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 0) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_LEFT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Right + vPlane = -MatrixGetRowAsVector4D(viewProjMatrix, 0) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_RIGHT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Bottom + vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 1) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_BOTTOM].Init(vPlane.AsVector3D(), -vPlane.w); + + // Top + vPlane = -MatrixGetRowAsVector4D(viewProjMatrix, 1) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_TOP].Init(vPlane.AsVector3D(), -vPlane.w); + + // Near + if (bD3DClippingRange) + { + // [0,1] Z clipping range (D3D-style) + vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 2); + } + else + { + // [-1,1] Z clipping range (OpenGL-style) + vPlane = MatrixGetRowAsVector4D(viewProjMatrix, 2) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + } + + pPlanesOut[FRUSTUM_NEARZ].Init(vPlane.AsVector3D(), -vPlane.w); + + // Far + vPlane = -MatrixGetRowAsVector4D(viewProjMatrix, 2) + MatrixGetRowAsVector4D(viewProjMatrix, 3); + pPlanesOut[FRUSTUM_FARZ].Init(vPlane.AsVector3D(), -vPlane.w); + + for (uint i = 0; i < FRUSTUM_NUMPLANES; ++i) + { + float flLen2 = pPlanesOut[i].m_Normal.x * pPlanesOut[i].m_Normal.x + 
pPlanesOut[i].m_Normal.y * pPlanesOut[i].m_Normal.y + pPlanesOut[i].m_Normal.z * pPlanesOut[i].m_Normal.z; + if (flLen2 != 0.0f) + { + float flScale = 1.0f / sqrt(flLen2); + pPlanesOut[i].m_Normal *= flScale; + pPlanesOut[i].m_Dist *= flScale; + } + } +} + +#endif // !_STATIC_LINKED || _SHARED_LIB diff --git a/r5dev/mathlib/vmatrix.h b/r5dev/mathlib/vmatrix.h index a1520499..02771bef 100644 --- a/r5dev/mathlib/vmatrix.h +++ b/r5dev/mathlib/vmatrix.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -54,10 +54,9 @@ public: // Creates a matrix where the X axis = forward // the Y axis = left, and the Z axis = up VMatrix(const Vector3D& forward, const Vector3D& left, const Vector3D& up); - VMatrix(const Vector3D& forward, const Vector3D& left, const Vector3D& up, const Vector3D& translation); // Construct from a 3x4 matrix - VMatrix(const matrix3x4_t& matrix3x4); + explicit VMatrix(const matrix3x4_t& matrix3x4); // Set the values in the matrix. void Init( @@ -107,6 +106,7 @@ public: void PreTranslate(const Vector3D& vTrans); void PostTranslate(const Vector3D& vTrans); + matrix3x4_t& As3x4(); const matrix3x4_t& As3x4() const; void CopyFrom3x4(const matrix3x4_t& m3x4); void Set3x4(matrix3x4_t& matrix3x4) const; @@ -199,9 +199,6 @@ public: // Setup a matrix for origin and angles. void SetupMatrixOrgAngles(const Vector3D& origin, const QAngle& vAngles); - // Setup a matrix for angles and no translation. - void SetupMatrixAngles(const QAngle& vAngles); - // General inverse. This may fail so check the return! bool InverseGeneral(VMatrix& vInverse) const; @@ -217,7 +214,7 @@ public: VMatrix InverseTR() const; // Get the scale of the matrix's basis vectors. - Vector3D GetScale() const; + Vector3D GetScale() const; // (Fast) multiply by a scaling matrix setup from vScale. 
VMatrix Scale(const Vector3D& vScale); @@ -263,6 +260,9 @@ VMatrix SetupMatrixProjection(const Vector3D& vOrigin, const VPlane& thePlane); // Setup a matrix to rotate the specified amount around the specified axis. VMatrix SetupMatrixAxisRot(const Vector3D& vAxis, vec_t fDegrees); +// Setup a matrix to rotate one axis onto another. Input vectors must be normalized. +VMatrix SetupMatrixAxisToAxisRot(const Vector3D& vFromAxis, const Vector3D& vToAxis); + // Setup a matrix from euler angles. Just sets identity and calls MatrixAngles. VMatrix SetupMatrixAngles(const QAngle& vAngles); @@ -460,16 +460,6 @@ inline VMatrix::VMatrix(const Vector3D& xAxis, const Vector3D& yAxis, const Vect ); } -inline VMatrix::VMatrix(const Vector3D& xAxis, const Vector3D& yAxis, const Vector3D& zAxis, const Vector3D& translation) -{ - Init( - xAxis.x, yAxis.x, zAxis.x, translation.x, - xAxis.y, yAxis.y, zAxis.y, translation.y, - xAxis.z, yAxis.z, zAxis.z, translation.z, - 0.0f, 0.0f, 0.0f, 1.0f - ); -} - inline void VMatrix::Init( vec_t m00, vec_t m01, vec_t m02, vec_t m03, @@ -629,6 +619,11 @@ inline const matrix3x4_t& VMatrix::As3x4() const return *((const matrix3x4_t*)this); } +inline matrix3x4_t& VMatrix::As3x4() +{ + return *((matrix3x4_t*)this); +} + inline void VMatrix::CopyFrom3x4(const matrix3x4_t& m3x4) { memcpy(m, m3x4.Base(), sizeof(matrix3x4_t)); @@ -691,7 +686,7 @@ inline VMatrix VMatrix::operator-() const VMatrix ret; for (int i = 0; i < 16; i++) { - ((float*)ret.m)[i] = ((float*)m)[i]; + ((float*)ret.m)[i] = -((float*)m)[i]; } return ret; } @@ -908,9 +903,9 @@ inline bool MatricesAreEqual(const VMatrix& src1, const VMatrix& src2, float flT // //----------------------------------------------------------------------------- void MatrixBuildOrtho(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar); +void MatrixBuildOrthoLH(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar); void 
MatrixBuildPerspectiveX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar); void MatrixBuildPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right); -void MatrixBuildPerspectiveZRange(VMatrix& dst, double flZNear, double flZFar); inline void MatrixOrtho(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar) { @@ -922,6 +917,16 @@ inline void MatrixOrtho(VMatrix& dst, double left, double top, double right, dou dst = temp; } +inline void MatrixBuildOrthoLH(VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar) +{ + // Same as XMMatrixOrthographicOffCenterLH(). + dst.Init( + 2.0f / (right - left), 0.0f, 0.0f, (left + right) / (left - right), + 0.0f, 2.0f / (bottom - top), 0.0f, (bottom + top) / (top - bottom), + 0.0f, 0.0f, 1.0f / (zFar - zNear), zNear / (zNear - zFar), + 0.0f, 0.0f, 0.0f, 1.0f); +} + inline void MatrixPerspectiveX(VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar) { VMatrix mat; @@ -942,6 +947,61 @@ inline void MatrixPerspectiveOffCenterX(VMatrix& dst, double flFovX, double flAs dst = temp; } +inline Vector4D GetMatrixColumnAsVector4D(const VMatrix& mMatrix, int nCol) +{ + Vector4D vColumnOut; + vColumnOut.x = mMatrix.m[0][nCol]; + vColumnOut.y = mMatrix.m[1][nCol]; + vColumnOut.z = mMatrix.m[2][nCol]; + vColumnOut.w = mMatrix.m[3][nCol]; + return vColumnOut; +} + +inline Vector4D MatrixGetRowAsVector4D(const VMatrix& src, int nRow) +{ + Assert((nRow >= 0) && (nRow <= 3)); + return Vector4D(src[nRow]); +} + +//----------------------------------------------------------------------------- +// Extracts clip planes from an arbitrary view projection matrix. +// This function assumes the matrix has been transposed. 
+//----------------------------------------------------------------------------- +inline void ExtractClipPlanesFromTransposedMatrix(const VMatrix& transposedViewProjMatrix, VPlane* pPlanesOut) +{ + // Left + Vector4D vPlane = GetMatrixColumnAsVector4D(transposedViewProjMatrix, 0) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_LEFT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Right + vPlane = -GetMatrixColumnAsVector4D(transposedViewProjMatrix, 0) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_RIGHT].Init(vPlane.AsVector3D(), -vPlane.w); + + // Bottom + vPlane = GetMatrixColumnAsVector4D(transposedViewProjMatrix, 1) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_BOTTOM].Init(vPlane.AsVector3D(), -vPlane.w); + + // Top + vPlane = -GetMatrixColumnAsVector4D(transposedViewProjMatrix, 1) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_TOP].Init(vPlane.AsVector3D(), -vPlane.w); + + // Near + vPlane = GetMatrixColumnAsVector4D(transposedViewProjMatrix, 2) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_NEARZ].Init(vPlane.AsVector3D(), -vPlane.w); + + // Far + vPlane = -GetMatrixColumnAsVector4D(transposedViewProjMatrix, 2) + GetMatrixColumnAsVector4D(transposedViewProjMatrix, 3); + pPlanesOut[FRUSTUM_FARZ].Init(vPlane.AsVector3D(), -vPlane.w); +} + +//----------------------------------------------------------------------------- +// Extracts clip planes from an arbitrary view projection matrix. +// Differences from ExtractClipPlanesFromTransposedMatrix(): +// This function assumes the matrix has NOT been transposed. +// If bD3DClippingRange is true, the projection space clipping range is assumed +// to be [0,1], vs. the OpenGL range [-1,1]. +// This function always returns normalized planes. 
+//----------------------------------------------------------------------------- +void ExtractClipPlanesFromNonTransposedMatrix(const VMatrix& viewProjMatrix, VPlane* pPlanesOut, bool bD3DClippingRange = true); + #endif - - diff --git a/r5dev/mathlib/vplane.h b/r5dev/mathlib/vplane.h index a9dfe040..48f52a0f 100644 --- a/r5dev/mathlib/vplane.h +++ b/r5dev/mathlib/vplane.h @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // @@ -25,7 +25,6 @@ typedef int SideType; #define VP_EPSILON 0.01f - class VPlane { public: @@ -63,7 +62,7 @@ public: #endif public: - Vector3D m_Normal; + Vector3D m_Normal; vec_t m_Dist; #ifdef VECTOR_NO_SLOW_OPERATIONS @@ -176,7 +175,4 @@ inline SideType VPlane::BoxOnPlaneSide(const Vector3D& vMin, const Vector3D& vMa return firstSide; } - - - #endif // VPLANE_H diff --git a/r5dev/tier0/basetypes.h b/r5dev/tier0/basetypes.h index 0a0b1d38..491b5fe6 100644 --- a/r5dev/tier0/basetypes.h +++ b/r5dev/tier0/basetypes.h @@ -154,6 +154,78 @@ #define MAX( a, b ) ( ( ( a ) > ( b ) ) ? ( a ) : ( b ) ) #endif +#ifdef __cplusplus + +template< class T, class Y, class X > +inline T clamp(T const& val, Y const& minVal, X const& maxVal) +{ + if (val < minVal) + return minVal; + else if (val > maxVal) + return maxVal; + else + return val; +} + +// This is the preferred clamp operator. Using the clamp macro can lead to +// unexpected side-effects or more expensive code. Even the clamp (all +// lower-case) function can generate more expensive code because of the +// mixed types involved. +template< class T > +T Clamp(T const& val, T const& minVal, T const& maxVal) +{ + if (val < minVal) + return minVal; + else if (val > maxVal) + return maxVal; + else + return val; +} + +// This is the preferred Min operator. Using the MIN macro can lead to unexpected +// side-effects or more expensive code. 
+template< class T > +T Min(T const& val1, T const& val2) +{ + return val1 < val2 ? val1 : val2; +} + +// This is the preferred Max operator. Using the MAX macro can lead to unexpected +// side-effects or more expensive code. +template< class T > +T Max(T const& val1, T const& val2) +{ + return val1 > val2 ? val1 : val2; +} + +template +void Swap(T& a, T& b) +{ + T temp = a; + a = b; + b = temp; +} + +#else + +#define clamp(val, min, max) (((val) > (max)) ? (max) : (((val) < (min)) ? (min) : (val))) + +#endif + +#define fsel(c,x,y) ( (c) >= 0 ? (x) : (y) ) + +// integer conditional move +// if a >= 0, return x, else y +#define isel(a,x,y) ( ((a) >= 0) ? (x) : (y) ) + +// if x = y, return a, else b +#define ieqsel(x,y,a,b) (( (x) == (y) ) ? (a) : (b)) + +// if the nth bit of a is set (counting with 0 = LSB), +// return x, else y +// this is fast if nbit is a compile-time immediate +#define ibitsel(a, nbit, x, y) ( ( ((a) & (1 << (nbit))) != 0 ) ? (x) : (y) ) + // MSVC CRT uses 0x7fff while gcc uses MAX_INT, leading to mismatches between platforms // As a result, we pick the least common denominator here. 
This should be used anywhere // you might typically want to use RAND_MAX diff --git a/r5dev/tier0/dbg.cpp b/r5dev/tier0/dbg.cpp index 115dc451..a986d734 100644 --- a/r5dev/tier0/dbg.cpp +++ b/r5dev/tier0/dbg.cpp @@ -8,9 +8,10 @@ #include "core/stdafx.h" #include "core/logdef.h" +#include "tier0/dbg.h" #include "tier0/platform.h" #include "tier0/threadtools.h" -#include "tier0/dbg.h" +#include #ifndef DEDICATED #include "vgui/vgui_debugpanel.h" #include "gameui/IConsole.h" @@ -28,7 +29,12 @@ std::mutex s_LogMutex; //----------------------------------------------------------------------------- bool HushAsserts() { +#ifdef DBGFLAG_ASSERT + static bool s_bHushAsserts = !!CommandLine()->FindParm("-hushasserts"); + return s_bHushAsserts; +#else return true; +#endif } //----------------------------------------------------------------------------- diff --git a/r5dev/tier0/dbg.h b/r5dev/tier0/dbg.h index 11ede439..d1d202fb 100644 --- a/r5dev/tier0/dbg.h +++ b/r5dev/tier0/dbg.h @@ -8,6 +8,7 @@ #ifndef DBG_H #define DBG_H #define Assert assert +#define AssertDbg assert #include "tier0/dbgflag.h" bool HushAsserts(); diff --git a/r5dev/tier0/platform.h b/r5dev/tier0/platform.h index e8449be3..0b6d9730 100644 --- a/r5dev/tier0/platform.h +++ b/r5dev/tier0/platform.h @@ -141,6 +141,12 @@ #define IS_WINDOWS_PC 1 #endif +#if _MSC_VER >= 1800 +#define VECTORCALL __vectorcall +#else +#define VECTORCALL +#endif + #endif // CROSS_PLATFORM_VERSION < 2 #if defined( GNUC ) && !defined( COMPILER_PS3 ) // use pre-align on PS3 @@ -282,6 +288,8 @@ inline int64 CastPtrToInt64(const void* p) #endif +#define NO_MALLOC_OVERRIDE + //----------------------------------------------------------------------------- // Various compiler-specific keywords //----------------------------------------------------------------------------- diff --git a/r5dev/tier0/threadtools.cpp b/r5dev/tier0/threadtools.cpp new file mode 100644 index 00000000..3104304f --- /dev/null +++ b/r5dev/tier0/threadtools.cpp @@ -0,0 
+1,31 @@ +//===== Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ======// +// +// Purpose: Thread tools +// +// $Workfile: $ +// $NoKeywords: $ +//===========================================================================// + +#include "core/stdafx.h" +#include "threadtools.h" + +LONG ThreadInterlockedCompareExchange64(LONG volatile* pDest, int64 value, int64 comperand) +{ + return _InterlockedCompareExchange(pDest, value, comperand); +} + +bool ThreadInterlockedAssignIf(LONG volatile* p, int32 value, int32 comperand) +{ + Assert((size_t)p % 4 == 0); + return (_InterlockedCompareExchange(p, value, comperand) == comperand); +} + +int64 ThreadInterlockedCompareExchange64(int64 volatile* pDest, int64 value, int64 comperand) +{ + return _InterlockedCompareExchange64(pDest, value, comperand); +} + +bool ThreadInterlockedAssignIf64(int64 volatile* pDest, int64 value, int64 comperand) +{ + return (_InterlockedCompareExchange64(pDest, value, comperand) == comperand); +} \ No newline at end of file diff --git a/r5dev/tier0/threadtools.h b/r5dev/tier0/threadtools.h index 956f503d..8189a19c 100644 --- a/r5dev/tier0/threadtools.h +++ b/r5dev/tier0/threadtools.h @@ -1,11 +1,6 @@ #ifndef THREADTOOLS_H #define THREADTOOLS_H -inline bool ThreadInterlockedAssignIf(LONG volatile* p, int32 value, int32 comperand) -{ - Assert((size_t)p % 4 == 0); - return _InterlockedCompareExchange(p, comperand, value); -} inline void ThreadSleep(unsigned nMilliseconds) { #ifdef _WIN32 @@ -38,6 +33,169 @@ inline void ThreadSleep(unsigned nMilliseconds) usleep(nMilliseconds * 1000); #endif } +inline void ThreadPause() +{ +#if defined( COMPILER_PS3 ) + __db16cyc(); +#elif defined( COMPILER_GCC ) + __asm __volatile("pause"); +#elif defined ( COMPILER_MSVC64 ) + _mm_pause(); +#elif defined( COMPILER_MSVC32 ) + __asm pause; +#elif defined( COMPILER_MSVCX360 ) + YieldProcessor(); + __asm { or r0, r0, r0 } + YieldProcessor(); + __asm { or r1, r1, r1 } +#else +#error "implement me" +#endif +} +LONG 
ThreadInterlockedCompareExchange64(LONG volatile* pDest, int64 value, int64 comperand); +bool ThreadInterlockedAssignIf(LONG volatile* p, int32 value, int32 comperand); +int64 ThreadInterlockedCompareExchange64(int64 volatile* pDest, int64 value, int64 comperand); +bool ThreadInterlockedAssignIf64(int64 volatile* pDest, int64 value, int64 comperand); + +//----------------------------------------------------------------------------- +// +// Interlock methods. These perform very fast atomic thread +// safe operations. These are especially relevant in a multi-core setting. +// +//----------------------------------------------------------------------------- + +#ifdef _WIN32 +#define NOINLINE +#elif defined( _PS3 ) +#define NOINLINE __attribute__ ((noinline)) +#elif defined(POSIX) +#define NOINLINE __attribute__ ((noinline)) +#endif + +#if defined( _X360 ) || defined( _PS3 ) +#define ThreadMemoryBarrier() __lwsync() +#elif defined(COMPILER_MSVC) +// Prevent compiler reordering across this barrier. This is +// sufficient for most purposes on x86/x64. +#define ThreadMemoryBarrier() _ReadWriteBarrier() +#elif defined(COMPILER_GCC) +// Prevent compiler reordering across this barrier. This is +// sufficient for most purposes on x86/x64. +// http://preshing.com/20120625/memory-ordering-at-compile-time +#define ThreadMemoryBarrier() asm volatile("" ::: "memory") +#else +#error Every platform needs to define ThreadMemoryBarrier to at least prevent compiler reordering +#endif + +//----------------------------------------------------------------------------- +// +// A super-fast thread-safe integer A simple class encapsulating the notion of an +// atomic integer used across threads that uses the built in and faster +// "interlocked" functionality rather than a full-blown mutex. Useful for simple +// things like reference counts, etc. 
+// +//----------------------------------------------------------------------------- + +template< typename T > +class CInterlockedIntT +{ +public: + CInterlockedIntT() : m_value(0) { static_assert((sizeof(T) == sizeof(int32)) || (sizeof(T) == sizeof(int64))); } + + CInterlockedIntT(T value) : m_value(value) {} + + T operator()(void) const { return m_value; } + operator T() const { return m_value; } + + bool operator!() const { return (m_value == 0); } + bool operator==(T rhs) const { return (m_value == rhs); } + bool operator!=(T rhs) const { return (m_value != rhs); } + + T operator++() { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedIncrement((int32*)&m_value); + else + return (T)ThreadInterlockedIncrement64((int64*)&m_value); + } + T operator++(int) { return operator++() - 1; } + + T operator--() { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedDecrement((int32*)&m_value); + else + return (T)ThreadInterlockedDecrement64((int64*)&m_value); + } + + T operator--(int) { return operator--() + 1; } + + bool AssignIf(T conditionValue, T newValue) + { + if (sizeof(T) == sizeof(int32)) + return ThreadInterlockedAssignIf((LONG*)&m_value, (int32)newValue, (int32)conditionValue); + else + return ThreadInterlockedAssignIf64((int64*)&m_value, (int64)newValue, (int64)conditionValue); + } + + + T operator=(T newValue) { + if (sizeof(T) == sizeof(int32)) + ThreadInterlockedExchange((int32*)&m_value, newValue); + else + ThreadInterlockedExchange64((int64*)&m_value, newValue); + return m_value; + } + + // Atomic add is like += except it returns the previous value as its return value + T AtomicAdd(T add) { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedExchangeAdd((int32*)&m_value, (int32)add); + else + return (T)ThreadInterlockedExchangeAdd64((int64*)&m_value, (int64)add); + } + + + void operator+=(T add) { + if (sizeof(T) == sizeof(int32)) + ThreadInterlockedExchangeAdd((int32*)&m_value, (int32)add); + else + 
ThreadInterlockedExchangeAdd64((int64*)&m_value, (int64)add); + } + + void operator-=(T subtract) { operator+=(-subtract); } + void operator*=(T multiplier) { + T original, result; + do + { + original = m_value; + result = original * multiplier; + } while (!AssignIf(original, result)); + } + void operator/=(T divisor) { + T original, result; + do + { + original = m_value; + result = original / divisor; + } while (!AssignIf(original, result)); + } + + T operator+(T rhs) const { return m_value + rhs; } + T operator-(T rhs) const { return m_value - rhs; } + + T InterlockedExchange(T newValue) { + if (sizeof(T) == sizeof(int32)) + return (T)ThreadInterlockedExchange((int32*)&m_value, newValue); + else + return (T)ThreadInterlockedExchange64((int64*)&m_value, newValue); + } + +private: + volatile T m_value; +}; + +typedef CInterlockedIntT CInterlockedInt; +typedef CInterlockedIntT CInterlockedUInt; + //============================================================================= class CThreadFastMutex; diff --git a/r5dev/vproj/clientsdk.vcxproj b/r5dev/vproj/clientsdk.vcxproj index 55990659..cd3f7e24 100644 --- a/r5dev/vproj/clientsdk.vcxproj +++ b/r5dev/vproj/clientsdk.vcxproj @@ -11,6 +11,7 @@ + @@ -70,6 +71,8 @@ + + @@ -102,6 +105,7 @@ + @@ -201,6 +205,7 @@ + @@ -210,9 +215,9 @@ - + diff --git a/r5dev/vproj/clientsdk.vcxproj.filters b/r5dev/vproj/clientsdk.vcxproj.filters index 07a3e855..9151d79a 100644 --- a/r5dev/vproj/clientsdk.vcxproj.filters +++ b/r5dev/vproj/clientsdk.vcxproj.filters @@ -211,6 +211,9 @@ {01d3645a-16c3-4910-ac95-049e112cd2b8} + + {57e1f0c7-ce4f-4576-960e-0cd15b2b5092} + @@ -546,6 +549,18 @@ sdk\tier2 + + sdk\bonesetup + + + sdk\mathlib + + + sdk\mathlib + + + sdk\tier0 + @@ -1565,9 +1580,6 @@ sdk\mathlib - - sdk\mathlib - sdk\mathlib @@ -1598,6 +1610,12 @@ sdk\tier2 + + sdk\mathlib + + + sdk\mathlib + diff --git a/r5dev/vproj/dedicated.vcxproj b/r5dev/vproj/dedicated.vcxproj index 88fa8e03..3c9e74e5 100644 --- a/r5dev/vproj/dedicated.vcxproj +++ 
b/r5dev/vproj/dedicated.vcxproj @@ -188,6 +188,7 @@ + @@ -197,9 +198,9 @@ - + @@ -448,6 +449,7 @@ + @@ -501,6 +503,8 @@ + + @@ -532,6 +536,7 @@ + diff --git a/r5dev/vproj/dedicated.vcxproj.filters b/r5dev/vproj/dedicated.vcxproj.filters index 8fa542eb..ed637cb0 100644 --- a/r5dev/vproj/dedicated.vcxproj.filters +++ b/r5dev/vproj/dedicated.vcxproj.filters @@ -187,6 +187,9 @@ {98975892-5379-4f6c-8c7e-35d92d2bc5e5} + + {d49ec580-58c2-49e7-8e83-957da576febd} + @@ -1131,9 +1134,6 @@ sdk\mathlib - - sdk\mathlib - sdk\mathlib @@ -1161,6 +1161,12 @@ sdk\vstdlib + + sdk\mathlib + + + sdk\mathlib + @@ -1448,6 +1454,18 @@ sdk\vstdlib + + sdk\bonesetup + + + sdk\mathlib + + + sdk\mathlib + + + sdk\tier0 + diff --git a/r5dev/vproj/gamesdk.vcxproj b/r5dev/vproj/gamesdk.vcxproj index 3bd4a075..bfc9a4a9 100644 --- a/r5dev/vproj/gamesdk.vcxproj +++ b/r5dev/vproj/gamesdk.vcxproj @@ -11,6 +11,7 @@ + @@ -76,6 +77,8 @@ + + @@ -109,6 +112,7 @@ + @@ -219,6 +223,8 @@ + + @@ -228,9 +234,9 @@ - + diff --git a/r5dev/vproj/gamesdk.vcxproj.filters b/r5dev/vproj/gamesdk.vcxproj.filters index 3fd46497..e2d08203 100644 --- a/r5dev/vproj/gamesdk.vcxproj.filters +++ b/r5dev/vproj/gamesdk.vcxproj.filters @@ -220,6 +220,9 @@ {b7e33427-fd37-44b1-8530-651ae5f4fde1} + + {acbd4b45-6a8d-4d9f-9747-1bc460481bb4} + @@ -576,6 +579,18 @@ sdk\tier2 + + sdk\bonesetup + + + sdk\mathlib + + + sdk\mathlib + + + sdk\tier0 + @@ -1637,9 +1652,6 @@ sdk\mathlib - - sdk\mathlib - sdk\mathlib @@ -1667,6 +1679,15 @@ sdk\tier2 + + sdk\mathlib + + + sdk\mathlib + + + sdk\mathlib + diff --git a/r5dev/vstdlib/random.h b/r5dev/vstdlib/random.h index d63d22ee..daded0d5 100644 --- a/r5dev/vstdlib/random.h +++ b/r5dev/vstdlib/random.h @@ -10,7 +10,6 @@ #define VSTDLIB_RANDOM_H #include "tier0/basetypes.h" -#include "tier0/threadtools.h" #define NTAB 32