//===== Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: 
//
//===========================================================================//

#if defined(__SPU__)
#include "tier0/platform.h"
#include "tier0/basetypes.h"
#include "mathlib/mathlib.h"
#include "mathlib/math_pfns.h"
// #include "mathlib/fltx4.h"
//#include "ps3/spu_job_shared.h"
#endif

#include "mathlib/ssemath.h"
#include "mathlib/ssequaternion.h"
//#include "mathlib/compressed_vector.h"

// NOTE: This has to be the last file included!
#include "tier0/memdbgon.h"

const fltx4 g_SIMD_Identity[4] =
{
	{ 1.0, 0, 0, 0 }, { 0, 1.0, 0, 0 }, { 0, 0, 1.0, 0 }, { 0, 0, 0, 1.0 }
};

#if !defined(__SPU__)
const fltx4 Four_PointFives = { 0.5,0.5,0.5,0.5 };
#ifndef _X360
const fltx4 Four_Zeros = { 0.0,0.0,0.0,0.0 };
const fltx4 Four_Ones = { 1.0,1.0,1.0,1.0 };
#endif
const fltx4 Four_Twos = { 2.0,2.0,2.0,2.0 };
const fltx4 Four_Threes = { 3.0,3.0,3.0,3.0 };
const fltx4 Four_Fours = { 4.0,4.0,4.0,4.0 };
const fltx4 Four_Origin = { 0,0,0,1 };
const fltx4 Four_NegativeOnes = { -1,-1,-1,-1 };

const fltx4 Four_2ToThe21s = { (float)(1 << 21), (float)(1 << 21), (float)(1 << 21), (float)(1 << 21) };
const fltx4 Four_2ToThe22s = { (float)(1 << 22), (float)(1 << 22), (float)(1 << 22), (float)(1 << 22) };
const fltx4 Four_2ToThe23s = { (float)(1 << 23), (float)(1 << 23), (float)(1 << 23), (float)(1 << 23) };
const fltx4 Four_2ToThe24s = { (float)(1 << 24), (float)(1 << 24), (float)(1 << 24), (float)(1 << 24) };
const fltx4 Four_Thirds = { 0.33333333f, 0.33333333f, 0.33333333f, 0.33333333f };
const fltx4 Four_TwoThirds = { 0.66666666f, 0.66666666f, 0.66666666f, 0.66666666f };
const fltx4 Four_Point225s = { .225f, .225f, .225f, .225f };
const fltx4 Four_Epsilons = { FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON };
const fltx4 Four_DegToRad = { ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)) };

const fltx4 Four_FLT_MAX = { FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX };
const fltx4 Four_Negative_FLT_MAX = { -FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX };
const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. };

const fltx4 Four_LinearToGammaCoefficients_A = { -3.7295f, -3.7295f, -3.7295f, -3.7295f };
const fltx4 Four_LinearToGammaCoefficients_B = { 8.9635f,  8.9635f,  8.9635f,  8.9635f };
const fltx4 Four_LinearToGammaCoefficients_C = { -7.7397f,  -7.7397f,  -7.7397f,  -7.7397f };
const fltx4 Four_LinearToGammaCoefficients_D = { 3.443f, 3.443f, 3.443f, 3.443f };
const fltx4 Four_LinearToGammaCoefficients_E = { 0.048f, 0.048f, 0.048f, 0.048f };

const fltx4 Four_GammaToLinearCoefficients_A = { .1731f, .1731f, .1731f, .1731f };
const fltx4 Four_GammaToLinearCoefficients_B = { .8717f, .8717f, .8717f, .8717f };
const fltx4 Four_GammaToLinearCoefficients_C = { -.0452f, -.0452f, -.0452f, -.0452f };
const fltx4 Four_GammaToLinearCoefficients_D = { .0012f, .0012f, .0012f, .0012f };

const fltx4 g_QuatMultRowSign[4] =
{
	{  1.0f,  1.0f, -1.0f, 1.0f },
	{ -1.0f,  1.0f,  1.0f, 1.0f },
	{  1.0f, -1.0f,  1.0f, 1.0f },
	{ -1.0f, -1.0f, -1.0f, 1.0f }
};
#endif


const uint32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = { 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff };
const uint32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
const uint32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe };
const uint32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
const uint32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0
const uint32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4


const uint32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST =
{
	{ 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF }
};

const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST =
{
	{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
	{ 0xffffffff, 0x00000000, 0x00000000, 0x00000000 },
	{ 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
	{ 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 },
};

const uint32 ALIGN16 g_SIMD_EveryOtherMask[4] ALIGN16_POST = { 0, 0xffffffff, 0, 0xffffffff }; // 0,~0,0,~0



#ifdef PLATFORM_PPC

/// Passed as a parameter to vslh, shifts the z component of a quat48 stored in the zw words left by one bit.
const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[] = {
	0x00, 0x00,												// x word
	0x00, 0x00,												// y word
	0x00, 0x01,												// z word 
	0x00, 0x00 };											// w word 

// This permutes the uint16 x,y,z components packed into the most significant four halfwords of a fltx4
// so that each gets its own word in the output. Expected use is __vperm( XX, Four_Threes, permute )
// -- that way each int is represented as 3.0 + n * 2^-22, which we can pull into the
// appropriate range with a single madd!
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16] =
{
	16, 17, 0, 1,											// word one:   00XX
	16, 17, 2, 3,											// word two:   00YY
	16, 17, 4, 5,											// word three: 00ZZ
	16, 17, 6, 7											// word four:  00WW
};

// the other permutes are a little trickier. note: I'm defining them out of order.
// 2 and 5 blend together prior results, rather than a source with 3.0f

// out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16] =
{
	16, 17, 6, 7,											// word one:   00XX
	16, 17, 8, 9,											// word two:   00YY
	16, 17, 10, 11,											// word three: 00ZZ
	16, 17, 12, 13											// word four:  00WW
};

// out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2  // z2 is important, goes into out2
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16] =
{
	16, 17, 2, 3,
	16, 17, 4, 5,
	16, 17, 6, 7,
	16, 17, 0, 1
};

// out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5  // x5 is important, goes into out5
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16] =
{
	16, 17, 8, 9,
	16, 17, 10, 11,
	16, 17, 12, 13,
	16, 17, 14, 15
};

// out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16] =
{
	16, 17, 4, 5,    // word one
	16, 17, 6, 7,  // word two
	16, 17, 8, 9,  // word three
	16, 17, 10, 11   // word four  (garbage)
};

// out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16] =
{
	16, 17, 10, 11,    // word one
	16, 17, 12, 13,  // word two
	16, 17, 14, 15,  // word three
	16, 17, 16, 17   // word four  (garbage)
};

// these last two are tricky because we mix old output with source input. we get the 3.0f
// from the old output.
// out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute2)  ); // __x2__y2__z2____
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16] =
{
	16, 17, 12, 13,  // 3.x2   
	16, 17, 14, 15,  // 3.y2
	16, 17, 30, 31,  // 3.z2 (from out3)
	16, 17, 16, 17
};

// out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute5)  ) // __x5__y5__z5____
const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16] =
{
	16, 17, 30, 31,  // 3.x5 (from out4)
	16, 17,  0,  1,  // 3.y5
	16, 17,  2,  3,  // 3.z5 
	16, 17, 16, 17   // garbage   
};


// magic constants that we use to convert the unpacked q48 components from 3 + n * 2^-22 (where n = 0 .. 65535)
// to -1.0 .. 1.0
#define UnpackMul16s ( (1 << 22) / 32767.5 )
#define UnpackAdd16s ( ( -UnpackMul16s * 3.0 ) - 1 )
// we put the constants all into one word to save a little memory bandwidth
// but otherwise it would look like this:
// static const fltx4 vUpkMul = { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
// static const fltx4 vUpkAdd = { UnpackAdd16s , UnpackAdd16s , UnpackAdd16s , UnpackAdd16s  };
const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants = { UnpackMul16s , UnpackAdd16s, 0, 0 };
#undef UnpackMul16s
#undef UnpackAdd16s
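
// (Worked example of the unpack math, using the constants above: each component arrives as
//  value = 3 + n * 2^-22 with n in 0..65535, and
//      value * UnpackMul16s + UnpackAdd16s = (n * 2^-22) * (2^22 / 32767.5) - 1 = n / 32767.5 - 1,
//  so n = 0 maps to -1.0, n = 32768 maps to ~0, and n = 65535 maps to +1.0.)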

#endif


// FUNCTIONS
// NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE
// Generally speaking, you want to make sure SIMD math functions
// are inlined, because that gives the compiler much more latitude
// in instruction scheduling. It's not that the overhead of calling
// the function is particularly great; rather, many of the SIMD 
// opcodes have long latencies, and if you have a sequence of 
// several dependent ones inside a function call, the latencies 
// stack up to create a big penalty. If the function is inlined,
// the compiler can interleave its operations with ones from the
// caller to better hide those latencies. Finally, on the 360,
// putting parameters or return values on the stack, and then 
// reading them back within the next forty cycles, is a very 
// severe penalty. So, as much as possible, you want to leave your
// data on the registers.

// That said, there are certain occasions where it is appropriate
// to call into functions -- particularly for very large blocks
// of code that will spill most of the registers anyway. Unless your
// function is more than one screen long, yours is probably not one
// of those occasions.

#if !defined(__SPU__)

/// You can use this to rotate a long array of FourVectors all by the same
/// matrix. The first parameter is the head of the array. The second is the
/// number of vectors to rotate. The third is the matrix.
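// (Per lane, this is the scalar math
//     out.x = m[0][0]*x + m[0][1]*y + m[0][2]*z
//     out.y = m[1][0]*x + m[1][1]*y + m[1][2]*z
//     out.z = m[2][0]*x + m[2][1]*y + m[2][2]*z
//  -- rotation only; the translation column m[i][3] is not applied.)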
void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix)
{
	Assert(numVectors > 0);
	if (numVectors == 0)
		return;

	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in 
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02,
		matSplat10, matSplat11, matSplat12,
		matSplat20, matSplat21, matSplat22;

	{
		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
		// often unaligned. The w components will be the transpose row of
		// the matrix, but we don't really care about that.
		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

		matSplat00 = SplatXSIMD(matCol0);
		matSplat01 = SplatYSIMD(matCol0);
		matSplat02 = SplatZSIMD(matCol0);

		matSplat10 = SplatXSIMD(matCol1);
		matSplat11 = SplatYSIMD(matCol1);
		matSplat12 = SplatZSIMD(matCol1);

		matSplat20 = SplatXSIMD(matCol2);
		matSplat21 = SplatYSIMD(matCol2);
		matSplat22 = SplatZSIMD(matCol2);
	}

#if defined(_X360) || defined(_PS3)
	// Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
	// and simplify prefetching. Named variables are deliberately used instead of arrays to
	// ensure that the variables live on the registers instead of the stack (stack load/store
	// is a serious penalty on 360).  Nb: for prefetching to be most efficient here, the
	// loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is 
	// 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line. 
	// That way you can fetch the next 3 cache lines while you work on these three. 
	// If you do go this route, be sure to disassemble and make sure it doesn't spill 
	// registers to stack as you do this; the cost of that will be excessive. Unroll the loop
	// a little and just live with the fact that you'll be doing a couple of redundant dcbts
	// (they don't cost you anything). Be aware that all three cores share L2 and it can only
	// have eight cache lines fetching at a time.
	fltx4 outX0, outY0, outZ0; // bank one of outputs
	fltx4 outX1, outY1, outZ1; // bank two of outputs


	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
	// rather than madds. (Empirically determined by timing.)
	const FourVectors* stop = pVectors + numVectors;
	FourVectors* RESTRICT pVectNext;
	// prime the pump.
	if (numVectors & 0x01)
	{
		// odd number of vectors to process
		// prime the 1 group of registers
		pVectNext = pVectors++;
		outX1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01)), MulSIMD(pVectNext->z, matSplat02));
		outY1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11)), MulSIMD(pVectNext->z, matSplat12));
		outZ1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21)), MulSIMD(pVectNext->z, matSplat22));
	}
	else
	{
		// even number of total vectors to process; 
		// prime the zero group and jump into the middle of the loop
		outX0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)), MulSIMD(pVectors->z, matSplat02));
		outY0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)), MulSIMD(pVectors->z, matSplat12));
		outZ0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)), MulSIMD(pVectors->z, matSplat22));
		goto EVEN_CASE;
	}

	// perform an even number of iterations through this loop.
	while (pVectors < stop)
	{
		outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)));
		outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)));
		outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)));

		pVectNext->x = outX1;
		pVectNext->y = outY1;
		pVectNext->z = outZ1;

	EVEN_CASE:
		pVectNext = pVectors + 1;

		outX1 = MaddSIMD(pVectNext->z, matSplat02, AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01)));
		outY1 = MaddSIMD(pVectNext->z, matSplat12, AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11)));
		outZ1 = MaddSIMD(pVectNext->z, matSplat22, AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21)));

		pVectors->x = outX0;
		pVectors->y = outY0;
		pVectors->z = outZ0;

		pVectors += 2;
	}

	// flush the last round of output
	pVectNext->x = outX1;
	pVectNext->y = outY1;
	pVectNext->z = outZ1;
#else
	// PC does not benefit from the unroll/scheduling above
	fltx4 outX0, outY0, outZ0; // bank one of outputs


	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
	// rather than madds. (Empirically determined by timing.)
	const FourVectors* stop = pVectors + numVectors;

	// perform an even number of iterations through this loop.
	while (pVectors < stop)
	{
		outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)));
		outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)));
		outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)));

		pVectors->x = outX0;
		pVectors->y = outY0;
		pVectors->z = outZ0;
		pVectors++;
	}
#endif
}

// Get the closest point from P to the (infinite) line through vLineA and vLineB and
// calculate the shortest distance from P to the line.
// If you pass in a value for t, it will tell you the t for (A + (B-A)t) to get the closest point.
// If the closest point lies on the segment between A and B, then 0 <= t <= 1.
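// (Derivation sketch: with D = B - A, the closest point on the infinite line is A + t*D where
//  t = dot(P - A, D) / dot(D, D); the code below evaluates this per lane as
//  ( dot(D, P) - dot(D, A) ) / dot(D, D).)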
void FourVectors::CalcClosestPointOnLineSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vClosest, fltx4* outT)
{
	FourVectors vDir;
	fltx4 t = CalcClosestPointToLineTSIMD(P, vLineA, vLineB, vDir);
	if (outT) *outT = t;
	vClosest = vDir;
	vClosest *= t;
	vClosest += vLineA;
}

fltx4 FourVectors::CalcClosestPointToLineTSIMD(const FourVectors& P, const FourVectors& vLineA, const FourVectors& vLineB, FourVectors& vDir)
{
	Assert(s_bMathlibInitialized);
	vDir = vLineB;
	vDir -= vLineA;

	fltx4 div = vDir * vDir;
	bi32x4 Mask;
	fltx4 Compare = ReplicateX4(0.00001f);
	fltx4 result;
	Mask = CmpLtSIMD(div, Compare);

	result = DivSIMD(SubSIMD(vDir * P, vDir * vLineA), div);

	result = MaskedAssign(Mask, Four_Zeros, result);	// zero t where the line direction is degenerate
	return result;
}

void FourVectors::RotateManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut)
{
	Assert(numVectors > 0);
	if (numVectors == 0)
		return;

	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in 
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02,
		matSplat10, matSplat11, matSplat12,
		matSplat20, matSplat21, matSplat22;

	{
		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
		// often unaligned. The w components will be the tranpose row of
		// the matrix, but we don't really care about that.
		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

		matSplat00 = SplatXSIMD(matCol0);
		matSplat01 = SplatYSIMD(matCol0);
		matSplat02 = SplatZSIMD(matCol0);

		matSplat10 = SplatXSIMD(matCol1);
		matSplat11 = SplatYSIMD(matCol1);
		matSplat12 = SplatZSIMD(matCol1);

		matSplat20 = SplatXSIMD(matCol2);
		matSplat21 = SplatYSIMD(matCol2);
		matSplat22 = SplatZSIMD(matCol2);
	}

#if  defined(_X360) || defined(_PS3)
	// Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
	// and simplify prefetching. Named variables are deliberately used instead of arrays to
	// ensure that the variables live on the registers instead of the stack (stack load/store
	// is a serious penalty on 360).  Nb: for prefetching to be most efficient here, the
	// loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is 
	// 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line. 
	// That way you can fetch the next 3 cache lines while you work on these three. 
	// If you do go this route, be sure to disassemble and make sure it doesn't spill 
	// registers to stack as you do this; the cost of that will be excessive. Unroll the loop
	// a little and just live with the fact that you'll be doing a couple of redundant dcbts
	// (they don't cost you anything). Be aware that all three cores share L2 and it can only
	// have eight cache lines fetching at a time.
	fltx4 outX0, outY0, outZ0; // bank one of outputs
	fltx4 outX1, outY1, outZ1; // bank two of outputs


	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
	// rather than madds. (Empirically determined by timing.)
	const FourVectors* stop = pVectors + numVectors;
	FourVectors* RESTRICT pVectNext;
	FourVectors* RESTRICT pOutNext;
	// prime the pump.
	if (numVectors & 0x01)
	{
		// odd number of vectors to process
		// prime the 1 group of registers
		pVectNext = pVectors++;
		pOutNext = pOut++;
		outX1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01)), MulSIMD(pVectNext->z, matSplat02));
		outY1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11)), MulSIMD(pVectNext->z, matSplat12));
		outZ1 = AddSIMD(AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21)), MulSIMD(pVectNext->z, matSplat22));
	}
	else
	{
		// even number of total vectors to process; 
		// prime the zero group and jump into the middle of the loop
		outX0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)), MulSIMD(pVectors->z, matSplat02));
		outY0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)), MulSIMD(pVectors->z, matSplat12));
		outZ0 = AddSIMD(AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)), MulSIMD(pVectors->z, matSplat22));
		goto EVEN_CASE;
	}

	// perform an even number of iterations through this loop.
	while (pVectors < stop)
	{
		outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)));
		outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)));
		outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)));

		pOutNext->x = outX1;
		pOutNext->y = outY1;
		pOutNext->z = outZ1;

	EVEN_CASE:
		pVectNext = pVectors + 1;
		pOutNext = pOut + 1;

		outX1 = MaddSIMD(pVectNext->z, matSplat02, AddSIMD(MulSIMD(pVectNext->x, matSplat00), MulSIMD(pVectNext->y, matSplat01)));
		outY1 = MaddSIMD(pVectNext->z, matSplat12, AddSIMD(MulSIMD(pVectNext->x, matSplat10), MulSIMD(pVectNext->y, matSplat11)));
		outZ1 = MaddSIMD(pVectNext->z, matSplat22, AddSIMD(MulSIMD(pVectNext->x, matSplat20), MulSIMD(pVectNext->y, matSplat21)));

		pOut->x = outX0;
		pOut->y = outY0;
		pOut->z = outZ0;

		pVectors += 2;
		pOut += 2;
	}

	// flush the last round of output
	pVectNext->x = outX1;
	pVectNext->y = outY1;
	pVectNext->z = outZ1;
#else
	// PC does not benefit from the unroll/scheduling above
	fltx4 outX0, outY0, outZ0; // bank one of outputs


	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
	// rather than madds. (Empirically determined by timing.)
	const FourVectors* stop = pVectors + numVectors;

	// perform an even number of iterations through this loop.
	while (pVectors < stop)
	{
		outX0 = MaddSIMD(pVectors->z, matSplat02, AddSIMD(MulSIMD(pVectors->x, matSplat00), MulSIMD(pVectors->y, matSplat01)));
		outY0 = MaddSIMD(pVectors->z, matSplat12, AddSIMD(MulSIMD(pVectors->x, matSplat10), MulSIMD(pVectors->y, matSplat11)));
		outZ0 = MaddSIMD(pVectors->z, matSplat22, AddSIMD(MulSIMD(pVectors->x, matSplat20), MulSIMD(pVectors->y, matSplat21)));

		pOut->x = outX0;
		pOut->y = outY0;
		pOut->z = outZ0;
		pVectors++;
		pOut++;
	}
#endif
}

#ifdef _X360
// Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
void FourVectors_TransformManyGroupsOfEightBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut)
{
	Assert(numVectors > 0);
	if (numVectors == 0)
		return;

	AssertMsg((pOut < pVectors && pOut + numVectors <= pVectors) ||
		(pOut > pVectors && pVectors + numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers.");

	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in 
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
		matSplat10, matSplat11, matSplat12, matSplat13,
		matSplat20, matSplat21, matSplat22, matSplat23;

	{
		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
		// often unaligned. The w components will be the transpose row of
		// the matrix.
		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

		matSplat00 = SplatXSIMD(matCol0);
		matSplat01 = SplatYSIMD(matCol0);
		matSplat02 = SplatZSIMD(matCol0);
		matSplat03 = SplatWSIMD(matCol0);

		matSplat10 = SplatXSIMD(matCol1);
		matSplat11 = SplatYSIMD(matCol1);
		matSplat12 = SplatZSIMD(matCol1);
		matSplat13 = SplatWSIMD(matCol1);

		matSplat20 = SplatXSIMD(matCol2);
		matSplat21 = SplatYSIMD(matCol2);
		matSplat22 = SplatZSIMD(matCol2);
		matSplat23 = SplatWSIMD(matCol2);
	}

	// this macro defines how to compute a specific row from an input and certain splat columns
#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
#define WRITE(term, reg, toptr) toptr->term = reg

	// define result groups (we're going to have an eight-way unroll)

	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;


	// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z

	/*
	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
	res0X = MulSIMD( (invec)->y, matSplat01 );
	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
	// stage 2 -- 3 clocks for xyz
	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
	// stage 3 -- 3 clocks for xyz
	res0X = AddSIMD(res0X, res0Temp);
	*/
#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar

#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
										COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
										COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
										COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
										COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
										COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
										COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

	FourVectors* RESTRICT inData = pVectors;
	FourVectors* RESTRICT outData = pOut;
	const FourVectors* const RESTRICT STOP = pVectors + numVectors;

	// Use techniques of loop scheduling to eliminate data hazards; process
	// eight groups simultaneously so that we never have any operations stalling
	// waiting for data.
	// Note: this loop, while pretty fast, could be faster still -- you'll notice
	// that it does all of its loads, then all computation, then writes everything
	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
	// stage 3, and write, then throughput could be higher (probably by about 50%). 
	while (inData < STOP)
	{
		// start prefetching the three cache lines
		// we'll hit two iterations from now
		__dcbt(sizeof(FourVectors) * 16, inData);
		__dcbt(sizeof(FourVectors) * 16 + 128, inData);
		__dcbt(sizeof(FourVectors) * 16 + 256, inData);

		// synchro
		COMPUTE_STAGE1_GROUP(res0, inData + 0);
		COMPUTE_STAGE1_GROUP(res1, inData + 1);
		COMPUTE_STAGE1_GROUP(res2, inData + 2);
		COMPUTE_STAGE1_GROUP(res3, inData + 3);

		COMPUTE_STAGE2_GROUP(res0, inData + 0);
		COMPUTE_STAGE1_GROUP(res4, inData + 4);
		COMPUTE_STAGE2_GROUP(res1, inData + 1);
		COMPUTE_STAGE1_GROUP(res5, inData + 5);
		COMPUTE_STAGE2_GROUP(res2, inData + 2);
		COMPUTE_STAGE1_GROUP(res6, inData + 6);
		COMPUTE_STAGE2_GROUP(res3, inData + 3);
		COMPUTE_STAGE1_GROUP(res7, inData + 7);

		COMPUTE_STAGE3_GROUP(res0, inData + 0);
		COMPUTE_STAGE2_GROUP(res4, inData + 4);
		COMPUTE_STAGE3_GROUP(res1, inData + 1);
		COMPUTE_STAGE2_GROUP(res5, inData + 5);
		COMPUTE_STAGE3_GROUP(res2, inData + 2);
		COMPUTE_STAGE2_GROUP(res6, inData + 6);
		COMPUTE_STAGE3_GROUP(res3, inData + 3);
		COMPUTE_STAGE2_GROUP(res7, inData + 7);

		COMPUTE_STAGE3_GROUP(res4, inData + 4);
		WRITE_GROUP(outData + 0, res0);
		COMPUTE_STAGE3_GROUP(res5, inData + 5);
		WRITE_GROUP(outData + 1, res1);
		COMPUTE_STAGE3_GROUP(res6, inData + 6);
		WRITE_GROUP(outData + 2, res2);
		COMPUTE_STAGE3_GROUP(res7, inData + 7);
		WRITE_GROUP(outData + 3, res3);


		WRITE_GROUP(outData + 4, res4);
		WRITE_GROUP(outData + 5, res5);
		WRITE_GROUP(outData + 6, res6);
		WRITE_GROUP(outData + 7, res7);

		inData += 8;
		outData += 8;
	}


#undef COMPUTE
#undef WRITE
#undef COMPUTE_STAGE1_ROW
#undef COMPUTE_STAGE2_ROW
#undef COMPUTE_STAGE3_ROW
#undef COMPUTE_STAGE1_GROUP
#undef COMPUTE_STAGE2_GROUP
#undef COMPUTE_STAGE3_GROUP
#undef COMPUTE_GROUP
#undef WRITE_GROUP
}

#ifdef _X360
// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. This is the version
// to call when starting on a 128-byte-aligned address.
void FourVectors_TransformManyGroupsOfEightBy_128byteAligned(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut)
{
	/* If this has changed, you will need to change all the prefetches, *
	 * and groups of eight are no longer the ideal unit for iterating   *
	 * on many vectors.													*/
	COMPILE_TIME_ASSERT(sizeof(FourVectors) == 48);

	Assert(numVectors > 0);
	if (numVectors == 0)
		return;

	AssertMsg((numVectors & 0x07) == 0, "FourVectors_TransformManyGroupsOfEightBy_128byteAligned called with numVectors % 8 != 0!");

	// Assert alignment
	AssertMsg(((reinterpret_cast<uint32>(pVectors) & 127) == 0) &&
		((reinterpret_cast<uint32>(pOut) & 127) == 0),
		"FourVectors_Transform..aligned called with non-128-byte-aligned buffers.");

	// Assert non overlap
	AssertMsg((pOut < pVectors && pOut + numVectors <= pVectors) ||
		(pOut > pVectors && pVectors + numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers.");

	// Here's the plan. 8 four-vecs = 3 cache lines exactly. It takes about 400 cycles to process a group
	// of eight, and cache latency is 600 cycles, so we try to prefetch two iterations ahead (eg fetch
	// iteration 3 while working on iteration 1). In the case of the output, we can simply zero-flush 
	// the cache lines since we are sure to write into them. Because we're reading and fetching two ahead,
	// we want to stop two away from the last iteration.

	// No matter what, we will need to prefetch the first two groups of eight of input (that's the 
	// first six cache lines)
	__dcbt(0, pVectors);
	__dcbt(128, pVectors);
	__dcbt(256, pVectors);
	__dcbt(384, pVectors);
	__dcbt(512, pVectors);
	__dcbt(640, pVectors);


	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in 
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
		matSplat10, matSplat11, matSplat12, matSplat13,
		matSplat20, matSplat21, matSplat22, matSplat23;

	{
		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
		// often unaligned. The w components will be the transpose row of
		// the matrix.
		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

		matSplat00 = SplatXSIMD(matCol0);
		matSplat01 = SplatYSIMD(matCol0);
		matSplat02 = SplatZSIMD(matCol0);
		matSplat03 = SplatWSIMD(matCol0);

		matSplat10 = SplatXSIMD(matCol1);
		matSplat11 = SplatYSIMD(matCol1);
		matSplat12 = SplatZSIMD(matCol1);
		matSplat13 = SplatWSIMD(matCol1);

		matSplat20 = SplatXSIMD(matCol2);
		matSplat21 = SplatYSIMD(matCol2);
		matSplat22 = SplatZSIMD(matCol2);
		matSplat23 = SplatWSIMD(matCol2);
	}

	// this macro defines how to compute a specific row from an input and certain splat columns
#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
#define WRITE(term, reg, toptr) toptr->term = reg

	// define result groups (we're going to have an eight-way unroll)

	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;


	// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z

	/*
	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
	res0X = MulSIMD( (invec)->y, matSplat01 );
	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
	// stage 2 -- 3 clocks for xyz
	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
	// stage 3 -- 3 clocks for xyz
	res0X = AddSIMD(res0X, res0Temp);
	*/
#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar

#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)


	// Okay. First do all but the last two turns of the crank; we don't want to overshoot with the flush-to-zero.
	FourVectors* RESTRICT inData = pVectors;
	FourVectors* RESTRICT outData = pOut;
	const FourVectors* RESTRICT STOP;
	if (numVectors > 16)
	{
		STOP = pVectors + numVectors - 16;
		// flush the first two blocks we'll write into 
		__dcbz128(0, outData);
		__dcbz128(128, outData);
		__dcbz128(256, outData);

		while (inData < STOP)
		{
			// start prefetching the three cache lines
			// we'll hit two iterations from now
			__dcbt(sizeof(FourVectors) * 16, inData);
			__dcbt(sizeof(FourVectors) * 16 + 128, inData);
			__dcbt(sizeof(FourVectors) * 16 + 256, inData);

			// synchro
			COMPUTE_STAGE1_GROUP(res0, inData + 0);
			COMPUTE_STAGE1_GROUP(res1, inData + 1);
			COMPUTE_STAGE1_GROUP(res2, inData + 2);
			COMPUTE_STAGE1_GROUP(res3, inData + 3);

			// pre-zero the three cache lines we'll overwrite
			// in the next iteration
			__dcbz128(384, outData);
			__dcbz128(512, outData);
			__dcbz128(640, outData);


			COMPUTE_STAGE2_GROUP(res0, inData + 0);
			COMPUTE_STAGE1_GROUP(res4, inData + 4);
			COMPUTE_STAGE2_GROUP(res1, inData + 1);
			COMPUTE_STAGE1_GROUP(res5, inData + 5);
			COMPUTE_STAGE2_GROUP(res2, inData + 2);
			COMPUTE_STAGE1_GROUP(res6, inData + 6);
			COMPUTE_STAGE2_GROUP(res3, inData + 3);
			COMPUTE_STAGE1_GROUP(res7, inData + 7);

			COMPUTE_STAGE3_GROUP(res0, inData + 0);
			COMPUTE_STAGE2_GROUP(res4, inData + 4);
			COMPUTE_STAGE3_GROUP(res1, inData + 1);
			COMPUTE_STAGE2_GROUP(res5, inData + 5);
			COMPUTE_STAGE3_GROUP(res2, inData + 2);
			COMPUTE_STAGE2_GROUP(res6, inData + 6);
			COMPUTE_STAGE3_GROUP(res3, inData + 3);
			COMPUTE_STAGE2_GROUP(res7, inData + 7);

			COMPUTE_STAGE3_GROUP(res4, inData + 4);
			WRITE_GROUP(outData + 0, res0);
			COMPUTE_STAGE3_GROUP(res5, inData + 5);
			WRITE_GROUP(outData + 1, res1);
			COMPUTE_STAGE3_GROUP(res6, inData + 6);
			WRITE_GROUP(outData + 2, res2);
			COMPUTE_STAGE3_GROUP(res7, inData + 7);
			WRITE_GROUP(outData + 3, res3);


			WRITE_GROUP(outData + 4, res4);
			WRITE_GROUP(outData + 5, res5);
			WRITE_GROUP(outData + 6, res6);
			WRITE_GROUP(outData + 7, res7);

			inData += 8;
			outData += 8;
		}
	}
	else if (numVectors == 16)
	{
		// zero out the exactly six cache lines we will write into
		__dcbz128(0, outData);
		__dcbz128(128, outData);
		__dcbz128(256, outData);
		__dcbz128(384, outData);
		__dcbz128(512, outData);
		__dcbz128(640, outData);
	}
	else if (numVectors == 8)
	{
		// zero out the exactly three cache lines we will write into
		__dcbz128(0, outData);
		__dcbz128(128, outData);
		__dcbz128(256, outData);
	}
	else
	{
		AssertMsg(false, "Can't happen!");
	}

	// deal with the ultimate two groups (or, if we were fed
	// less than 16 groups, the whole shebang)
	STOP = pVectors + numVectors;


	// Use techniques of loop scheduling to eliminate data hazards; process
	// eight groups simultaneously so that we never have any operations stalling
	// waiting for data.
	// Note: this loop, while pretty fast, could be faster still -- you'll notice
	// that it does all of its loads, then all computation, then writes everything
	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
	// stage 3, and write, then throughput could be higher (probably by about 50%). 
	while (inData < STOP)
	{
		// synchro
		COMPUTE_STAGE1_GROUP(res0, inData + 0);
		COMPUTE_STAGE1_GROUP(res1, inData + 1);
		COMPUTE_STAGE1_GROUP(res2, inData + 2);
		COMPUTE_STAGE1_GROUP(res3, inData + 3);

		COMPUTE_STAGE2_GROUP(res0, inData + 0);
		COMPUTE_STAGE1_GROUP(res4, inData + 4);
		COMPUTE_STAGE2_GROUP(res1, inData + 1);
		COMPUTE_STAGE1_GROUP(res5, inData + 5);
		COMPUTE_STAGE2_GROUP(res2, inData + 2);
		COMPUTE_STAGE1_GROUP(res6, inData + 6);
		COMPUTE_STAGE2_GROUP(res3, inData + 3);
		COMPUTE_STAGE1_GROUP(res7, inData + 7);

		COMPUTE_STAGE3_GROUP(res0, inData + 0);
		COMPUTE_STAGE2_GROUP(res4, inData + 4);
		COMPUTE_STAGE3_GROUP(res1, inData + 1);
		COMPUTE_STAGE2_GROUP(res5, inData + 5);
		COMPUTE_STAGE3_GROUP(res2, inData + 2);
		COMPUTE_STAGE2_GROUP(res6, inData + 6);
		COMPUTE_STAGE3_GROUP(res3, inData + 3);
		COMPUTE_STAGE2_GROUP(res7, inData + 7);

		COMPUTE_STAGE3_GROUP(res4, inData + 4);
		WRITE_GROUP(outData + 0, res0);
		COMPUTE_STAGE3_GROUP(res5, inData + 5);
		WRITE_GROUP(outData + 1, res1);
		COMPUTE_STAGE3_GROUP(res6, inData + 6);
		WRITE_GROUP(outData + 2, res2);
		COMPUTE_STAGE3_GROUP(res7, inData + 7);
		WRITE_GROUP(outData + 3, res3);


		WRITE_GROUP(outData + 4, res4);
		WRITE_GROUP(outData + 5, res5);
		WRITE_GROUP(outData + 6, res6);
		WRITE_GROUP(outData + 7, res7);

		inData += 8;
		outData += 8;
	}


#undef COMPUTE
#undef WRITE
#undef COMPUTE_STAGE1_ROW
#undef COMPUTE_STAGE2_ROW
#undef COMPUTE_STAGE3_ROW
#undef COMPUTE_STAGE1_GROUP
#undef COMPUTE_STAGE2_GROUP
#undef COMPUTE_STAGE3_GROUP
#undef COMPUTE_GROUP
#undef WRITE_GROUP
}
#endif

// Transform a long array of FourVectors by a given matrix. 
void FourVectors::TransformManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors* RESTRICT pOut)
{
	Assert(numVectors > 0);

	AssertMsg((pOut < pVectors && pOut + numVectors <= pVectors) ||
		(pOut > pVectors && pVectors + numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers.");

#ifdef _X360
	// The really fast version of this function likes to operate on blocks of eight. So, chug through
	// groups of eight, then deal with any leftovers.
	int numVectorsRoundedToNearestEight = numVectors & (~0x07);
	if (numVectors >= 8)
	{
		// aligned?
		if ((reinterpret_cast<unsigned int>(pVectors) & 127) == 0 && (reinterpret_cast<unsigned int>(pOut) & 127) == 0)
		{
			FourVectors_TransformManyGroupsOfEightBy_128byteAligned(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
		}
		else
		{
			FourVectors_TransformManyGroupsOfEightBy(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
		}
		numVectors -= numVectorsRoundedToNearestEight;
		pVectors += numVectorsRoundedToNearestEight;
		pOut += numVectorsRoundedToNearestEight;
	}
#endif

	// any left over?
	if (numVectors > 0)
	{

		// Splat out each of the entries in the matrix to a fltx4. Do this
		// in the order that we will need them, to hide latency. I'm
		// avoiding making an array of them, so that they'll remain in 
		// registers.
		fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
			matSplat10, matSplat11, matSplat12, matSplat13,
			matSplat20, matSplat21, matSplat22, matSplat23;

		{
			// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
			// often unaligned. The w components will be the transpose row of
			// the matrix.
			fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
			fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
			fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

			matSplat00 = SplatXSIMD(matCol0);
			matSplat01 = SplatYSIMD(matCol0);
			matSplat02 = SplatZSIMD(matCol0);
			matSplat03 = SplatWSIMD(matCol0);

			matSplat10 = SplatXSIMD(matCol1);
			matSplat11 = SplatYSIMD(matCol1);
			matSplat12 = SplatZSIMD(matCol1);
			matSplat13 = SplatWSIMD(matCol1);

			matSplat20 = SplatXSIMD(matCol2);
			matSplat21 = SplatYSIMD(matCol2);
			matSplat22 = SplatZSIMD(matCol2);
			matSplat23 = SplatWSIMD(matCol2);
		}

		do
		{
			// Trust in the compiler to schedule these operations correctly:
			pOut->x = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
			pOut->y = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
			pOut->z = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));

			++pOut;
			++pVectors;
			--numVectors;
		} while (numVectors > 0);
	}
}

#ifdef _X360
// Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
static void FourVectors_TransformManyGroupsOfEightBy_InPlace(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix)
{
	Assert(numVectors > 0);
	if (numVectors == 0)
		return;

	// Prefetch line 1 and 2
	__dcbt(0, pVectors);
	__dcbt(128, pVectors);

	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in 
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
		matSplat10, matSplat11, matSplat12, matSplat13,
		matSplat20, matSplat21, matSplat22, matSplat23;

	{
		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
		// often unaligned. The w components will be the transpose row of
		// the matrix.
		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

		matSplat00 = SplatXSIMD(matCol0);
		matSplat01 = SplatYSIMD(matCol0);
		matSplat02 = SplatZSIMD(matCol0);
		matSplat03 = SplatWSIMD(matCol0);

		matSplat10 = SplatXSIMD(matCol1);
		matSplat11 = SplatYSIMD(matCol1);
		matSplat12 = SplatZSIMD(matCol1);
		matSplat13 = SplatWSIMD(matCol1);

		matSplat20 = SplatXSIMD(matCol2);
		matSplat21 = SplatYSIMD(matCol2);
		matSplat22 = SplatZSIMD(matCol2);
		matSplat23 = SplatWSIMD(matCol2);
	}

	// this macro defines how to compute a specific row from an input and certain splat columns
#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
#define WRITE(term, reg, toptr) toptr->term = reg

	// define result groups (we're going to have an eight-way unroll)

	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;


	// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z

	/*
	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
	res0X = MulSIMD( (invec)->y, matSplat01 );
	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
	// stage 2 -- 3 clocks for xyz
	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
	// stage 3 -- 3 clocks for xyz
	res0X = AddSIMD(res0X, res0Temp);
	*/
#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar

#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

	const FourVectors* const RESTRICT STOP = pVectors + numVectors;

	// Use techniques of loop scheduling to eliminate data hazards; process
	// eight groups simultaneously so that we never have any operations stalling
	// waiting for data.
	// Note: this loop, while pretty fast, could be faster still -- you'll notice
	// that it does all of its loads, then all computation, then writes everything
	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
	// stage 3, and write, then throughput could be higher (probably by about 50%). 
	while (pVectors < STOP)
	{
		// start prefetching the three cache lines
		// we'll hit two iterations from now
		__dcbt(sizeof(FourVectors) * 16, pVectors);
		__dcbt(sizeof(FourVectors) * 16 + 128, pVectors);
		__dcbt(sizeof(FourVectors) * 16 + 256, pVectors);

		// synchro
		COMPUTE_STAGE1_GROUP(res0, pVectors + 0);
		COMPUTE_STAGE1_GROUP(res1, pVectors + 1);
		COMPUTE_STAGE1_GROUP(res2, pVectors + 2);
		COMPUTE_STAGE1_GROUP(res3, pVectors + 3);

		COMPUTE_STAGE2_GROUP(res0, pVectors + 0);
		COMPUTE_STAGE1_GROUP(res4, pVectors + 4);
		COMPUTE_STAGE2_GROUP(res1, pVectors + 1);
		COMPUTE_STAGE1_GROUP(res5, pVectors + 5);
		COMPUTE_STAGE2_GROUP(res2, pVectors + 2);
		COMPUTE_STAGE1_GROUP(res6, pVectors + 6);
		COMPUTE_STAGE2_GROUP(res3, pVectors + 3);
		COMPUTE_STAGE1_GROUP(res7, pVectors + 7);

		COMPUTE_STAGE3_GROUP(res0, pVectors + 0);
		COMPUTE_STAGE2_GROUP(res4, pVectors + 4);
		COMPUTE_STAGE3_GROUP(res1, pVectors + 1);
		COMPUTE_STAGE2_GROUP(res5, pVectors + 5);
		COMPUTE_STAGE3_GROUP(res2, pVectors + 2);
		COMPUTE_STAGE2_GROUP(res6, pVectors + 6);
		COMPUTE_STAGE3_GROUP(res3, pVectors + 3);
		COMPUTE_STAGE2_GROUP(res7, pVectors + 7);

		COMPUTE_STAGE3_GROUP(res4, pVectors + 4);
		WRITE_GROUP(pVectors + 0, res0);
		COMPUTE_STAGE3_GROUP(res5, pVectors + 5);
		WRITE_GROUP(pVectors + 1, res1);
		COMPUTE_STAGE3_GROUP(res6, pVectors + 6);
		WRITE_GROUP(pVectors + 2, res2);
		COMPUTE_STAGE3_GROUP(res7, pVectors + 7);
		WRITE_GROUP(pVectors + 3, res3);

		WRITE_GROUP(pVectors + 4, res4);
		WRITE_GROUP(pVectors + 5, res5);
		WRITE_GROUP(pVectors + 6, res6);
		WRITE_GROUP(pVectors + 7, res7);

		pVectors += 8;
	}


#undef COMPUTE
#undef WRITE
#undef COMPUTE_STAGE1_ROW
#undef COMPUTE_STAGE2_ROW
#undef COMPUTE_STAGE3_ROW
#undef COMPUTE_STAGE1_GROUP
#undef COMPUTE_STAGE2_GROUP
#undef COMPUTE_STAGE3_GROUP
#undef COMPUTE_GROUP
#undef WRITE_GROUP
}
#endif

// In-place version of above. It's necessary to have this, rather than just allowing pOut and pVectors
// to equal each other, because of the semantics of RESTRICT: pVectors and pOut must not be allowed 
// to alias. (Simply un-restricting the pointers results in very poor scheduling.)
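// (Illustrative usage, with hypothetical caller-side names:
//     FourVectors::TransformManyBy( verts, nVerts, xform );            // transform in place
//     FourVectors::TransformManyBy( verts, nVerts, xform, vertsOut );  // verts and vertsOut must not overlap
//  Passing the same buffer for both pointer arguments of the two-buffer overload would violate RESTRICT.)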
void FourVectors::TransformManyBy(FourVectors* RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix)
{
	Assert(numVectors > 0);

#ifdef _X360
	// The really fast version of this function likes to operate on blocks of eight. So, chug through
	// groups of eight, then deal with any leftovers.
	int numVectorsRoundedToNearestEight = numVectors & (~0x07);
	if (numVectors >= 8)
	{
		FourVectors_TransformManyGroupsOfEightBy_InPlace(pVectors, numVectorsRoundedToNearestEight, rotationMatrix);
		numVectors -= numVectorsRoundedToNearestEight;
		pVectors += numVectorsRoundedToNearestEight;
	}
#endif

	// any left over?
	if (numVectors > 0)
	{

		// Splat out each of the entries in the matrix to a fltx4. Do this
		// in the order that we will need them, to hide latency. I'm
		// avoiding making an array of them, so that they'll remain in 
		// registers.
		fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
			matSplat10, matSplat11, matSplat12, matSplat13,
			matSplat20, matSplat21, matSplat22, matSplat23;

		{
			// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
			// often unaligned. The w components will be the transpose row of
			// the matrix.
			fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
			fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
			fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

			matSplat00 = SplatXSIMD(matCol0);
			matSplat01 = SplatYSIMD(matCol0);
			matSplat02 = SplatZSIMD(matCol0);
			matSplat03 = SplatWSIMD(matCol0);

			matSplat10 = SplatXSIMD(matCol1);
			matSplat11 = SplatYSIMD(matCol1);
			matSplat12 = SplatZSIMD(matCol1);
			matSplat13 = SplatWSIMD(matCol1);

			matSplat20 = SplatXSIMD(matCol2);
			matSplat21 = SplatYSIMD(matCol2);
			matSplat22 = SplatZSIMD(matCol2);
			matSplat23 = SplatWSIMD(matCol2);
		}

		do
		{
			fltx4 resultX, resultY, resultZ;
			// Trust in the compiler to schedule these operations correctly:
			resultX = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
			resultY = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
			resultZ = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));

			pVectors->x = resultX;
			pVectors->y = resultY;
			pVectors->z = resultZ;

			++pVectors;
			--numVectors;
		} while (numVectors > 0);
	}
}


#endif // _X360

// Transform many (horizontal) points in-place by a 3x4 matrix,
// here already loaded onto three fltx4 registers but not transposed. 
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1. 
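// (Sketch of the per-point math, assuming w = 1 in each input: __vmsum4fp forms the four-way
//  dot product of (x, y, z, 1) with a matrix row, so out.x = m00*x + m01*y + m02*z + m03, and
//  likewise for y and z -- the w = 1 assumption is what folds the translation term in.)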
#ifdef _X360
void TransformManyPointsBy(VectorAligned* RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow0, FLTX4 mRow1, FLTX4 mRow2)
{
	/**************************************************
	 *  Here is an elaborate and carefully scheduled  *
	 *  algorithm nicked from xboxmath.inl and hacked *
	 *  up for 3x4 matrices.                          *
	 **************************************************/

	COMPILE_TIME_ASSERT(sizeof(VectorAligned) == sizeof(XMFLOAT4)); // VectorAligned's need to be 16 bytes

	XMVECTOR R0[8], R1[8], R2[8];
	XMVECTOR vIn[8];

	//    C_ASSERT(UnrollCount == 8);
	//    C_ASSERT(sizeof(XMFLOAT4) == 16);
	Assert(pVectors);
	Assert(((UINT_PTR)pVectors & 15) == 0); // assert 16-byte alignment (required by __lvx)

	UINT GroupIndex;

	VectorAligned* RESTRICT vCurrent = pVectors;
	// sentinel pointers
	VectorAligned* vStreamEnd, * vStreamGroupBase, * vStreamGroupEnd;

	{
		// cook up the pointers from integer math. Necessary because otherwise we LHS all over
		// the place. (Odd that this doesn't happen to the xbox math.) 

		UINT_PTR InputVector = (UINT_PTR)pVectors;
		UINT_PTR InputStreamEnd = InputVector + numVectors * sizeof(XMFLOAT4);
		// compute start and end points on 128-byte alignment
		UINT_PTR InputStreamCGroupBase = XMMin(InputVector + (XM_CACHE_LINE_SIZE - 1), InputStreamEnd) & ~(XM_CACHE_LINE_SIZE - 1);
		UINT_PTR InputStreamCGroupEnd = InputStreamCGroupBase + ((InputStreamEnd - InputStreamCGroupBase) & ~(4 * XM_CACHE_LINE_SIZE - 1));

		vStreamEnd = (VectorAligned*)InputStreamEnd;
		vStreamGroupBase = (VectorAligned*)InputStreamCGroupBase;
		vStreamGroupEnd = (VectorAligned*)InputStreamCGroupEnd;
	}


	__dcbt(0, vStreamGroupBase);
	__dcbt(XM_CACHE_LINE_SIZE, vStreamGroupBase);
	__dcbt(XM_CACHE_LINE_SIZE * 2, vStreamGroupBase);
	__dcbt(XM_CACHE_LINE_SIZE * 3, vStreamGroupBase);

	while (vCurrent < vStreamGroupBase)
	{
		fltx4 vec = __lvx(vCurrent->Base(), 0);

		R0[0] = __vmsum4fp(vec, mRow0);
		R1[0] = __vmsum4fp(vec, mRow1);
		R2[0] = __vmsum4fp(vec, mRow2);

		__stvewx(R0[0], vCurrent->Base(), 0);
		__stvewx(R1[0], vCurrent->Base(), 4);
		__stvewx(R2[0], vCurrent->Base(), 8);

		vCurrent++;
	}

	while (vCurrent < vStreamGroupEnd)
	{
		__dcbt(XM_CACHE_LINE_SIZE * 4, vCurrent);
		__dcbt(XM_CACHE_LINE_SIZE * 5, vCurrent);
		__dcbt(XM_CACHE_LINE_SIZE * 6, vCurrent);
		__dcbt(XM_CACHE_LINE_SIZE * 7, vCurrent);

		for (GroupIndex = 0; GroupIndex < 4; GroupIndex++)
		{
			// all kinds of LHS on this pointer. Why?
			VectorAligned* OutputVector = vCurrent;

			vIn[0] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[1] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[2] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[3] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[4] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[5] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[6] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[7] = __lvx(vCurrent->Base(), 0);
			vCurrent++;

			R0[0] = __vmsum4fp(vIn[0], mRow0);
			R1[0] = __vmsum4fp(vIn[0], mRow1);
			R2[0] = __vmsum4fp(vIn[0], mRow2);

			R0[1] = __vmsum4fp(vIn[1], mRow0);
			R1[1] = __vmsum4fp(vIn[1], mRow1);
			R2[1] = __vmsum4fp(vIn[1], mRow2);

			R0[2] = __vmsum4fp(vIn[2], mRow0);
			R1[2] = __vmsum4fp(vIn[2], mRow1);
			R2[2] = __vmsum4fp(vIn[2], mRow2);

			R0[3] = __vmsum4fp(vIn[3], mRow0);
			R1[3] = __vmsum4fp(vIn[3], mRow1);
			R2[3] = __vmsum4fp(vIn[3], mRow2);

			R0[4] = __vmsum4fp(vIn[4], mRow0);
			R1[4] = __vmsum4fp(vIn[4], mRow1);
			R2[4] = __vmsum4fp(vIn[4], mRow2);

			R0[5] = __vmsum4fp(vIn[5], mRow0);
			R1[5] = __vmsum4fp(vIn[5], mRow1);
			R2[5] = __vmsum4fp(vIn[5], mRow2);

			R0[6] = __vmsum4fp(vIn[6], mRow0);
			R1[6] = __vmsum4fp(vIn[6], mRow1);
			R2[6] = __vmsum4fp(vIn[6], mRow2);

			R0[7] = __vmsum4fp(vIn[7], mRow0);
			R1[7] = __vmsum4fp(vIn[7], mRow1);
			R2[7] = __vmsum4fp(vIn[7], mRow2);

			__stvewx(R0[0], OutputVector, 0);
			__stvewx(R1[0], OutputVector, 4);
			__stvewx(R2[0], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[1], OutputVector, 0);
			__stvewx(R1[1], OutputVector, 4);
			__stvewx(R2[1], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[2], OutputVector, 0);
			__stvewx(R1[2], OutputVector, 4);
			__stvewx(R2[2], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[3], OutputVector, 0);
			__stvewx(R1[3], OutputVector, 4);
			__stvewx(R2[3], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[4], OutputVector, 0);
			__stvewx(R1[4], OutputVector, 4);
			__stvewx(R2[4], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[5], OutputVector, 0);
			__stvewx(R1[5], OutputVector, 4);
			__stvewx(R2[5], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[6], OutputVector, 0);
			__stvewx(R1[6], OutputVector, 4);
			__stvewx(R2[6], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[7], OutputVector, 0);
			__stvewx(R1[7], OutputVector, 4);
			__stvewx(R2[7], OutputVector, 8);
			OutputVector++;
		}
	}

	while (vCurrent < vStreamEnd)
	{
		vIn[0] = __lvx(vCurrent->Base(), 0);

		R0[0] = __vmsum4fp(vIn[0], mRow0);
		R1[0] = __vmsum4fp(vIn[0], mRow1);
		R2[0] = __vmsum4fp(vIn[0], mRow2);

		__stvewx(R0[0], vCurrent->Base(), 0);
		__stvewx(R1[0], vCurrent->Base(), 4);
		__stvewx(R2[0], vCurrent->Base(), 8);

		vCurrent++;
	}


}

#endif // _X360



#endif // !defined(__SPU__)