//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose: 
//
//=====================================================================================//

#ifndef _MATH_PFNS_H_
#define _MATH_PFNS_H_

#include <limits>

// YUP_ACTIVE is from Source2. It's (obviously) not supported on this branch, just including it here to help merge camera.cpp/.h and the CSM shadow code.
//#define YUP_ACTIVE 1

// Named indices for matrix axes. The raw X/Y/Z indices are fixed; which of
// them counts as forward/left/up depends on whether YUP_ACTIVE is defined.
enum MatrixAxisType_t
{
	// Raw component indices.
	X_AXIS = 0,
	Y_AXIS = 1,
	Z_AXIS = 2,

	// Index 3 carries two names.
	ORIGIN = 3,
	PROJECTIVE = 3,

	// Semantic axes, spelled in terms of the raw indices above.
#ifdef YUP_ACTIVE
	LEFT_AXIS = X_AXIS,
	UP_AXIS = Y_AXIS,
	FORWARD_AXIS = Z_AXIS,
#else
	FORWARD_AXIS = X_AXIS,
	LEFT_AXIS = Y_AXIS,
	UP_AXIS = Z_AXIS,
#endif
};

#if defined( _X360 )
#include <xboxmath.h>
#elif defined(_PS3)

#ifdef SPU
#include <vectormath/c/vectormath_aos.h>
#include <spu_intrinsics.h>
#else
#include <ppu_asm_intrinsics.h>
#endif

// Note that similar defines exist in ssemath.h
// Maybe we should consolidate in one place for all platforms.

// Integer constants: the same value repeated in all four lanes.
#define _VEC_0x7ff		(vec_int4){0x7ff,0x7ff,0x7ff,0x7ff}
#define _VEC_0x3ff		(vec_int4){0x3ff,0x3ff,0x3ff,0x3ff}
#define _VEC_22L		(vector unsigned int){22,22,22,22}
#define _VEC_11L		(vector unsigned int){11,11,11,11}
#define _VEC_0L			(vector unsigned int){0,0,0,0}
// Float constants. The "xyz...w" variants put a different value in the last lane.
#define _VEC_255F		(vector float){255.0f,255.0f,255.0f,255.0f}
#define _VEC_NEGONEF	(vector float){-1.0f,-1.0f,-1.0f,-1.0f}
#define _VEC_ONEF		(vector float){1.0f,1.0f,1.0f,1.0f}
#define _VEC_ZEROF		(vector float){0.0f,0.0f,0.0f,0.0f}
#define _VEC_ZEROxyzONEwF (vector float){0.0f,0.0f,0.0f,1.0f}
#define _VEC_HALFF		(vector float){0.5f,0.5f,0.5f,0.5f}
#define _VEC_HALFxyzZEROwF	(vector float){0.5f,0.5f,0.5f,0.0f}
// Byte-index table: 0x00-0x0b followed by 0x1c-0x1f.
// NOTE(review): presumably a permute control that keeps x,y,z of the first operand
// and takes w from the second -- confirm against the call sites.
#define _VEC_PERMUTE_XYZ0W1   (vector unsigned char){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x1c,0x1d,0x1e,0x1f}

// (float)(1 << 23), i.e. 8388608.0f, in every lane.
#define _VEC_IEEEHACK (vector float){(float)(1 << 23),(float)(1 << 23),(float)(1 << 23),(float)(1 << 23)}
// Twelve zero indices followed by byte indices 0x03, 0x07, 0x0b, 0x0f.
#define _VEC_PERMUTE_FASTFTOC (vector unsigned char){0,0,0,0,0,0,0,0,0,0,0,0,0x03,0x07,0x0b,0x0f}

// AngleQuaternion
// Byte-index tables; every group of four consecutive indices names one 32-bit lane.
// NOTE(review): indices 0x00-0x0f vs 0x10-0x1f presumably select the first vs second
// permute operand -- TODO confirm against the AngleQuaternion implementation.
#define _VEC_PERMUTE_AQsxsxcxcx (vector unsigned char) {0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x10,0x11,0x12,0x13}	
#define _VEC_PERMUTE_AQczszszcz (vector unsigned char) {0x18,0x19,0x1a,0x1b,0x08,0x09,0x0a,0x0b,0x08,0x09,0x0a,0x0b,0x18,0x19,0x1a,0x1b}	
#define _VEC_PERMUTE_AQcxcxsxsx (vector unsigned char) {0x10,0x11,0x12,0x13,0x10,0x11,0x12,0x13,0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03}	
#define _VEC_PERMUTE_AQszczczsz (vector unsigned char) {0x08,0x09,0x0a,0x0b,0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b,0x08,0x09,0x0a,0x0b}	
#define _VEC_PERMUTE_ANGLEQUAT  (vector unsigned char) {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x18,0x19,0x1a,0x1b,0x0c,0x0d,0x0e,0x0f}	

// FLT_EPSILON in every lane.
// NOTE(review): FLT_EPSILON comes from <float.h>, which this header does not include itself.
#define _VEC_EPSILONF		(__vector float)			{FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON}

#endif

#if !(defined( PLATFORM_PPC ) || defined(SPU))
// If we are neither PPC-based nor SPU-based, assume SSE2 is available. We should make this code cleaner.

#include <xmmintrin.h>



// These globals are initialized by mathlib and redirected based on available fpu features

// The following are not declared as macros because they are often used in limiting situations,
// and sometimes the compiler simply refuses to inline them for some reason
// Returns the square root of x, computed with the scalar SSE square-root
// intrinsic (_mm_sqrt_ss) on a register loaded from x.
FORCEINLINE float VECTORCALL FastSqrt(float x)
{
	const __m128 root = _mm_sqrt_ss(_mm_load_ss(&x));

	// _mm_cvtss_f32 reads the low lane directly; this replaces casting the
	// address of the __m128 to float* and dereferencing it.
	return _mm_cvtss_f32(root);
}

// Returns the raw reciprocal-square-root estimate produced by the scalar SSE
// intrinsic _mm_rsqrt_ss; no refinement step is applied here.
FORCEINLINE float VECTORCALL FastRSqrtFast(float x)
{
	const __m128 estimate = _mm_rsqrt_ss(_mm_load_ss(&x));

	// _mm_cvtss_f32 reads the low lane directly; this replaces casting the
	// address of the __m128 to float* and dereferencing it.
	return _mm_cvtss_f32(estimate);
}
// Reciprocal square root: takes the FastRSqrtFast estimate r and applies a
// single Newton-Raphson refinement step,
//     0.5 * r * (3 - x * r * r)
// The original note rates the result as very low error and fine to use in
// place of 1.f / sqrtf(x).
FORCEINLINE float VECTORCALL FastRSqrt(float x)
{
	const float flEstimate = FastRSqrtFast(x);
	const float flHalfEstimate = 0.5f * flEstimate;
	const float flCorrection = 3.f - (x * flEstimate) * flEstimate;
	return flHalfEstimate * flCorrection;
}

// Prototypes only; no definition for either function appears in this header.
// NOTE(review): presumably FastSinCos stores sin(x) in *s and cos(x) in *c --
// confirm against the definition.
void FastSinCos(float x, float* s, float* c);  // any x
float FastCos(float x);



// Reciprocal of x via an ordinary floating-point divide (no estimate
// instruction is involved on this code path).
inline float FastRecip(float x)
{
	const float flReciprocal = 1.0f / x;
	return flReciprocal;
}
// Square-root estimate formed as x * FastRSqrtFast(x); there is no refinement
// step. The original note calls this accurate to around 6 (relative) decimal
// places and OK for closed transforms (e.g. computing lighting normals).
// NOTE(review): FastRSqrtFast above is a bare _mm_rsqrt_ss estimate, which Intel
// documents at roughly 12 bits of precision -- confirm the 6-digit figure.
// NOTE(review): for x == 0 the result is 0 * rsqrt(0); confirm callers never pass zero.
inline float FastSqrtEst(float x)
{
	return x * FastRSqrtFast(x);
}


#else // !defined( PLATFORM_PPC ) && !defined(_SPU)

#ifndef SPU
// We may not need this for SPU, so let's not bother for now

// Square root of x obtained from the __fsqrts intrinsic.
FORCEINLINE float _VMX_Sqrt(float x)
{
	const float flRoot = __fsqrts(x);
	return flRoot;
}

// Reciprocal square root: takes the __frsqrte estimate r and refines it with a
// single Newton-Raphson step, 0.5 * r * (3 - x * r * r), evaluated in double.
FORCEINLINE double _VMX_RSqrt(double x)
{
	const double estimate = __frsqrte(x);
	const double halfEstimate = 0.5f * estimate;
	const double correction = 3.0f - (x * estimate) * estimate;

	return halfEstimate * correction;
}

// Unrefined reciprocal-square-root estimate: returns __frsqrte(x) as is.
FORCEINLINE double _VMX_RSqrtFast(double x)
{
	const double estimate = __frsqrte(x);
	return estimate;
}

#ifdef _X360
// Xbox 360 only: forwards to XMScalarSinCos. Note the argument order of that
// call -- the two output pointers come first, the angle last.
FORCEINLINE void _VMX_SinCos(float a, float* pS, float* pC)
{
	XMScalarSinCos(pS, pC, a);
}

// Xbox 360 only: returns XMScalarCos(a).
FORCEINLINE float _VMX_Cos(float a)
{
	return XMScalarCos(a);
}
#endif

// the 360 has fixed hw and calls directly
// In this branch the Fast* names are function-like macros that expand straight
// to the _VMX_* helpers. _VMX_SinCos / _VMX_Cos are defined above only under
// _X360; the _PS3 section further down in this header provides its own versions.
#define FastSqrt(x)			_VMX_Sqrt(x)
#define	FastRSqrt(x)		_VMX_RSqrt(x)
#define FastRSqrtFast(x)	_VMX_RSqrtFast(x)
#define FastSinCos(x,s,c)	_VMX_SinCos(x,s,c)
#define FastCos(x)			_VMX_Cos(x)

// Reciprocal of x as returned by the __fres intrinsic.
inline double FastRecip(double x)
{
	const double reciprocal = __fres(x);
	return reciprocal;
}
// Square-root estimate formed as x * __frsqrte(x); no refinement step is applied.
inline double FastSqrtEst(double x)
{
	return x * __frsqrte(x);
}

#endif // #ifndef SPU


// If x is +infinity, return the largest finite float (the value of FLT_MAX);
// otherwise return x.
// On the non-PPC path only values greater than the largest finite float are
// replaced, so finite inputs and -infinity come back unchanged.
// The limit is taken from std::numeric_limits (this header includes <limits>)
// rather than the FLT_MAX macro, which needs <float.h> -- a header this file
// does not include.
// NOTE(review): as the preprocessor nesting currently stands, this definition
// sits inside the PLATFORM_PPC/SPU #else branch, so SSE builds do not see it --
// confirm that is intended.
inline float FastClampInfinity(float x)
{
	// Parenthesised so a function-like max() macro cannot capture the call.
	const float flLargestFinite = (std::numeric_limits<float>::max)();

#ifdef PLATFORM_PPC
	// Selects between x and the limit based on (infinity - x), without a branch
	// in this function.
	return fsel(std::numeric_limits<float>::infinity() - x, x, flLargestFinite);
#else
	return (x > flLargestFinite ? flLargestFinite : x);
#endif
}

#if defined (_PS3) 

#if defined(__SPU__)

inline int _rotl(int a, int count)
{
	vector signed int vi;
	vi = spu_promote(a, 0);
	vi = spu_rl(vi, count);
	return spu_extract(vi, 0);
}

#else

// extern float cosvf(float);      /* single precision cosine      */
// extern float sinvf(float);      /* single precision sine        */
// TODO: need a faster single precision equivalent
// Until then cosvf / sinvf are plain textual aliases for cosf / sinf.
#define cosvf cosf
#define sinvf sinf

// PPU build of _rotl: returns __rlwimi(x, x, c, 0, 31).
// NOTE(review): with the full 0..31 mask this presumably amounts to a 32-bit
// rotate-left of x by c -- confirm against the intrinsic's documentation.
inline int _rotl(int x, int c)
{
	const int rotated = __rlwimi(x, x, c, 0, 31);
	return rotated;
}

// PPU build of _rotl64: returns __rldicl(x, c, 0).
// NOTE(review): with a zero mask-begin this presumably amounts to a 64-bit
// rotate-left of x by c -- confirm against the intrinsic's documentation.
inline int64 _rotl64(int64 x, int c)
{
	const int64 rotated = __rldicl(x, c, 0);
	return rotated;
}


/*
FORCEINLINE float _VMX_Sqrt( float x )
{
	vector_float_union vIn, vOut;

	vIn.f[0] = x;

	vOut.vf = sqrtf4(vIn.vf);

	return vOut.f[0];
}

FORCEINLINE float _VMX_RSqrt( float x )
{
	vector_float_union vIn, vOut;

	vIn.f[0] = x;

	vOut.vf = rsqrtf4(vIn.vf);

	return vOut.f[0];
}

FORCEINLINE float _VMX_RSqrtFast( float x )
{
	vector_float_union vIn, vOut;

	vIn.f[0] = x;

	vOut.vf = rsqrtf4fast(vIn.vf);

	return vOut.f[0];
}
*/

// PS3 (non-SPU) build: evaluates sinvf and cosvf for the same argument, then
// stores the sine through pS followed by the cosine through pC.
FORCEINLINE void _VMX_SinCos(float a, float* pS, float* pC)
{
	const float flSine = sinvf(a);
	const float flCosine = cosvf(a);

	*pS = flSine;
	*pC = flCosine;
}

// PS3 (non-SPU) build: returns cosvf(a).
FORCEINLINE float _VMX_Cos(float a)
{
	const float flCosine = cosvf(a);
	return flCosine;
}


// the 360 has fixed hw and calls directly
/*
#define FastSqrt(x)			_VMX_Sqrt(x)
#define	FastRSqrt(x)		_VMX_RSqrt(x)
#define FastRSqrtFast(x)	_VMX_RSqrtFast(x)
#define FastSinCos(x,s,c)	_VMX_SinCos(x,s,c)
#define FastCos(x)			_VMX_Cos(x)
*/

#endif


#if defined(__SPU__)

// do we need these optimized yet?

// SPU build of FastSqrt: defers to the C library sqrtf.
FORCEINLINE float FastSqrt(float x)
{
	const float flRoot = sqrtf(x);
	return flRoot;
}

// SPU build of FastRSqrt: 1 / (sqrtf(x) + FLT_EPSILON). FLT_EPSILON is added
// to the root before the divide, so the denominator is never exactly zero for
// x == 0.
FORCEINLINE float FastRSqrt(float x)
{
	const float flDenominator = sqrtf(x) + FLT_EPSILON;
	return 1.f / flDenominator;
}


// No separate fast variant in this branch: FastRSqrtFast expands to FastRSqrt.
#define FastRSqrtFast(x)	FastRSqrt(x)


#endif



//-----------------------------------------------------------------
// Vector Unions
//-----------------------------------------------------------------

//-----------------------------------------------------------------
// Floats
//-----------------------------------------------------------------
// Overlays one `vector float` with a four-element float array, so the same
// storage can be addressed as a whole vector (vf) or element by element (f[i]).
typedef union
{
	vector float vf;
	float f[4];
} vector_float_union;

#if !defined(__SPU__)
// The unions below are only declared when __SPU__ is not defined. Each one
// overlays a vector type with a plain array of the matching element type.
//-----------------------------------------------------------------
// Ints
//-----------------------------------------------------------------
// `vector int` viewed as four ints.
typedef union
{
	vector int vi;
	int i[4];
} vector_int4_union;

// `vector unsigned int` viewed as four unsigned ints.
typedef union
{
	vector unsigned int vui;
	unsigned int ui[4];
} vector_uint4_union;

//-----------------------------------------------------------------
// Shorts
//-----------------------------------------------------------------
// `vector signed short` viewed as eight signed shorts.
typedef union
{
	vector signed short vs;
	signed short s[8];
} vector_short8_union;

// `vector unsigned short` viewed as eight unsigned shorts.
typedef union
{
	vector unsigned short vus;
	unsigned short us[8];
} vector_ushort8_union;

//-----------------------------------------------------------------
// Chars
//-----------------------------------------------------------------
// `vector signed char` viewed as sixteen signed chars.
typedef union
{
	vector signed char vc;
	signed char c[16];
} vector_char16_union;

// `vector unsigned char` viewed as sixteen unsigned chars.
typedef union
{
	vector unsigned char vuc;
	unsigned char uc[16];
} vector_uchar16_union;
#endif



#endif	// _PS3
#endif	// #if !(defined( PLATFORM_PPC ) || defined(SPU))

#endif // _MATH_PFNS_H_